Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1184/ --- |
In reply to this post by qiuchenjian-2
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r227620200 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/AbstractNonDictionaryVectorFiller.java --- @@ -48,7 +48,11 @@ public int getLengthFromBuffer(ByteBuffer buffer) { public static AbstractNonDictionaryVectorFiller getVectorFiller(DataType type, int lengthSize, int numberOfRows) { if (type == DataTypes.STRING) { - return new StringVectorFiller(lengthSize, numberOfRows); + if (lengthSize > 2) { --- End diff -- 2 is a magic number; can you change it to a constant or add a function to make this more readable? --- |
In reply to this post by qiuchenjian-2
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r227620364 --- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/BlockletScannedResult.java --- @@ -145,6 +147,8 @@ protected QueryStatisticsModel queryStatisticsModel; + protected LazyBlockletLoad lazyBlockletLoad; --- End diff -- Actually I am confused by the name xxxLoad; why is it called load? Is there a common name in Presto for this technique? --- |
In reply to this post by qiuchenjian-2
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r227620665 --- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/vector/impl/directread/AbstractCarbonColumnarVector.java --- @@ -130,4 +131,9 @@ public CarbonColumnVector getDictionaryVector() { public void convert() { // Do nothing } + + @Override + public void setLazyPage(LazyPageLoad lazyPage) { + throw new UnsupportedOperationException("Not allowed from here"); --- End diff -- Put the class name in the message; it makes debugging easier. --- |
In reply to this post by qiuchenjian-2
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r227620816 --- Diff: core/src/main/java/org/apache/carbondata/core/scan/scanner/LazyBlockletLoad.java --- @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.scan.scanner; + +import java.io.IOException; + +import org.apache.carbondata.core.datastore.FileReader; +import org.apache.carbondata.core.datastore.chunk.AbstractRawColumnChunk; +import org.apache.carbondata.core.datastore.chunk.impl.DimensionRawColumnChunk; +import org.apache.carbondata.core.datastore.chunk.impl.MeasureRawColumnChunk; +import org.apache.carbondata.core.scan.executor.infos.BlockExecutionInfo; +import org.apache.carbondata.core.scan.processor.RawBlockletColumnChunks; +import org.apache.carbondata.core.stats.QueryStatistic; +import org.apache.carbondata.core.stats.QueryStatisticsConstants; +import org.apache.carbondata.core.stats.QueryStatisticsModel; + +/** + * Reads the blocklet column chunks lazily, it means it reads the column chunks from disk when + * execution engine wants to access it. + * It is useful in case of filter queries with high cardinality columns. + */ +public class LazyBlockletLoad { + + private RawBlockletColumnChunks rawBlockletColumnChunks; + + private BlockExecutionInfo blockExecutionInfo; + + private LazyChunkWrapper[] dimLazyWrapperChunks; + + private LazyChunkWrapper[] msrLazyWrapperChunks; --- End diff -- can we unify the processing of `dimLazyWrapperChunks` and `msrLazyWrapperChunks` so that we can use one flow for them? --- |
In reply to this post by qiuchenjian-2
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r227621527 --- Diff: integration/spark-datasource/src/main/spark2.1andspark2.2/org/apache/spark/sql/CarbonVectorProxy.java --- @@ -150,127 +140,189 @@ public void reset() { columnarBatch.reset(); } - public void putRowToColumnBatch(int rowId, Object value, int offset) { - org.apache.spark.sql.types.DataType t = dataType(offset); - if (null == value) { - putNull(rowId, offset); - } else { - if (t == org.apache.spark.sql.types.DataTypes.BooleanType) { - putBoolean(rowId, (boolean) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.ByteType) { - putByte(rowId, (byte) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.ShortType) { - putShort(rowId, (short) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.IntegerType) { - putInt(rowId, (int) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.LongType) { - putLong(rowId, (long) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.FloatType) { - putFloat(rowId, (float) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.DoubleType) { - putDouble(rowId, (double) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.StringType) { - UTF8String v = (UTF8String) value; - putByteArray(rowId, v.getBytes(), offset); - } else if (t instanceof org.apache.spark.sql.types.DecimalType) { - DecimalType dt = (DecimalType) t; - Decimal d = Decimal.fromDecimal(value); - if (dt.precision() <= Decimal.MAX_INT_DIGITS()) { - putInt(rowId, (int) d.toUnscaledLong(), offset); - } else if (dt.precision() <= Decimal.MAX_LONG_DIGITS()) { - putLong(rowId, d.toUnscaledLong(), offset); - } else { - final BigInteger integer = d.toJavaBigDecimal().unscaledValue(); - byte[] bytes = integer.toByteArray(); - putByteArray(rowId, bytes, 0, bytes.length, offset); + + public static class ColumnVectorProxy { + + private ColumnVector vector; + + public ColumnVectorProxy(ColumnarBatch columnarBatch, int ordinal) { + this.vector = columnarBatch.column(ordinal); + } + + public void putRowToColumnBatch(int rowId, Object value, int offset) { + org.apache.spark.sql.types.DataType t = dataType(offset); --- End diff -- It seems the offset param is not used in dataType, and please change the function name of dataType --- |
In reply to this post by qiuchenjian-2
Github user ravipesala commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r228096779 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/store/impl/safe/AbstractNonDictionaryVectorFiller.java --- @@ -48,7 +48,11 @@ public int getLengthFromBuffer(ByteBuffer buffer) { public static AbstractNonDictionaryVectorFiller getVectorFiller(DataType type, int lengthSize, int numberOfRows) { if (type == DataTypes.STRING) { - return new StringVectorFiller(lengthSize, numberOfRows); + if (lengthSize > 2) { --- End diff -- ok --- |
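A minimal sketch of the agreed change, for readers following the review. The constant name, and the LongStringVectorFiller branch for lengths wider than a short, are assumptions for illustration; the names actually used in the PR may differ.

// Hypothetical class-level constant replacing the magic number; the name is illustrative.
private static final int SHORT_LENGTH_SIZE_IN_BYTES = 2;

public static AbstractNonDictionaryVectorFiller getVectorFiller(DataType type, int lengthSize,
    int numberOfRows) {
  if (type == DataTypes.STRING) {
    // Lengths wider than a 2-byte short indicate the long-string (varchar) layout.
    if (lengthSize > SHORT_LENGTH_SIZE_IN_BYTES) {
      return new LongStringVectorFiller(lengthSize, numberOfRows); // assumed long-string filler
    }
    return new StringVectorFiller(lengthSize, numberOfRows);
  }
  // ... remaining data types handled as in the existing method
  throw new UnsupportedOperationException("Unsupported data type: " + type);
}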
In reply to this post by qiuchenjian-2
Github user ravipesala commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r228099219 --- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/BlockletScannedResult.java --- @@ -145,6 +147,8 @@ protected QueryStatisticsModel queryStatisticsModel; + protected LazyBlockletLoad lazyBlockletLoad; --- End diff -- In Presto they call it lazy read. I will rename it to LazyBlockletLoader to make the intent clearer. --- |
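For context on the lazy read technique mentioned here, a small generic Java sketch of deferring an expensive read until the value is first requested. This is illustrative only and is not the PR's LazyBlockletLoader implementation.

import java.util.function.Supplier;

/**
 * Generic illustration of lazy reading: the expensive load (for example, reading a
 * column chunk from disk) runs only when the value is first accessed.
 */
final class Lazy<T> {
  private final Supplier<T> loader;
  private T value;
  private boolean loaded;

  Lazy(Supplier<T> loader) {
    this.loader = loader;
  }

  synchronized T get() {
    if (!loaded) {
      value = loader.get(); // the real read happens here, on first access
      loaded = true;
    }
    return value;
  }
}

Usage would look like new Lazy<>(() -> readChunkFromDisk()).get(), where readChunkFromDisk stands in for whatever performs the actual I/O.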
In reply to this post by qiuchenjian-2
Github user ravipesala commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r228099605 --- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/vector/impl/directread/AbstractCarbonColumnarVector.java --- @@ -130,4 +131,9 @@ public CarbonColumnVector getDictionaryVector() { public void convert() { // Do nothing } + + @Override + public void setLazyPage(LazyPageLoad lazyPage) { + throw new UnsupportedOperationException("Not allowed from here"); --- End diff -- ok --- |
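One possible form of the agreed change in AbstractCarbonColumnarVector; the exact message wording is an assumption.

@Override
public void setLazyPage(LazyPageLoad lazyPage) {
  // Include the concrete class name so the stack trace alone identifies the offending vector type.
  throw new UnsupportedOperationException(
      "setLazyPage is not supported from " + this.getClass().getName());
}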
In reply to this post by qiuchenjian-2
Github user ravipesala commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r228100832 --- Diff: core/src/main/java/org/apache/carbondata/core/scan/scanner/LazyBlockletLoad.java --- @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.scan.scanner; + +import java.io.IOException; + +import org.apache.carbondata.core.datastore.FileReader; +import org.apache.carbondata.core.datastore.chunk.AbstractRawColumnChunk; +import org.apache.carbondata.core.datastore.chunk.impl.DimensionRawColumnChunk; +import org.apache.carbondata.core.datastore.chunk.impl.MeasureRawColumnChunk; +import org.apache.carbondata.core.scan.executor.infos.BlockExecutionInfo; +import org.apache.carbondata.core.scan.processor.RawBlockletColumnChunks; +import org.apache.carbondata.core.stats.QueryStatistic; +import org.apache.carbondata.core.stats.QueryStatisticsConstants; +import org.apache.carbondata.core.stats.QueryStatisticsModel; + +/** + * Reads the blocklet column chunks lazily, it means it reads the column chunks from disk when + * execution engine wants to access it. + * It is useful in case of filter queries with high cardinality columns. + */ +public class LazyBlockletLoad { + + private RawBlockletColumnChunks rawBlockletColumnChunks; + + private BlockExecutionInfo blockExecutionInfo; + + private LazyChunkWrapper[] dimLazyWrapperChunks; + + private LazyChunkWrapper[] msrLazyWrapperChunks; --- End diff -- We cannot do this at this level. Unifying dimension and measure handling should be done from the bottom layer, and it will take a lot of refactoring. Maybe we can target it in future versions. --- |
In reply to this post by qiuchenjian-2
Github user ravipesala commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2823#discussion_r228106032 --- Diff: integration/spark-datasource/src/main/spark2.1andspark2.2/org/apache/spark/sql/CarbonVectorProxy.java --- @@ -150,127 +140,189 @@ public void reset() { columnarBatch.reset(); } - public void putRowToColumnBatch(int rowId, Object value, int offset) { - org.apache.spark.sql.types.DataType t = dataType(offset); - if (null == value) { - putNull(rowId, offset); - } else { - if (t == org.apache.spark.sql.types.DataTypes.BooleanType) { - putBoolean(rowId, (boolean) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.ByteType) { - putByte(rowId, (byte) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.ShortType) { - putShort(rowId, (short) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.IntegerType) { - putInt(rowId, (int) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.LongType) { - putLong(rowId, (long) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.FloatType) { - putFloat(rowId, (float) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.DoubleType) { - putDouble(rowId, (double) value, offset); - } else if (t == org.apache.spark.sql.types.DataTypes.StringType) { - UTF8String v = (UTF8String) value; - putByteArray(rowId, v.getBytes(), offset); - } else if (t instanceof org.apache.spark.sql.types.DecimalType) { - DecimalType dt = (DecimalType) t; - Decimal d = Decimal.fromDecimal(value); - if (dt.precision() <= Decimal.MAX_INT_DIGITS()) { - putInt(rowId, (int) d.toUnscaledLong(), offset); - } else if (dt.precision() <= Decimal.MAX_LONG_DIGITS()) { - putLong(rowId, d.toUnscaledLong(), offset); - } else { - final BigInteger integer = d.toJavaBigDecimal().unscaledValue(); - byte[] bytes = integer.toByteArray(); - putByteArray(rowId, bytes, 0, bytes.length, offset); + + public static class ColumnVectorProxy { + + private ColumnVector vector; + + public ColumnVectorProxy(ColumnarBatch columnarBatch, int ordinal) { + this.vector = columnarBatch.column(ordinal); + } + + public void putRowToColumnBatch(int rowId, Object value, int offset) { + org.apache.spark.sql.types.DataType t = dataType(offset); --- End diff -- ok --- |
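A sketch of how the agreed cleanup inside ColumnVectorProxy might look: each proxy wraps a single ColumnVector, so the type lookup needs no offset parameter. The method name getDataType and the surrounding shape are assumptions; the PR may settle on different names.

import org.apache.spark.sql.execution.vectorized.ColumnVector;
import org.apache.spark.sql.execution.vectorized.ColumnarBatch;

public static class ColumnVectorProxy {

  private ColumnVector vector;

  public ColumnVectorProxy(ColumnarBatch columnarBatch, int ordinal) {
    this.vector = columnarBatch.column(ordinal);
  }

  // The proxy wraps exactly one vector, so no offset argument is needed here.
  public org.apache.spark.sql.types.DataType getDataType() {
    return vector.dataType();
  }
}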
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1228/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1015/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9280/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1027/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1239/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9292/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1240/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Failed with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9293/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2823 Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1028/ --- |