Github user sujith71955 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r224987867 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/AbstractRawColumnChunk.java --- @@ -129,4 +138,35 @@ public void setDataChunkV3(DataChunk3 dataChunkV3) { public void setMinMaxFlagArray(boolean[] minMaxFlagArray) { this.minMaxFlagArray = minMaxFlagArray; } + public boolean isAdaptiveForDictionary() { --- End diff -- please add comments for public methods for better maintainability --- |
In reply to this post by qiuchenjian-2
Github user sujith71955 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r224987956 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/AbstractPrimitiveDimColumnPage.java --- @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.datastore.chunk.impl; + +import org.apache.carbondata.core.datastore.chunk.DimensionColumnPage; +import org.apache.carbondata.core.datastore.page.ColumnPage; +import org.apache.carbondata.core.metadata.blocklet.PresenceMeta; + +public abstract class AbstractPrimitiveDimColumnPage implements DimensionColumnPage { --- End diff -- Please add comments at the class level and at the public method level. --- |
In reply to this post by qiuchenjian-2
Github user sujith71955 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r224988211 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/AbstractRawColumnChunk.java --- @@ -47,6 +48,14 @@ private DataChunk3 dataChunkV3; + private PresenceMeta[] presenceMeta; + + private boolean isLVSeperated; + + private boolean isAdaptiveForDictionary; --- End diff -- Is it required for both dictionary and no-dictionary? Since it's an abstract class, can we just add a property in the base class to know whether it's adaptive or not? The child is DimensionRawColumnChunk anyway. --- |
In reply to this post by qiuchenjian-2
Github user sujith71955 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r224988369 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/DimensionColumnPage.java --- @@ -97,21 +96,19 @@ * @param compareValue value to compare * @return compare result */ - int compareTo(int rowId, byte[] compareValue); + int compareTo(int rowId, Object compareValue); /** * below method will be used to free the allocated memory */ void freeMemory(); - /** - * to check whether the page is adaptive encoded - */ boolean isAdaptiveEncoded(); /** - * to get the null bit sets in case of adaptive encoded page + * method return presence meta which represents + * rowid for which values are null or not null + * @return */ - BitSet getNullBits(); - + PresenceMeta getPresentMeta(); --- End diff -- The purpose of this method is not clear; PresenceMeta does not seem very informative, going by the comment you provided on the method. --- |
In reply to this post by qiuchenjian-2
Github user kunal642 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225037897 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/AbstractRawColumnChunk.java --- @@ -47,6 +48,14 @@ private DataChunk3 dataChunkV3; + private PresenceMeta[] presenceMeta; + + private boolean isLVSeperated; + + private boolean isAdaptiveForDictionary; --- End diff -- I agree isAdaptiveForDictionary and isAdaptiveForNoDictionary should be in DimensionRawColumnChunk --- |
In reply to this post by qiuchenjian-2
Github user kunal642 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225037399 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/TableSpec.java --- @@ -261,6 +261,14 @@ public void readFields(DataInput in) throws IOException { this.doInvertedIndex = dimension.isUseInvertedIndex(); } + DimensionSpec(String columnName, DataType dataType, ColumnType columnType) { + super(columnName, dataType, columnType); + } + + public static DimensionSpec newInstance(String columnName, DataType dataType, --- End diff -- Since you are exposing a newInstance() method to create the object, both constructors can be made private. I think it is better to add a newInstance() for the old constructor as well. The same goes for MeasureSpec. --- |
In reply to this post by qiuchenjian-2
Github user kunal642 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225038241 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/impl/AbstractDimensionColumnPage.java --- @@ -107,4 +98,12 @@ protected boolean isExplicitSorted(int[] invertedIndex) { @Override public boolean isNoDicitionaryColumn() { return false; } + + @Override public boolean isAdaptiveEncoded() { + return false; + } + + @Override public PresenceMeta getPresentMeta() { + return null; + } --- End diff -- Only the method definition is moved below. Please revert this change. --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2729 Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/789/ --- |
In reply to this post by qiuchenjian-2
Github user ravipesala commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225084679 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/HighCardDictDimensionIndexCodec.java --- @@ -17,71 +17,117 @@ package org.apache.carbondata.core.datastore.page.encoding.dimension.legacy; +import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.Map; -import org.apache.carbondata.core.datastore.columnar.BlockIndexerStorage; -import org.apache.carbondata.core.datastore.columnar.BlockIndexerStorageForNoInvertedIndexForShort; -import org.apache.carbondata.core.datastore.columnar.BlockIndexerStorageForShort; +import org.apache.carbondata.core.datastore.columnar.BinaryPageIndexGenerator; +import org.apache.carbondata.core.datastore.columnar.PageIndexGenerator; import org.apache.carbondata.core.datastore.compression.Compressor; import org.apache.carbondata.core.datastore.compression.CompressorFactory; import org.apache.carbondata.core.datastore.page.ColumnPage; import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoder; +import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataType; +import org.apache.carbondata.core.metadata.datatype.DataTypes; import org.apache.carbondata.core.util.ByteUtil; import org.apache.carbondata.format.Encoding; +/** + * Codec class for binary/String data type columns + */ public class HighCardDictDimensionIndexCodec extends IndexStorageCodec { - /** - * whether this column is varchar data type(long string) - */ - private boolean isVarcharType; - public HighCardDictDimensionIndexCodec(boolean isSort, boolean isInvertedIndex, - boolean isVarcharType) { - super(isSort, isInvertedIndex); - this.isVarcharType = isVarcharType; + private final List<Encoding> encodingList; + + public HighCardDictDimensionIndexCodec(boolean isSort) { + super(isSort); + encodingList = new ArrayList<>(); + 
encodingList.add(Encoding.DIRECT_STRING); } @Override public String getName() { return "HighCardDictDimensionIndexCodec"; } - @Override - public ColumnPageEncoder createEncoder(Map<String, String> parameter) { - return new IndexStorageEncoder() { - + @Override public ColumnPageEncoder createEncoder(Map<String, Object> parameter) { + return new IndexStorageEncoder(true, null, encodingList) { + private final int THREE_BYTES_MAX = (int) Math.pow(2, 23) - 1; + private final int THREE_BYTES_MIN = - THREE_BYTES_MAX - 1; @Override - protected void encodeIndexStorage(ColumnPage input) { - BlockIndexerStorage<byte[][]> indexStorage; + protected void encodeIndexStorage(ColumnPage input) throws MemoryException, IOException { + PageIndexGenerator<byte[][]> pageIndexGenerator; + // get actual data byte[][] data = input.getByteArrayPage(); - boolean isDictionary = input.isLocalDictGeneratedPage(); - if (isInvertedIndex) { - indexStorage = new BlockIndexerStorageForShort(data, isDictionary, !isDictionary, isSort); - } else { - indexStorage = - new BlockIndexerStorageForNoInvertedIndexForShort(data, isDictionary); + // fill length array + int[] lengthArray = new int[data.length]; + int max = Integer.MIN_VALUE; + int min = Integer.MAX_VALUE; + int currentDataLength; + int size = 0; + for (int i = 0; i < lengthArray.length; i++) { + currentDataLength = data[i].length; + lengthArray[i] = currentDataLength; + size += currentDataLength; + if (max < currentDataLength) { + max = currentDataLength; + } + if (min > currentDataLength) { + min = currentDataLength; + } } - byte[] flattened = ByteUtil.flatten(indexStorage.getDataPage()); Compressor compressor = CompressorFactory.getInstance().getCompressor( input.getColumnCompressorName()); + pageIndexGenerator = + new BinaryPageIndexGenerator(data, isSort, lengthArray); + // free memory + selectedDataType = fitLongMinMax(max, min); + byte[][] dataPage = pageIndexGenerator.getDataPage(); + ByteBuffer byteBuffer; + if (DataTypes.BYTE == 
selectedDataType) { + byteBuffer = ByteBuffer.allocate(lengthArray.length + size); + for (int i = 0; i < lengthArray.length; i++) { + byteBuffer.put((byte) lengthArray[i]); + byteBuffer.put(dataPage[i]); + } + } else if (DataTypes.SHORT == selectedDataType) { + byteBuffer = ByteBuffer.allocate((lengthArray.length * 2) + size); + for (int i = 0; i < lengthArray.length; i++) { + byteBuffer.putShort((short) lengthArray[i]); + byteBuffer.put(dataPage[i]); + } + } else if (DataTypes.SHORT_INT == selectedDataType) { + byteBuffer = ByteBuffer.allocate((lengthArray.length * 3) + size); + for (int i = 0; i < lengthArray.length; i++) { + byteBuffer.put(ByteUtil.to3Bytes(lengthArray[i])); + byteBuffer.put(dataPage[i]); + } + } else { + byteBuffer = ByteBuffer.allocate((lengthArray.length * 4) + size); + for (int i = 0; i < lengthArray.length; i++) { + byteBuffer.putInt(lengthArray[i]); + byteBuffer.put(dataPage[i]); + } + } + byteBuffer.rewind(); + byte[] flattened = byteBuffer.array(); super.compressedDataPage = compressor.compressByte(flattened); - super.indexStorage = indexStorage; + super.pageIndexGenerator = pageIndexGenerator; } - @Override - protected List<Encoding> getEncodingList() { - List<Encoding> encodings = new ArrayList<>(); - if (isVarcharType) { - encodings.add(Encoding.DIRECT_COMPRESS_VARCHAR); - } else if (indexStorage.getRowIdPageLengthInBytes() > 0) { - encodings.add(Encoding.INVERTED_INDEX); - } - if (indexStorage.getDataRlePageLengthInBytes() > 0) { - encodings.add(Encoding.RLE); + private DataType fitLongMinMax(int max, int min) { --- End diff -- Better move this method to encoding strategy --- |
In reply to this post by qiuchenjian-2
Github user ravipesala commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225087082 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/PrimitiveTypeColumnCodec.java --- @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.core.datastore.page.encoding.dimension.legacy; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import org.apache.carbondata.core.datastore.TableSpec; +import org.apache.carbondata.core.datastore.columnar.PageIndexGenerator; +import org.apache.carbondata.core.datastore.columnar.PrimitivePageIndexGenerator; +import org.apache.carbondata.core.datastore.page.ColumnPage; +import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoder; +import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoderMeta; +import org.apache.carbondata.core.datastore.page.encoding.DefaultEncodingFactory; +import org.apache.carbondata.core.datastore.page.statistics.PrimitivePageStatsCollector; +import org.apache.carbondata.core.keygenerator.KeyGenerator; +import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataType; +import org.apache.carbondata.core.metadata.datatype.DataTypes; +import org.apache.carbondata.core.util.ByteUtil; +import org.apache.carbondata.format.Encoding; + +/** + * Codec class for primitive data type columns + */ +public class PrimitiveTypeColumnCodec extends IndexStorageCodec { --- End diff -- Please rename the class so that its name tells the work it does, e.g. PrimitiveInvertedIndexCodec or PrimitiveDimensionCodec. --- |
In reply to this post by qiuchenjian-2
Github user ravipesala commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225088520 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/IndexStorageEncoder.java --- @@ -20,43 +20,86 @@ import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; import org.apache.carbondata.core.constants.CarbonCommonConstants; -import org.apache.carbondata.core.datastore.columnar.BlockIndexerStorage; +import org.apache.carbondata.core.datastore.columnar.PageIndexGenerator; import org.apache.carbondata.core.datastore.page.ColumnPage; import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoder; import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoderMeta; +import org.apache.carbondata.core.datastore.page.encoding.EncodedColumnPage; +import org.apache.carbondata.core.keygenerator.KeyGenerator; import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataType; +import org.apache.carbondata.core.util.DataTypeUtil; +import org.apache.carbondata.format.BlockletMinMaxIndex; import org.apache.carbondata.format.DataChunk2; +import org.apache.carbondata.format.Encoding; import org.apache.carbondata.format.SortState; public abstract class IndexStorageEncoder extends ColumnPageEncoder { - BlockIndexerStorage indexStorage; + /** + * index generator + */ + PageIndexGenerator pageIndexGenerator; + /** + * compressed data + */ byte[] compressedDataPage; - abstract void encodeIndexStorage(ColumnPage inputPage); + /** + * encoded data page, in case of + */ + EncodedColumnPage encodedColumnPage; + + /** + * whether to store offset for column data + */ + private boolean storeOffset; --- End diff -- Please correct the name to something appropriate. A better name would be `isAdaptiveOnLength`. --- |
In reply to this post by qiuchenjian-2
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225088657 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/IndexStorageEncoder.java --- @@ -66,26 +109,76 @@ return result; } + @Override protected ColumnPageEncoderMeta getEncoderMeta(ColumnPage inputPage) { return null; } @Override - protected void fillLegacyFields(DataChunk2 dataChunk) - throws IOException { - SortState sort = (indexStorage.getRowIdPageLengthInBytes() > 0) ? + protected List<ByteBuffer> buildEncoderMeta(ColumnPage inputPage) throws IOException { + if (this.storeOffset) { + List<ByteBuffer> metaDatas = new ArrayList<>(); + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + DataOutputStream out = new DataOutputStream(stream); + out.writeByte(selectedDataType.getId()); + metaDatas.add(ByteBuffer.wrap(stream.toByteArray())); + return metaDatas; --- End diff -- We can remove stream usage here and directly write the byte to ByteBuffer --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2729 Build Success with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9054/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2729 Build Success with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/986/ --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2729 Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/790/ --- |
In reply to this post by qiuchenjian-2
Github user kumarvishal09 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225135466 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/IndexStorageEncoder.java --- @@ -66,26 +109,76 @@ return result; } + @Override protected ColumnPageEncoderMeta getEncoderMeta(ColumnPage inputPage) { return null; } @Override - protected void fillLegacyFields(DataChunk2 dataChunk) - throws IOException { - SortState sort = (indexStorage.getRowIdPageLengthInBytes() > 0) ? + protected List<ByteBuffer> buildEncoderMeta(ColumnPage inputPage) throws IOException { + if (this.storeOffset) { + List<ByteBuffer> metaDatas = new ArrayList<>(); + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + DataOutputStream out = new DataOutputStream(stream); + out.writeByte(selectedDataType.getId()); + metaDatas.add(ByteBuffer.wrap(stream.toByteArray())); + return metaDatas; --- End diff -- Ok --- |
In reply to this post by qiuchenjian-2
Github user kumarvishal09 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225135490 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/IndexStorageEncoder.java --- @@ -20,43 +20,86 @@ import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; import org.apache.carbondata.core.constants.CarbonCommonConstants; -import org.apache.carbondata.core.datastore.columnar.BlockIndexerStorage; +import org.apache.carbondata.core.datastore.columnar.PageIndexGenerator; import org.apache.carbondata.core.datastore.page.ColumnPage; import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoder; import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoderMeta; +import org.apache.carbondata.core.datastore.page.encoding.EncodedColumnPage; +import org.apache.carbondata.core.keygenerator.KeyGenerator; import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataType; +import org.apache.carbondata.core.util.DataTypeUtil; +import org.apache.carbondata.format.BlockletMinMaxIndex; import org.apache.carbondata.format.DataChunk2; +import org.apache.carbondata.format.Encoding; import org.apache.carbondata.format.SortState; public abstract class IndexStorageEncoder extends ColumnPageEncoder { - BlockIndexerStorage indexStorage; + /** + * index generator + */ + PageIndexGenerator pageIndexGenerator; + /** + * compressed data + */ byte[] compressedDataPage; - abstract void encodeIndexStorage(ColumnPage inputPage); + /** + * encoded data page, in case of + */ + EncodedColumnPage encodedColumnPage; + + /** + * whether to store offset for column data + */ + private boolean storeOffset; --- End diff -- ok --- |
In reply to this post by qiuchenjian-2
Github user kumarvishal09 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225135542 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/PrimitiveTypeColumnCodec.java --- @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.core.datastore.page.encoding.dimension.legacy; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import org.apache.carbondata.core.datastore.TableSpec; +import org.apache.carbondata.core.datastore.columnar.PageIndexGenerator; +import org.apache.carbondata.core.datastore.columnar.PrimitivePageIndexGenerator; +import org.apache.carbondata.core.datastore.page.ColumnPage; +import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoder; +import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoderMeta; +import org.apache.carbondata.core.datastore.page.encoding.DefaultEncodingFactory; +import org.apache.carbondata.core.datastore.page.statistics.PrimitivePageStatsCollector; +import org.apache.carbondata.core.keygenerator.KeyGenerator; +import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataType; +import org.apache.carbondata.core.metadata.datatype.DataTypes; +import org.apache.carbondata.core.util.ByteUtil; +import org.apache.carbondata.format.Encoding; + +/** + * Codec class for primitive data type columns + */ +public class PrimitiveTypeColumnCodec extends IndexStorageCodec { --- End diff -- ok --- |
In reply to this post by qiuchenjian-2
Github user kumarvishal09 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2729#discussion_r225135585 --- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/dimension/legacy/HighCardDictDimensionIndexCodec.java --- @@ -17,71 +17,117 @@ package org.apache.carbondata.core.datastore.page.encoding.dimension.legacy; +import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.Map; -import org.apache.carbondata.core.datastore.columnar.BlockIndexerStorage; -import org.apache.carbondata.core.datastore.columnar.BlockIndexerStorageForNoInvertedIndexForShort; -import org.apache.carbondata.core.datastore.columnar.BlockIndexerStorageForShort; +import org.apache.carbondata.core.datastore.columnar.BinaryPageIndexGenerator; +import org.apache.carbondata.core.datastore.columnar.PageIndexGenerator; import org.apache.carbondata.core.datastore.compression.Compressor; import org.apache.carbondata.core.datastore.compression.CompressorFactory; import org.apache.carbondata.core.datastore.page.ColumnPage; import org.apache.carbondata.core.datastore.page.encoding.ColumnPageEncoder; +import org.apache.carbondata.core.memory.MemoryException; +import org.apache.carbondata.core.metadata.datatype.DataType; +import org.apache.carbondata.core.metadata.datatype.DataTypes; import org.apache.carbondata.core.util.ByteUtil; import org.apache.carbondata.format.Encoding; +/** + * Codec class for binary/String data type columns + */ public class HighCardDictDimensionIndexCodec extends IndexStorageCodec { - /** - * whether this column is varchar data type(long string) - */ - private boolean isVarcharType; - public HighCardDictDimensionIndexCodec(boolean isSort, boolean isInvertedIndex, - boolean isVarcharType) { - super(isSort, isInvertedIndex); - this.isVarcharType = isVarcharType; + private final List<Encoding> encodingList; + + public HighCardDictDimensionIndexCodec(boolean isSort) { + super(isSort); + encodingList = new ArrayList<>(); + 
encodingList.add(Encoding.DIRECT_STRING); } @Override public String getName() { return "HighCardDictDimensionIndexCodec"; } - @Override - public ColumnPageEncoder createEncoder(Map<String, String> parameter) { - return new IndexStorageEncoder() { - + @Override public ColumnPageEncoder createEncoder(Map<String, Object> parameter) { + return new IndexStorageEncoder(true, null, encodingList) { + private final int THREE_BYTES_MAX = (int) Math.pow(2, 23) - 1; + private final int THREE_BYTES_MIN = - THREE_BYTES_MAX - 1; @Override - protected void encodeIndexStorage(ColumnPage input) { - BlockIndexerStorage<byte[][]> indexStorage; + protected void encodeIndexStorage(ColumnPage input) throws MemoryException, IOException { + PageIndexGenerator<byte[][]> pageIndexGenerator; + // get actual data byte[][] data = input.getByteArrayPage(); - boolean isDictionary = input.isLocalDictGeneratedPage(); - if (isInvertedIndex) { - indexStorage = new BlockIndexerStorageForShort(data, isDictionary, !isDictionary, isSort); - } else { - indexStorage = - new BlockIndexerStorageForNoInvertedIndexForShort(data, isDictionary); + // fill length array + int[] lengthArray = new int[data.length]; + int max = Integer.MIN_VALUE; + int min = Integer.MAX_VALUE; + int currentDataLength; + int size = 0; + for (int i = 0; i < lengthArray.length; i++) { + currentDataLength = data[i].length; + lengthArray[i] = currentDataLength; + size += currentDataLength; + if (max < currentDataLength) { + max = currentDataLength; + } + if (min > currentDataLength) { + min = currentDataLength; + } } - byte[] flattened = ByteUtil.flatten(indexStorage.getDataPage()); Compressor compressor = CompressorFactory.getInstance().getCompressor( input.getColumnCompressorName()); + pageIndexGenerator = + new BinaryPageIndexGenerator(data, isSort, lengthArray); + // free memory + selectedDataType = fitLongMinMax(max, min); + byte[][] dataPage = pageIndexGenerator.getDataPage(); + ByteBuffer byteBuffer; + if (DataTypes.BYTE == 
selectedDataType) { + byteBuffer = ByteBuffer.allocate(lengthArray.length + size); + for (int i = 0; i < lengthArray.length; i++) { + byteBuffer.put((byte) lengthArray[i]); + byteBuffer.put(dataPage[i]); + } + } else if (DataTypes.SHORT == selectedDataType) { + byteBuffer = ByteBuffer.allocate((lengthArray.length * 2) + size); + for (int i = 0; i < lengthArray.length; i++) { + byteBuffer.putShort((short) lengthArray[i]); + byteBuffer.put(dataPage[i]); + } + } else if (DataTypes.SHORT_INT == selectedDataType) { + byteBuffer = ByteBuffer.allocate((lengthArray.length * 3) + size); + for (int i = 0; i < lengthArray.length; i++) { + byteBuffer.put(ByteUtil.to3Bytes(lengthArray[i])); + byteBuffer.put(dataPage[i]); + } + } else { + byteBuffer = ByteBuffer.allocate((lengthArray.length * 4) + size); + for (int i = 0; i < lengthArray.length; i++) { + byteBuffer.putInt(lengthArray[i]); + byteBuffer.put(dataPage[i]); + } + } + byteBuffer.rewind(); + byte[] flattened = byteBuffer.array(); super.compressedDataPage = compressor.compressByte(flattened); - super.indexStorage = indexStorage; + super.pageIndexGenerator = pageIndexGenerator; } - @Override - protected List<Encoding> getEncodingList() { - List<Encoding> encodings = new ArrayList<>(); - if (isVarcharType) { - encodings.add(Encoding.DIRECT_COMPRESS_VARCHAR); - } else if (indexStorage.getRowIdPageLengthInBytes() > 0) { - encodings.add(Encoding.INVERTED_INDEX); - } - if (indexStorage.getDataRlePageLengthInBytes() > 0) { - encodings.add(Encoding.RLE); + private DataType fitLongMinMax(int max, int min) { --- End diff -- ok --- |
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2729 Build Success with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9055/ --- |
Free forum by Nabble | Edit this page |