Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214341135

--- Diff: processing/src/main/java/org/apache/carbondata/processing/sort/sortdata/IntermediateSortTempRowComparator.java ---

@@ -45,18 +52,31 @@
   public int compare(IntermediateSortTempRow rowA, IntermediateSortTempRow rowB) {
    int diff = 0;
    int dictIndex = 0;
    int nonDictIndex = 0;
+   int noDicTypeIdx = 0;

    for (boolean isNoDictionary : isSortColumnNoDictionary) {
      if (isNoDictionary) {
-       byte[] byteArr1 = rowA.getNoDictSortDims()[nonDictIndex];
-       byte[] byteArr2 = rowB.getNoDictSortDims()[nonDictIndex];
-       nonDictIndex++;
+       if (DataTypeUtil.isPrimitiveColumn(noDicSortDataTypes[noDicTypeIdx])) {
+         // use data types based comparator for the no dictionary measure columns
+         SerializableComparator comparator = org.apache.carbondata.core.util.comparator.Comparator

--- End diff --

Increment the no-dictionary type index `noDicTypeIdx` inside the if block, not at the end of the method.

---
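An illustrative, self-contained sketch of the pattern being asked for (editor's note: this is not the PR code; the class and parameter names are hypothetical stand-ins):

```java
// Hypothetical, simplified comparator loop showing the suggested fix:
// the type index advances inside the if block that consumes it, so an
// early return can never leave it out of step.
final class NoDictIndexExample {
  static int compare(Comparable<Object>[] rowA, Comparable<Object>[] rowB,
      boolean[] isNoDictionary) {
    int noDicTypeIdx = 0;
    for (int i = 0; i < isNoDictionary.length; i++) {
      if (isNoDictionary[i]) {
        int diff = rowA[noDicTypeIdx].compareTo(rowB[noDicTypeIdx]);
        noDicTypeIdx++; // incremented here, not at the end of the method
        if (diff != 0) {
          return diff;
        }
      }
    }
    return 0;
  }
}
```

The same pattern applies to the `dataTypeIdx`, `noDicIdx`, and `noDicSortIdx` comments below.

---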
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214341442

--- Diff: processing/src/main/java/org/apache/carbondata/processing/sort/sortdata/NewRowComparator.java ---

@@ -43,15 +53,31 @@ public NewRowComparator(boolean[] noDictionarySortColumnMaping) {
   public int compare(Object[] rowA, Object[] rowB) {
    int diff = 0;
    int index = 0;
+   int dataTypeIdx = 0;
+   int noDicSortIdx = 0;

-   for (boolean isNoDictionary : noDictionarySortColumnMaping) {
-     if (isNoDictionary) {
-       byte[] byteArr1 = (byte[]) rowA[index];
-       byte[] byteArr2 = (byte[]) rowB[index];
+   for (int i = 0; i < noDicDimColMapping.length; i++) {
+     if (noDicDimColMapping[i]) {
+       if (noDicSortColumnMapping[noDicSortIdx++]) {
+         if (DataTypeUtil.isPrimitiveColumn(noDicDataTypes[dataTypeIdx])) {
+           // use data types based comparator for the no dictionary measure columns
+           SerializableComparator comparator =

--- End diff --

Increment `dataTypeIdx` inside the if block and remove the increment from the end of the method.

---
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214341633

--- Diff: processing/src/main/java/org/apache/carbondata/processing/loading/partition/impl/RawRowComparator.java ---

@@ -30,24 +33,39 @@ public class RawRowComparator implements Comparator<CarbonRow> {
   private int[] sortColumnIndices;
   private boolean[] isSortColumnNoDict;
+  private DataType[] noDicDataTypes;

-  public RawRowComparator(int[] sortColumnIndices, boolean[] isSortColumnNoDict) {
+  public RawRowComparator(int[] sortColumnIndices, boolean[] isSortColumnNoDict,
+      DataType[] noDicDataTypes) {
     this.sortColumnIndices = sortColumnIndices;
     this.isSortColumnNoDict = isSortColumnNoDict;
+    this.noDicDataTypes = noDicDataTypes;
   }

   @Override
   public int compare(CarbonRow o1, CarbonRow o2) {
     int diff = 0;
     int i = 0;
+    int noDicIdx = 0;
     for (int colIdx : sortColumnIndices) {
       if (isSortColumnNoDict[i]) {
-        byte[] colA = (byte[]) o1.getObject(colIdx);
-        byte[] colB = (byte[]) o2.getObject(colIdx);
-        diff = UnsafeComparer.INSTANCE.compareTo(colA, colB);
-        if (diff != 0) {
-          return diff;
+        if (DataTypeUtil.isPrimitiveColumn(noDicDataTypes[noDicIdx])) {
+          // for no dictionary numeric column get comparator based on the data type
+          SerializableComparator comparator = org.apache.carbondata.core.util.comparator.Comparator

--- End diff --

Increment `noDicIdx` inside the if block and remove the increment from the end of the method.

---
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214341815

--- Diff: processing/src/main/java/org/apache/carbondata/processing/loading/sort/unsafe/comparator/UnsafeRowComparator.java ---

@@ -60,26 +64,50 @@ public int compare(UnsafeCarbonRow rowL, Object baseObjectL, UnsafeCarbonRow row
       if (isNoDictionary) {
        short lengthA = CarbonUnsafe.getUnsafe().getShort(baseObjectL,
            rowA + dictSizeInMemory + sizeInNonDictPartA);
-       byte[] byteArr1 = new byte[lengthA];
        sizeInNonDictPartA += 2;
-       CarbonUnsafe.getUnsafe()
-           .copyMemory(baseObjectL, rowA + dictSizeInMemory + sizeInNonDictPartA,
-               byteArr1, CarbonUnsafe.BYTE_ARRAY_OFFSET, lengthA);
-       sizeInNonDictPartA += lengthA;
-
        short lengthB = CarbonUnsafe.getUnsafe().getShort(baseObjectR,
            rowB + dictSizeInMemory + sizeInNonDictPartB);
-       byte[] byteArr2 = new byte[lengthB];
        sizeInNonDictPartB += 2;
-       CarbonUnsafe.getUnsafe()
-           .copyMemory(baseObjectR, rowB + dictSizeInMemory + sizeInNonDictPartB,
-               byteArr2, CarbonUnsafe.BYTE_ARRAY_OFFSET, lengthB);
-       sizeInNonDictPartB += lengthB;
+       DataType dataType = tableFieldStat.getNoDicSortDataType()[noDicSortIdx];
+       if (DataTypeUtil.isPrimitiveColumn(dataType)) {
+         Object data1 = null;

--- End diff --

Increment `noDicSortIdx` inside the if block and remove the increment from the end of the method.

---
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214336180

--- Diff: processing/src/main/java/org/apache/carbondata/processing/loading/sort/SortStepRowHandler.java ---

@@ -224,10 +237,15 @@ public IntermediateSortTempRow readWithNoSortFieldConvert(
     // read no-dict & sort data
     for (int idx = 0; idx < this.noDictSortDimCnt; idx++) {
-      short len = inputStream.readShort();
-      byte[] bytes = new byte[len];
-      inputStream.readFully(bytes);
-      noDictSortDims[idx] = bytes;
+      // for no dict measure column get the original data
+      if (DataTypeUtil.isPrimitiveColumn(noDicSortDataTypes[idx])) {
+        noDictSortDims[idx] = readDataFromStream(inputStream, idx);
+      } else {
+        short len = inputStream.readShort();
+        byte[] bytes = new byte[len];
+        inputStream.readFully(bytes);
+        noDictSortDims[idx] = bytes;
+      }

--- End diff --

There is similar code above as well; refactor it into a single method and call it from both places.

---
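A hedged sketch of that refactor; `readNoDictSortDim` is a hypothetical name, and the body is lifted from the duplicated blocks (it assumes the surrounding class's `noDicSortDataTypes` field and the `readDataFromStream` helper shown in the diff):

```java
// Hypothetical shared helper; both read paths would call this instead
// of repeating the primitive-vs-bytes branch inline.
private Object readNoDictSortDim(DataInputStream inputStream, int idx) throws IOException {
  if (DataTypeUtil.isPrimitiveColumn(noDicSortDataTypes[idx])) {
    // no-dictionary primitive column: read back the original typed value
    return readDataFromStream(inputStream, idx);
  }
  // other no-dictionary columns are stored as length-prefixed bytes
  short len = inputStream.readShort();
  byte[] bytes = new byte[len];
  inputStream.readFully(bytes);
  return bytes;
}
```

---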
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2654 Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder1/8209/ --- |
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2654 Build Success with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/138/ --- |
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214351650

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/TableSpec.java ---

@@ -91,6 +92,30 @@ private void addMeasures(List<CarbonMeasure> measures) {
     }
   }

+  /**
+   * No dictionary and complex dimensions of the table
+   *
+   * @return
+   */
+  public DimensionSpec[] getNoDictAndComplexDimensions() {
+    List<Integer> noDicOrCompIndexes = new ArrayList<>(dimensionSpec.length);
+    int noDicCount = 0;
+    for (int i = 0; i < dimensionSpec.length; i++) {
+      if (dimensionSpec[i].getColumnType() == ColumnType.PLAIN_VALUE
+          || dimensionSpec[i].getColumnType() == ColumnType.COMPLEX_PRIMITIVE
+          || dimensionSpec[i].getColumnType() == ColumnType.COMPLEX) {
+        noDicOrCompIndexes.add(i);
+        noDicCount++;
+      }
+    }
+
+    DimensionSpec[] dims = new DimensionSpec[noDicCount];
+    for (int i = 0; i < dims.length; i++) {
+      dims[i] = dimensionSpec[noDicOrCompIndexes.get(i)];
+    }
+    return dims;

--- End diff --

Avoid the second for loop in this method.

---
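One way to do that, sketched from the code in the diff: collect the specs themselves rather than their indexes, so the second resolution loop disappears.

```java
public DimensionSpec[] getNoDictAndComplexDimensions() {
  List<DimensionSpec> noDicOrComplex = new ArrayList<>(dimensionSpec.length);
  for (DimensionSpec spec : dimensionSpec) {
    // same predicate as before: plain-value, complex-primitive or complex
    if (spec.getColumnType() == ColumnType.PLAIN_VALUE
        || spec.getColumnType() == ColumnType.COMPLEX_PRIMITIVE
        || spec.getColumnType() == ColumnType.COMPLEX) {
      noDicOrComplex.add(spec);
    }
  }
  return noDicOrComplex.toArray(new DimensionSpec[0]);
}
```

---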
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214352965

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java ---

@@ -346,12 +371,21 @@ static ColumnPageCodec selectCodecByAlgorithmForDecimal(SimpleStatsResult stats,
       // no effect to use adaptive or delta, use compression only
       return new DirectCompressCodec(stats.getDataType());
     }
+    boolean isSort = false;
+    boolean isInvertedIndex = false;
+    if (columnSpec instanceof TableSpec.DimensionSpec
+        && columnSpec.getColumnType() != ColumnType.COMPLEX_PRIMITIVE) {
+      isSort = ((TableSpec.DimensionSpec) columnSpec).isInSortColumns();
+      isInvertedIndex = isSort && ((TableSpec.DimensionSpec) columnSpec).isDoInvertedIndex();
+    }

--- End diff --

Extract the above changes into one method, since the same code is used in the places above as well, and call that method while creating the encoding type.

---
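A possible extraction (the helper name and the boolean-pair return shape are illustrative, not from the PR):

```java
// Hypothetical helper; returns {isSort, isInvertedIndex} so all the
// codec-selection sites share one copy of this check.
private static boolean[] resolveSortAndInvertedIndex(TableSpec.ColumnSpec columnSpec) {
  boolean isSort = false;
  boolean isInvertedIndex = false;
  if (columnSpec instanceof TableSpec.DimensionSpec
      && columnSpec.getColumnType() != ColumnType.COMPLEX_PRIMITIVE) {
    isSort = ((TableSpec.DimensionSpec) columnSpec).isInSortColumns();
    isInvertedIndex = isSort && ((TableSpec.DimensionSpec) columnSpec).isDoInvertedIndex();
  }
  return new boolean[] { isSort, isInvertedIndex };
}
```

---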
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214354541

--- Diff: processing/src/main/java/org/apache/carbondata/processing/store/TablePage.java ---

@@ -363,7 +398,16 @@ public EncodedTablePage getEncodedTablePage() {
           columnPageEncoder = encodingFactory.createEncoder(
               spec,
               noDictDimensionPages[noDictIndex]);
-          encodedPage = columnPageEncoder.encode(noDictDimensionPages[noDictIndex++]);
+          encodedPage = columnPageEncoder.encode(noDictDimensionPages[noDictIndex]);
+          DataType targetDataType =
+              columnPageEncoder.getTargetDataType(noDictDimensionPages[noDictIndex]);
+          if (null != targetDataType) {
+            LOGGER.info("Encoder result ---> Source data type: " + noDictDimensionPages[noDictIndex]

--- End diff --

Make this a debug-level log and guard it with an isDebugEnabled() check.

---
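The guarded form would look roughly like this (hedged sketch; `sourceInfo` is a stand-in for the truncated tail of the message shown in the diff):

```java
// Guard skips the string concatenation entirely when debug is disabled.
if (LOGGER.isDebugEnabled()) {
  LOGGER.debug("Encoder result ---> Source data type: " + sourceInfo);
}
```

---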
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214356896

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/chunk/reader/dimension/v3/CompressedDimensionChunkFileBasedReaderV3.java ---

@@ -239,12 +239,25 @@ private boolean isEncodedWithMeta(DataChunk2 pageMetadata) {
   protected DimensionColumnPage decodeDimension(DimensionRawColumnChunk rawColumnPage,
       ByteBuffer pageData, DataChunk2 pageMetadata, int offset)
       throws IOException, MemoryException {
+    List<Encoding> encodings = pageMetadata.getEncoders();
     if (isEncodedWithMeta(pageMetadata)) {
       ColumnPage decodedPage = decodeDimensionByMeta(pageMetadata, pageData, offset,
           null != rawColumnPage.getLocalDictionary());
       decodedPage.setNullBits(QueryUtil.getNullBitSet(pageMetadata.presence));
-      return new ColumnPageWrapper(decodedPage, rawColumnPage.getLocalDictionary(),
-          isEncodedWithAdaptiveMeta(pageMetadata));
+      int[] invertedIndexes = new int[0];

--- End diff --

Add a comment explaining that this scenario handles no-dictionary primitive type columns, where an inverted index can be created on the row ids during data load.

---
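For instance, a comment along these lines at the head of the block (wording illustrative, taken from this review remark):

```java
// this scenario handles no dictionary primitive type columns, where an
// inverted index can be created on the row ids during data load
int[] invertedIndexes = new int[0];
```

---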
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214361007

--- Diff: core/src/main/java/org/apache/carbondata/core/scan/filter/executer/IncludeFilterExecuterImpl.java ---

@@ -110,8 +112,19 @@ public BitSetGroup applyFilter(RawBlockletColumnChunks rawBlockletColumnChunks,
     boolean isDecoded = false;
     for (int i = 0; i < dimensionRawColumnChunk.getPagesCount(); i++) {
       if (dimensionRawColumnChunk.getMaxValues() != null) {
-        if (isScanRequired(dimensionRawColumnChunk.getMaxValues()[i],
-            dimensionRawColumnChunk.getMinValues()[i], dimColumnExecuterInfo.getFilterKeys())) {
+        boolean scanRequired;
+        // for no dictionary measure column comparison can be done
+        // on the original data as like measure column
+        if (DataTypeUtil.isPrimitiveColumn(dimColumnEvaluatorInfo.getDimension().getDataType())
+            && !dimColumnEvaluatorInfo.getDimension().hasEncoding(Encoding.DICTIONARY)) {
+          scanRequired = isScanRequired(dimensionRawColumnChunk.getMaxValues()[i],

--- End diff --

You can create an `isPrimitiveNoDictionaryColumn` flag and check `DataTypeUtil.isPrimitiveColumn` in the constructor. This will avoid the check for every page. Do this for all the filters.

---
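A hedged sketch of that flag, wired against the names visible in the diff (constructor arguments elided):

```java
// Computed once at construction time instead of once per page.
private final boolean isPrimitiveNoDictionaryColumn;

public IncludeFilterExecuterImpl(/* existing constructor arguments */) {
  // ... existing initialization ...
  this.isPrimitiveNoDictionaryColumn =
      DataTypeUtil.isPrimitiveColumn(dimColumnEvaluatorInfo.getDimension().getDataType())
          && !dimColumnEvaluatorInfo.getDimension().hasEncoding(Encoding.DICTIONARY);
}
```

The applyFilter loop would then branch on the stored flag instead of re-evaluating the data type for every page.

---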
Github user manishgupta88 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214371546

--- Diff: datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMap.java ---

@@ -331,8 +332,18 @@ private BloomQueryModel buildQueryModelInternal(CarbonColumn carbonColumn,
       // for dictionary/date columns, convert the surrogate key to bytes
       internalFilterValue = CarbonUtil.getValueAsBytes(DataTypes.INT, convertedValue);
     } else {
-      // for non dictionary dimensions, is already bytes,
-      internalFilterValue = (byte[]) convertedValue;
+      // for non dictionary dimensions, numeric columns will be of original data,
+      // so convert the data to bytes
+      if (DataTypeUtil.isPrimitiveColumn(carbonColumn.getDataType())) {
+        if (convertedValue == null) {

--- End diff --

If possible, initialize and store the flag in the constructor and remove the `DataTypeUtil.isPrimitiveColumn` check wherever applicable in the code below.

---
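A hedged sketch, assuming the column is known when the datamap is initialized (if the datamap covers several index columns, a per-column flag array would be needed instead; the class name here is hypothetical):

```java
// Hypothetical: resolve the primitive-column flag once up front rather
// than on every call to buildQueryModelInternal.
private final boolean isPrimitiveColumn;

private BloomDataMapExample(CarbonColumn carbonColumn) {
  this.isPrimitiveColumn = DataTypeUtil.isPrimitiveColumn(carbonColumn.getDataType());
}
```

---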
Github user manishgupta88 commented on the issue:
https://github.com/apache/carbondata/pull/2654

@dhatchayani You can raise one more PR to improve the code in a few places:

1. Unify the isScanRequired code in all the filter classes using an enum and a flag based on the min/max comparison.
2. Create a new page wrapper that extends ColumnPageWrapper and returns the actual data for no-dictionary primitive type columns.

---
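A rough shape for item 1 (editor's sketch; all names here are hypothetical, and `UnsafeComparer` is the byte comparator already used in the diffs above, assumed imported):

```java
// Hypothetical enum-dispatched min/max check shared by the filter
// executors; each executor would pick its constant once in the
// constructor, alongside the isPrimitiveNoDictionaryColumn flag.
enum MinMaxCompare {
  BYTE_ARRAY, PRIMITIVE;

  boolean isScanRequired(byte[] maxValue, byte[] minValue, byte[] filterKey) {
    switch (this) {
      case BYTE_ARRAY:
        // scan is needed when the filter key falls within [min, max]
        return UnsafeComparer.INSTANCE.compareTo(filterKey, minValue) >= 0
            && UnsafeComparer.INSTANCE.compareTo(filterKey, maxValue) <= 0;
      default:
        // PRIMITIVE would decode min/max to the column's data type and
        // use the data-type-based SerializableComparator instead
        throw new UnsupportedOperationException("sketch only");
    }
  }
}
```

---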
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2654 Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder1/8214/ --- |
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2654 Build Success with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/143/ --- |
Github user kumarvishal09 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214504899

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/columnar/BlockIndexerStorage.java ---

@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.core.datastore.columnar;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+
+public class BlockIndexerStorage {
+
+  /**
+   * It compresses depends up on the sequence numbers.
+   * [1,2,3,4,6,8,10,11,12,13] is translated to [1,4,6,8,10,13] and [0,6]. In
+   * first array the start and end of sequential numbers and second array
+   * keeps the indexes of where sequential numbers starts. If there is no
+   * sequential numbers then the same array it returns with empty second
+   * array.
+   *
+   * @param rowIds
+   */
+  public static Map<String, short[]> rleEncodeOnRowId(short[] rowIds, short[] rowIdPage,

--- End diff --

Move this code to CarbonUtil.

---
Github user kumarvishal09 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2654#discussion_r214504900

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/columnar/BlockIndexerStorage.java ---

@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.core.datastore.columnar;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+
+public class BlockIndexerStorage {
+
+  /**
+   * It compresses depends up on the sequence numbers.
+   * [1,2,3,4,6,8,10,11,12,13] is translated to [1,4,6,8,10,13] and [0,6]. In
+   * first array the start and end of sequential numbers and second array
+   * keeps the indexes of where sequential numbers starts. If there is no
+   * sequential numbers then the same array it returns with empty second
+   * array.
+   *
+   * @param rowIds
+   */
+  public static Map<String, short[]> rleEncodeOnRowId(short[] rowIds, short[] rowIdPage,
+      short[] rowIdRlePage) {
+    List<Short> list = new ArrayList<Short>(CarbonCommonConstants.CONSTANT_SIZE_TEN);
+    List<Short> map = new ArrayList<Short>(CarbonCommonConstants.CONSTANT_SIZE_TEN);
+    int k = 0;
+    int i = 1;
+    for (; i < rowIds.length; i++) {
+      if (rowIds[i] - rowIds[i - 1] == 1) {
+        k++;
+      } else {
+        if (k > 0) {
+          map.add(((short) list.size()));
+          list.add(rowIds[i - k - 1]);
+          list.add(rowIds[i - 1]);
+        } else {
+          list.add(rowIds[i - 1]);
+        }
+        k = 0;
+      }
+    }
+    if (k > 0) {
+      map.add(((short) list.size()));
+      list.add(rowIds[i - k - 1]);
+      list.add(rowIds[i - 1]);
+    } else {
+      list.add(rowIds[i - 1]);
+    }
+    int compressionPercentage = (((list.size() + map.size()) * 100) / rowIds.length);
+    if (compressionPercentage > 70) {
+      rowIdPage = rowIds;
+    } else {
+      rowIdPage = convertToArray(list);
+    }
+    if (rowIds.length == rowIdPage.length) {
+      rowIdRlePage = new short[0];
+    } else {
+      rowIdRlePage = convertToArray(map);
+    }
+    Map<String, short[]> rowIdAndRowRleIdPages = new HashMap<>(2);
+    rowIdAndRowRleIdPages.put("rowIdPage", rowIdPage);
+    rowIdAndRowRleIdPages.put("rowRlePage", rowIdRlePage);
+    return rowIdAndRowRleIdPages;
+  }
+
+  public static short[] convertToArray(List<Short> list) {

--- End diff --

Move this code to CarbonUtil.

---
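For reference, a quick illustrative call (assuming the helpers keep these signatures after the move to CarbonUtil). A fully sequential input compresses to its run boundaries plus one run-start offset; note that the array parameters are reassigned only locally, so callers must read the results from the returned map:

```java
short[] rowIds = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
Map<String, short[]> pages =
    BlockIndexerStorage.rleEncodeOnRowId(rowIds, new short[0], new short[0]);
short[] rowIdPage = pages.get("rowIdPage");   // [0, 9]: one run from 0 to 9
short[] rowRlePage = pages.get("rowRlePage"); // [0]: the run starts at list offset 0
```

Inputs that compress poorly are kept as-is: whenever the encoded form exceeds 70% of the original length, the raw row ids are stored and the RLE page is left empty (the javadoc's own example, 10 ids encoding to 6 + 2 entries, hits 80% and would therefore be stored uncompressed).

---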
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2654 Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder1/8221/ --- |
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2654 Build Success with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/150/ --- |