Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196633906

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/DefaultEncodingFactory.java ---
@@ -103,6 +103,7 @@ private ColumnPageEncoder createEncoderForDimensionLegacy(TableSpec.DimensionSpec
      return new HighCardDictDimensionIndexCodec(
          dimensionSpec.isInSortColumns(),
--- End diff --

Hmm, better not to do this in this PR. All the parameters for the *IndexCodec classes look alike, so changing all of them would introduce unrelated changes.

---
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196634217

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java ---
@@ -71,7 +72,7 @@ public ColumnPageDecoder createDecoder(List<Encoding> encodings, List<ByteBuffer>
      byte[] encoderMeta = encoderMetas.get(0).array();
      ByteArrayInputStream stream = new ByteArrayInputStream(encoderMeta);
      DataInputStream in = new DataInputStream(stream);
-     if (encoding == DIRECT_COMPRESS) {
+     if (encoding == DIRECT_COMPRESS || encoding == DIRECT_COMPRESS_VARCHAR) {
--- End diff --

`DIRECT_COMPRESS_VARCHAR` is a special encoding used to distinguish normal strings from long strings. It is used in CompressedDimensionChunkFileBasedReaderV3 to indicate the DimensionStoreType.
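For illustration, a plausible shape of the decoder branch above: the condition comes from the diff, while the meta-handling calls are assumptions about the surrounding factory code.

    // Sketch: DIRECT_COMPRESS_VARCHAR builds the same decoder as
    // DIRECT_COMPRESS; the distinct enum value only marks the column
    // as a long string for the reader.
    if (encoding == DIRECT_COMPRESS || encoding == DIRECT_COMPRESS_VARCHAR) {
      ColumnPageEncoderMeta metadata = new ColumnPageEncoderMeta();
      metadata.readFields(in); // deserialize the meta written at encode time
      return new DirectCompressCodec(metadata.getStoreDataType()).createDecoder(metadata);
    }

---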
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196634573

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/compress/DirectCompressCodec.java ---
@@ -64,7 +64,7 @@ public ColumnPageDecoder createDecoder(ColumnPageEncoderMeta meta) {
      return new DirectDecompressor(meta);
  }
- private static class DirectCompressor extends ColumnPageEncoder {
--- End diff --

Yeah, it is required because in the method `getEncodingList` we want to use the member `datatype` from the outer class. If it were a static inner class, we could not access that member.
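A minimal, self-contained sketch of the Java rule in question, with hypothetical names: only a non-static inner class holds an implicit reference to its enclosing instance.

    public class OuterCodec {
      private String datatype = "VARCHAR";

      // Non-static inner class: may read the outer instance's fields directly.
      class InnerEncoder {
        String encodingFor() {
          return datatype; // legal
        }
      }

      // Static nested class: has no outer instance, so 'datatype' is not visible.
      static class StaticEncoder {
        // String encodingFor() { return datatype; } // would not compile
      }
    }

---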
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196634976

--- Diff: core/src/main/java/org/apache/carbondata/core/metadata/blocklet/BlockletInfo.java ---
@@ -205,7 +205,7 @@ public void setNumberOfPages(int numberOfPages) {
      output.writeLong(dimensionOffset);
      output.writeLong(measureOffsets);
      int dsize = dimensionChunkOffsets != null ? dimensionChunkOffsets.size() : 0;
-     output.writeShort(dsize);
+     output.writeInt(dsize);
--- End diff --

OK~

---
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196634986

--- Diff: core/src/main/java/org/apache/carbondata/core/metadata/blocklet/BlockletInfo.java ---
@@ -268,7 +268,7 @@ private DataChunk deserializeDataChunk(byte[] bytes) throws IOException {
  @Override
  public void readFields(DataInput input) throws IOException {
      dimensionOffset = input.readLong();
      measureOffsets = input.readLong();
-     short dimensionChunkOffsetsSize = input.readShort();
+     int dimensionChunkOffsetsSize = input.readInt();
--- End diff --

OK~
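The two BlockletInfo diffs above must change together: the write and read sides have to stay symmetric, and a short caps the chunk-offset count at 32767 entries. A self-contained sketch of the pattern, with illustrative names:

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    import java.util.List;

    class OffsetsSerde {
      // Write side: an int count lifts the former 32767-entry limit.
      static void write(DataOutput out, List<Long> chunkOffsets) throws IOException {
        int size = chunkOffsets != null ? chunkOffsets.size() : 0;
        out.writeInt(size);               // was writeShort(size)
        for (int i = 0; i < size; i++) {
          out.writeLong(chunkOffsets.get(i));
        }
      }

      // Read side: must mirror the write side exactly.
      static long[] read(DataInput in) throws IOException {
        int size = in.readInt();          // was readShort()
        long[] chunkOffsets = new long[size];
        for (int i = 0; i < size; i++) {
          chunkOffsets[i] = in.readLong();
        }
        return chunkOffsets;
      }
    }

---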
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196635418

--- Diff: integration/spark-common/src/main/scala/org/apache/spark/sql/catalyst/CarbonDDLSqlParser.scala ---
@@ -279,7 +279,7 @@ abstract class CarbonDDLSqlParser extends AbstractCarbonSparkSQLParser {
      fields.zipWithIndex.foreach { case (field, index) =>
        field.schemaOrdinal = index
      }
-     val (dims, msrs, noDictionaryDims, sortKeyDims) = extractDimAndMsrFields(
+     val (dims, msrs, noDictionaryDims, sortKeyDims, varcharColumns) = extractDimAndMsrFields(
--- End diff --

Just like the other results of `extractDimAndMsrFields`, we validate and extract the sort columns, the dictionary columns and the varchar columns (long string columns). For the varchar columns, we later change their data type from string to varchar, as sketched below.
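A self-contained sketch of that remapping step, written in Java for consistency with the other examples here; the real code is Scala in CarbonDDLSqlParser, and the field model below is a hypothetical stand-in.

    import java.util.List;
    import java.util.Set;

    class VarcharRemapSketch {
      // Hypothetical stand-ins for the parser's field model.
      enum DataType { STRING, VARCHAR }
      static class Field {
        String name;
        DataType dataType = DataType.STRING;
      }

      // Sketch: switch the extracted long-string columns from STRING to VARCHAR.
      static void remapToVarchar(List<Field> fields, Set<String> varcharColumns) {
        for (Field field : fields) {
          if (varcharColumns.contains(field.name.toLowerCase())) {
            field.dataType = DataType.VARCHAR;
          }
        }
      }
    }

---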
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196635615

--- Diff: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ---
@@ -1601,6 +1602,8 @@
  // As Short data type is used for storing the length of a column during data processing hence
  // the maximum characters that can be supported should be less than Short max value
  public static final int MAX_CHARS_PER_COLUMN_DEFAULT = 32000;
+ // todo: use infinity first, will switch later
+ public static final int MAX_CHARS_PER_COLUMN_INFINITY = -1;
--- End diff --

As I mentioned in another PR, better not to introduce this limit. -1 means that the parser can parse an unlimited number of characters.
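A short sketch of how the sentinel might be consumed when validating a parsed field; the helper below is hypothetical:

    class CharLimitSketch {
      static final int MAX_CHARS_PER_COLUMN_INFINITY = -1;

      // Sketch: -1 disables the per-column character limit entirely.
      static boolean withinCharLimit(int fieldLength, int maxCharsPerColumn) {
        if (maxCharsPerColumn == MAX_CHARS_PER_COLUMN_INFINITY) {
          return true; // the parser accepts any number of characters
        }
        return fieldLength <= maxCharsPerColumn;
      }
    }

---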
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196636059

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/VarLengthColumnPageBase.java ---
@@ -289,6 +289,12 @@ public void putDouble(int rowId, double value) {
  @Override
  public void putBytes(int rowId, byte[] bytes) {
+    // rowId * 4 represents the length of L in LV
+    if (bytes.length > (Integer.MAX_VALUE - totalLength - rowId * 4)) {
--- End diff --

I have come across a new idea: during parsing/converting we can calculate #numberOfRowsPerPage * #currentCharacterLength, and if it is larger than 2GB the data load will fail. Notice that #numberOfRowsPerPage is specified by the user through configuration. If this is OK, I'll implement it in a future PR, not this one.
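To unpack the guard in the diff above: in an LV (length-value) layout each row stores a 4-byte length prefix plus its value bytes, so a page that already holds rowId rows and totalLength value bytes must stay under Integer.MAX_VALUE (about 2GB). A self-contained sketch; the exception message is an assumption:

    // Sketch: reject a value that would push the LV page past the 2GB limit.
    class LvPageSketch {
      private int totalLength; // value bytes buffered so far

      void putBytes(int rowId, byte[] bytes) {
        // rowId * 4 = bytes already spent on 4-byte L prefixes
        if (bytes.length > Integer.MAX_VALUE - totalLength - rowId * 4) {
          throw new IllegalStateException("column page would exceed the 2GB limit");
        }
        totalLength += bytes.length;
        // ... append the 4-byte length and the value bytes to the page ...
      }
    }

---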
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196637400

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/statistics/LVLongStringStatsCollector.java ---
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.core.datastore.page.statistics;
+
+/**
+ * This class is for the columns with varchar data type,
+ * a string type which can hold more than 32000 characters
+ */
+public class LVLongStringStatsCollector extends LVStringStatsCollector {
+
+  public static LVLongStringStatsCollector newInstance() {
+    return new LVLongStringStatsCollector();
+  }
+
+  private LVLongStringStatsCollector() {
+
+  }
+
+  @Override
+  protected byte[] getActualValue(byte[] value) {
+    byte[] actualValue;
+    assert (value.length >= 4);
+    if (value.length == 4) {
+      assert (value[0] == 0 && value[1] == 0);
+      actualValue = new byte[0];
+    } else {
+      // todo: what does this mean?
+      // int length = (value[0] << 8) + (value[1] & 0xff);
--- End diff --

Yeah, I found a more readable way to fix it.
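One more readable alternative to the commented-out shift arithmetic: read the 4-byte big-endian length prefix with ByteBuffer and copy the remaining value bytes. This is a sketch of the idea, not necessarily the exact fix pushed to the PR:

    import java.nio.ByteBuffer;
    import java.util.Arrays;

    class LvValueSketch {
      // Sketch: decode an LV-encoded byte[] whose first 4 bytes hold the
      // big-endian length of the value that follows.
      static byte[] actualValue(byte[] lvEncoded) {
        assert lvEncoded.length >= 4;
        int length = ByteBuffer.wrap(lvEncoded, 0, 4).getInt();
        assert length == lvEncoded.length - 4; // prefix must match the payload
        return Arrays.copyOfRange(lvEncoded, 4, lvEncoded.length);
      }
    }

---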
Github user xuchuanyin commented on the issue:
https://github.com/apache/carbondata/pull/2379

Rebased onto the latest master branch. The second commit fixes the review comments.

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2379

Build Failed with Spark 2.2.1, Please check CI http://88.99.58.216:8080/job/ApacheCarbonPRBuilder/5230/

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2379

Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder1/6396/

---
Github user ravipesala commented on the issue:
https://github.com/apache/carbondata/pull/2379

SDV Build Success, Please check CI http://144.76.159.231:8080/job/ApacheSDVTests/5342/

---
Github user kumarvishal09 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2379#discussion_r196654276

--- Diff: core/src/main/java/org/apache/carbondata/core/datastore/page/encoding/EncodingFactory.java ---
@@ -71,7 +72,7 @@ public ColumnPageDecoder createDecoder(List<Encoding> encodings, List<ByteBuffer>
      byte[] encoderMeta = encoderMetas.get(0).array();
      ByteArrayInputStream stream = new ByteArrayInputStream(encoderMeta);
      DataInputStream in = new DataInputStream(stream);
-     if (encoding == DIRECT_COMPRESS) {
+     if (encoding == DIRECT_COMPRESS || encoding == DIRECT_COMPRESS_VARCHAR) {
--- End diff --

@xuchuanyin I think in your last PR it was VARCHAR...

---
Github user xuchuanyin commented on the issue:
https://github.com/apache/carbondata/pull/2379

retest this please

---
Github user xuchuanyin commented on the issue:
https://github.com/apache/carbondata/pull/2379

retest it please

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2379

Build Success with Spark 2.2.1, Please check CI http://88.99.58.216:8080/job/ApacheCarbonPRBuilder/5239/

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2379

Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder1/6405/

---
Github user kumarvishal09 commented on the issue:
https://github.com/apache/carbondata/pull/2379

@xuchuanyin It is difficult for the user to configure the number of rows in a page, since it depends on the data. It is better to handle this internally: based on the size of the data, divide the page into multiple pages in the Page Producer if it crosses 2GB. Please raise a JIRA and handle this in a different PR. Complex types are also not handled; please raise a JIRA for that as well. I will merge this PR.
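A sketch of that internal handling, with hypothetical names: rather than relying on a user-configured row count, the producer closes the current page once the next value would cross the size limit.

    // Sketch: size-based page splitting in the producer (names are illustrative).
    class PageProducerSketch {
      private static final long PAGE_SIZE_LIMIT = Integer.MAX_VALUE; // ~2GB
      private long currentPageBytes;

      void addRow(byte[] value) {
        long lvSize = 4L + value.length; // 4-byte length prefix + value bytes
        if (currentPageBytes + lvSize > PAGE_SIZE_LIMIT) {
          flushCurrentPage();   // hand the finished page downstream
          currentPageBytes = 0; // start a fresh page for this row
        }
        appendToCurrentPage(value);
        currentPageBytes += lvSize;
      }

      private void flushCurrentPage() { /* pass the page to the consumer */ }
      private void appendToCurrentPage(byte[] value) { /* buffer the LV bytes */ }
    }

---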
Github user kumarvishal09 commented on the issue:
https://github.com/apache/carbondata/pull/2379

LGTM

---