Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228202864

--- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/CarbonRecordReader.java ---

@@ -116,6 +116,20 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext context)
     return readSupport.readRow(carbonIterator.next());
   }

+  /**
+   * get batch result
+   *
+   * @param batch batch size
+   * @return rows
+   */
+  public List<Object[]> getBatchValue(int batch) {
+    rowCount += batch;

--- End diff --

We may not get a complete batch of data from RowBatch.nextBatch(); it can return fewer rows. So incrementing rowCount by the batch size is wrong; we need to increment it by the number of rows actually returned from the batch iterator.

---
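For clarity, a minimal Java sketch of the fix being requested (rowBatch is an assumed accessor standing in for whatever exposes RowBatch.nextBatch(int); only getBatchValue and rowCount come from the diff above):

    // Sketch: increment rowCount by the number of rows actually returned,
    // not by the requested batch size, since nextBatch(batch) may return fewer.
    public List<Object[]> getBatchValue(int batch) {
      List<Object[]> rows = rowBatch.nextBatch(batch);  // assumed accessor
      rowCount += rows.size();  // count only what was really returned
      return rows;
    }

---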
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228206123

--- Diff: store/CSDK/CarbonReader.cpp ---

@@ -74,27 +75,41 @@ jobject CarbonReader::withHadoopConf(char *key, char *value) {
     return carbonReaderBuilderObject;
 }

+jobject CarbonReader::withBatch(int batch) {
+    jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
+    jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "withBatch",

--- End diff --

Need to add validation: GetObjectClass and GetMethodID can return null. We also need to propagate the Java exception to C++ for CallObjectMethodA.

---
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228209499

--- Diff: store/CSDK/CarbonReader.cpp ---

@@ -74,27 +75,41 @@ jobject CarbonReader::withHadoopConf(char *key, char *value) {
     return carbonReaderBuilderObject;
 }

+jobject CarbonReader::withBatch(int batch) {
+    jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
+    jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "withBatch",
+        "(I)Lorg/apache/carbondata/sdk/file/CarbonReaderBuilder;");
+
+    jvalue args[1];
+    args[0].i = batch;
+    carbonReaderBuilderObject = jniEnv->CallObjectMethodA(carbonReaderBuilderObject, buildID, args);
+    return carbonReaderBuilderObject;
+}
+
 jobject CarbonReader::build() {
     jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
     jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "build",
         "()Lorg/apache/carbondata/sdk/file/CarbonReader;");
     carbonReaderObject = jniEnv->CallObjectMethod(carbonReaderBuilderObject, buildID);
+    carbonReader = jniEnv->GetObjectClass(carbonReaderObject);
+    hasNextID = jniEnv->GetMethodID(carbonReader, "hasNext", "()Z");

--- End diff --

This initialization can be moved down into the corresponding functions: check whether the cached value is null (or 0) and fill it only then, so that we fill it just once.

---
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228213436

--- Diff: store/CSDK/main.cpp ---

@@ -99,21 +102,187 @@ bool readFromLocalWithoutProjection(JNIEnv *env) {
         printf("%s\t", carbonRow.getDecimal(10));
         printf("%f\t", carbonRow.getFloat(11));
         printf("\n");
+        env->DeleteLocalRef(row);
+        env->DeleteLocalRef(array1);
     }
     carbonReaderClass.close();
 }

+/**
+ * test next Row Performance
+ *
+ * @param env jni env
+ * @return
+ */
+bool testNextRowPerformance(JNIEnv *env, char *path, int printNum, char *argv[], int argc) {

--- End diff --

Same comment: this is an internal performance comparison; no need to add it to the example test cases.

---
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228214955

--- Diff: store/CSDK/main.cpp ---

@@ -224,8 +406,81 @@ bool readFromS3(JNIEnv *env, char *argv[]) {
         printf("%s\t", carbonRow.getDecimal(10));
         printf("%f\t", carbonRow.getFloat(11));
         printf("\n");
+        env->DeleteLocalRef(row);
+        env->DeleteLocalRef(array1);
+    }
+    gettimeofday(&read, NULL);
+    time = 1000000 * (read.tv_sec - start.tv_sec) + read.tv_usec - start.tv_usec;
+    printf("total lines is %d: build time: %lf, read time is %lf s, average speed is %lf records/s\n",
+        i, buildTime, time / 1000000.0, i / (time / 1000000.0));
+
+    reader.close();
+}
+
+/**
+ * read data from S3
+ * parameter is ak sk endpoint
+ *
+ * @param env jni env
+ * @param argv argument vector
+ * @return
+ */
+bool readPerformanceFromS3(JNIEnv *env, char *argv[]) {

--- End diff --

None of the internal performance-comparison test cases are needed.

---
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228220014

--- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---

@@ -1723,4 +1723,93 @@ public void testReadNextCarbonRowWithProjection() {
     }
   }

+  @Test
+  public void testReadNextBatchRow() {
+    String path = "./carbondata";
+    try {
+      FileUtils.deleteDirectory(new File(path));
+
+      Field[] fields = new Field[12];
+      fields[0] = new Field("stringField", DataTypes.STRING);
+      fields[1] = new Field("shortField", DataTypes.SHORT);
+      fields[2] = new Field("intField", DataTypes.INT);
+      fields[3] = new Field("longField", DataTypes.LONG);
+      fields[4] = new Field("doubleField", DataTypes.DOUBLE);
+      fields[5] = new Field("boolField", DataTypes.BOOLEAN);
+      fields[6] = new Field("dateField", DataTypes.DATE);
+      fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
+      fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
+      fields[9] = new Field("varcharField", DataTypes.VARCHAR);
+      fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
+      fields[11] = new Field("floatField", DataTypes.FLOAT);
+      Map<String, String> map = new HashMap<>();
+      map.put("complex_delimiter_level_1", "#");
+      CarbonWriter writer = CarbonWriter.builder()
+          .outputPath(path)
+          .withLoadOptions(map)
+          .withCsvInput(new Schema(fields)).build();
+
+      for (int i = 0; i < 10; i++) {
+        String[] row2 = new String[]{
+            "robot" + (i % 10),
+            String.valueOf(i % 10000),
+            String.valueOf(i),
+            String.valueOf(Long.MAX_VALUE - i),
+            String.valueOf((double) i / 2),
+            String.valueOf(true),
+            "2019-03-02",
+            "2019-02-12 03:03:34",
+            "12.345",
+            "varchar",
+            "Hello#World#From#Carbon",
+            "1.23"
+        };
+        writer.write(row2);
+      }
+      writer.close();
+
+      // Read data
+      CarbonReader reader = CarbonReader
+          .builder(path, "_temp")

--- End diff --

Shouldn't we set the batch size via withBatch(int) and test it?

---
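For illustration, a hedged sketch of such a test (using withBatch(3) — an illustrative value — and the parameterless readNextBatchRow() agreed later in this thread):

    // Sketch: set an explicit batch size, then verify each call returns
    // at most that many rows; 10 rows were written above.
    CarbonReader reader = CarbonReader
        .builder(path, "_temp")
        .withBatch(3)  // illustrative batch size
        .build();

    int total = 0;
    while (reader.hasNext()) {
      Object[] batch = reader.readNextBatchRow();
      assert (batch.length <= 3);  // the final batch may be smaller
      total += batch.length;
    }
    assert (total == 10);
    reader.close();

---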
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228221442

--- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---

@@ -90,6 +91,18 @@ public T readNextRow() throws IOException, InterruptedException {
     return currentReader.getCurrentValue();
   }

+  /**
+   * Read and return next batch row objects
+   */
+  public Object[] readNextBatchRow(int batch) throws Exception {

--- End diff --

Why do we need to pass the batch size here? It should return a batch of the same size as set via withBatch(int).

---
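A minimal sketch of the API shape being suggested, assuming build() captures the value from withBatch(int) into a field (batchSize is an assumed name, not the actual SDK code):

    /**
     * Read and return the next batch of rows; the batch size is the one
     * configured via CarbonReaderBuilder.withBatch(int), not a per-call argument.
     */
    public Object[] readNextBatchRow() throws Exception {
      // batchSize is an assumed field populated at build() time
      List<Object[]> rows =
          ((CarbonRecordReader) currentReader).getBatchValue(batchSize);
      return rows.toArray();
    }

---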
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228386483

--- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---

@@ -1616,7 +1616,6 @@ public boolean accept(File dir, String name) {
     assertEquals(RowUtil.getDouble(data, 4), ((double) i) / 2);
     assert (RowUtil.getBoolean(data, 5));
     assertEquals(RowUtil.getInt(data, 6), 17957);
-    assertEquals(RowUtil.getLong(data, 7), 1549911814000000L);

--- End diff --

The value differs between the local machine and the CI machine.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228390686

--- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/RowBatch.java ---

@@ -100,4 +100,24 @@ public int getSize() {
     counter++;
     return row;
   }
+
+  /**
+   * read next batch
+   *
+   * @param batch batch size
+   * @return rows
+   */
+  public List<Object[]> nextBatch(int batch) {
+    if (!hasNext()) {
+      throw new NoSuchElementException();
+    }
+    List<Object[]> row;
+    if (counter + batch > rows.size()) {
+      row = rows.subList(counter, rows.size());
+    } else {
+      row = rows.subList(counter, counter + batch);
+    }
+    counter = counter + batch;

--- End diff --

OK, optimized.

---
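The acknowledged optimization presumably caps the counter at rows.size(); a sketch of one way to write it, using the same fields as the diff above:

    // Sketch: never advance the counter past rows.size(), so a short
    // final batch leaves the iterator state consistent.
    public List<Object[]> nextBatch(int batch) {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }
      int end = Math.min(counter + batch, rows.size());
      List<Object[]> row = rows.subList(counter, end);
      counter = end;  // advance by the rows actually handed out
      return row;
    }

---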
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228391972

--- Diff: examples/spark2/src/main/java/org/apache/carbondata/benchmark/SDKReaderBenchmark.java ---

@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.benchmark;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.sql.Timestamp;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException;
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.util.CarbonProperties;
+import org.apache.carbondata.sdk.file.*;
+
+/**
+ * Test SDK read performance
+ */
+public class SDKReaderBenchmark {

--- End diff --

We need to know the SDK reader performance, and then we can measure the JNI performance loss by comparing it with the CSDK performance. And @KanakaKumar said we should test the SDK reader performance in 1.5.1. The examples module currently has two benchmarks, ConcurrentQueryBenchmark and SimpleQueryBenchmark. If some tests are missing, or more depth (IO time, pages scanned, and so on) or granularity is needed, we can add it.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228392301

--- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/CarbonRecordReader.java ---

@@ -116,6 +116,20 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext context)
     return readSupport.readRow(carbonIterator.next());
   }

+  /**
+   * get batch result
+   *
+   * @param batch batch size
+   * @return rows
+   */
+  public List<Object[]> getBatchValue(int batch) {
+    rowCount += batch;

--- End diff --

OK, optimized.

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2816

Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1036/

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2816

Build Success with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9301/

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2816

Build Success with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1249/

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228457234

--- Diff: store/CSDK/CarbonReader.cpp ---

@@ -74,27 +75,41 @@ jobject CarbonReader::withHadoopConf(char *key, char *value) {
     return carbonReaderBuilderObject;
 }

+jobject CarbonReader::withBatch(int batch) {
+    jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
+    jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "withBatch",

--- End diff --

GetObjectClass shouldn't return null; we should check whether carbonReaderBuilderObject is null instead. GetMethodID is now checked. CallObjectMethodA of withBatch won't throw an exception.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228458636

--- Diff: store/CSDK/CarbonReader.cpp ---

@@ -74,27 +75,41 @@ jobject CarbonReader::withHadoopConf(char *key, char *value) {
     return carbonReaderBuilderObject;
 }

+jobject CarbonReader::withBatch(int batch) {
+    jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
+    jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "withBatch",
+        "(I)Lorg/apache/carbondata/sdk/file/CarbonReaderBuilder;");
+
+    jvalue args[1];
+    args[0].i = batch;
+    carbonReaderBuilderObject = jniEnv->CallObjectMethodA(carbonReaderBuilderObject, buildID, args);
+    return carbonReaderBuilderObject;
+}
+
 jobject CarbonReader::build() {
     jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
     jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "build",
         "()Lorg/apache/carbondata/sdk/file/CarbonReader;");
     carbonReaderObject = jniEnv->CallObjectMethod(carbonReaderBuilderObject, buildID);
+    carbonReader = jniEnv->GetObjectClass(carbonReaderObject);
+    hasNextID = jniEnv->GetMethodID(carbonReader, "hasNext", "()Z");

--- End diff --

This has been done in PR #2792.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228459548

--- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---

@@ -1723,4 +1723,93 @@ public void testReadNextCarbonRowWithProjection() {
     }
   }

+  @Test
+  public void testReadNextBatchRow() {
+    String path = "./carbondata";
+    try {
+      FileUtils.deleteDirectory(new File(path));
+
+      Field[] fields = new Field[12];
+      fields[0] = new Field("stringField", DataTypes.STRING);
+      fields[1] = new Field("shortField", DataTypes.SHORT);
+      fields[2] = new Field("intField", DataTypes.INT);
+      fields[3] = new Field("longField", DataTypes.LONG);
+      fields[4] = new Field("doubleField", DataTypes.DOUBLE);
+      fields[5] = new Field("boolField", DataTypes.BOOLEAN);
+      fields[6] = new Field("dateField", DataTypes.DATE);
+      fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
+      fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
+      fields[9] = new Field("varcharField", DataTypes.VARCHAR);
+      fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
+      fields[11] = new Field("floatField", DataTypes.FLOAT);
+      Map<String, String> map = new HashMap<>();
+      map.put("complex_delimiter_level_1", "#");
+      CarbonWriter writer = CarbonWriter.builder()
+          .outputPath(path)
+          .withLoadOptions(map)
+          .withCsvInput(new Schema(fields)).build();
+
+      for (int i = 0; i < 10; i++) {
+        String[] row2 = new String[]{
+            "robot" + (i % 10),
+            String.valueOf(i % 10000),
+            String.valueOf(i),
+            String.valueOf(Long.MAX_VALUE - i),
+            String.valueOf((double) i / 2),
+            String.valueOf(true),
+            "2019-03-02",
+            "2019-02-12 03:03:34",
+            "12.345",
+            "varchar",
+            "Hello#World#From#Carbon",
+            "1.23"
+        };
+        writer.write(row2);
+      }
+      writer.close();
+
+      // Read data
+      CarbonReader reader = CarbonReader
+          .builder(path, "_temp")

--- End diff --

OK, done.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228461385

--- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---

@@ -90,6 +91,18 @@ public T readNextRow() throws IOException, InterruptedException {
     return currentReader.getCurrentValue();
   }

+  /**
+   * Read and return next batch row objects
+   */
+  public Object[] readNextBatchRow(int batch) throws Exception {

--- End diff --

OK, optimized: removed the batch parameter from readNextBatchRow and kept it in withBatch.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228469192

--- Diff: store/CSDK/main.cpp ---

@@ -224,8 +406,81 @@ bool readFromS3(JNIEnv *env, char *argv[]) {
         printf("%s\t", carbonRow.getDecimal(10));
        printf("%f\t", carbonRow.getFloat(11));
         printf("\n");
+        env->DeleteLocalRef(row);
+        env->DeleteLocalRef(array1);
+    }
+    gettimeofday(&read, NULL);
+    time = 1000000 * (read.tv_sec - start.tv_sec) + read.tv_usec - start.tv_usec;
+    printf("total lines is %d: build time: %lf, read time is %lf s, average speed is %lf records/s\n",
+        i, buildTime, time / 1000000.0, i / (time / 1000000.0));
+
+    reader.close();
+}
+
+/**
+ * read data from S3
+ * parameter is ak sk endpoint
+ *
+ * @param env jni env
+ * @param argv argument vector
+ * @return
+ */
+bool readPerformanceFromS3(JNIEnv *env, char *argv[]) {

--- End diff --

I want to test with big data (more than 100 million rows). I have changed the test case name; is that OK?

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228469216

--- Diff: store/CSDK/main.cpp ---

@@ -99,21 +102,187 @@ bool readFromLocalWithoutProjection(JNIEnv *env) {
         printf("%s\t", carbonRow.getDecimal(10));
         printf("%f\t", carbonRow.getFloat(11));
         printf("\n");
+        env->DeleteLocalRef(row);
+        env->DeleteLocalRef(array1);
     }
     carbonReaderClass.close();
 }

+/**
+ * test next Row Performance
+ *
+ * @param env jni env
+ * @return
+ */
+bool testNextRowPerformance(JNIEnv *env, char *path, int printNum, char *argv[], int argc) {

--- End diff --

I want to test with big data (more than 100 million rows). I have changed the test case name; is that OK?

---