Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228202864

--- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/CarbonRecordReader.java ---

@@ -116,6 +116,20 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext context)
     return readSupport.readRow(carbonIterator.next());
   }

+  /**
+   * get batch result
+   *
+   * @param batch batch size
+   * @return rows
+   */
+  public List<Object[]> getBatchValue(int batch) {
+    rowCount += batch;

--- End diff --

We may not get a complete batch of data from RowBatch.nextBatch(); it can return fewer rows. So incrementing rowCount by the batch size is wrong; we need to increment it by the number of rows actually returned from the batch iterator.

---
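For clarity, a minimal Java sketch of the fix being requested (rowBatch is an assumed accessor standing in for whatever exposes RowBatch.nextBatch(int); only getBatchValue and rowCount come from the diff above):

    // Sketch: increment rowCount by the number of rows actually returned,
    // not by the requested batch size, since nextBatch(batch) may return fewer.
    public List<Object[]> getBatchValue(int batch) {
      List<Object[]> rows = rowBatch.nextBatch(batch);  // assumed accessor
      rowCount += rows.size();  // count only what was really returned
      return rows;
    }

---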
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228206123

--- Diff: store/CSDK/CarbonReader.cpp ---

@@ -74,27 +75,41 @@ jobject CarbonReader::withHadoopConf(char *key, char *value) {
     return carbonReaderBuilderObject;
 }

+jobject CarbonReader::withBatch(int batch) {
+    jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
+    jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "withBatch",

--- End diff --

Need to add validation: GetObjectClass and GetMethodID can return null. We also need to propagate the Java exception to C++ for CallObjectMethodA.

---
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228209499

--- Diff: store/CSDK/CarbonReader.cpp ---

@@ -74,27 +75,41 @@ jobject CarbonReader::withHadoopConf(char *key, char *value) {
     return carbonReaderBuilderObject;
 }

+jobject CarbonReader::withBatch(int batch) {
+    jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
+    jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "withBatch",
+        "(I)Lorg/apache/carbondata/sdk/file/CarbonReaderBuilder;");
+
+    jvalue args[1];
+    args[0].i = batch;
+    carbonReaderBuilderObject = jniEnv->CallObjectMethodA(carbonReaderBuilderObject, buildID, args);
+    return carbonReaderBuilderObject;
+}
+
 jobject CarbonReader::build() {
     jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
     jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "build",
         "()Lorg/apache/carbondata/sdk/file/CarbonReader;");
     carbonReaderObject = jniEnv->CallObjectMethod(carbonReaderBuilderObject, buildID);
+    carbonReader = jniEnv->GetObjectClass(carbonReaderObject);
+    hasNextID = jniEnv->GetMethodID(carbonReader, "hasNext", "()Z");

--- End diff --

This initialization can be moved down into the corresponding functions: check whether the cached value is null (or 0) and fill it only then, so that we fill it just once.

---
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228213436

--- Diff: store/CSDK/main.cpp ---

@@ -99,21 +102,187 @@ bool readFromLocalWithoutProjection(JNIEnv *env) {
         printf("%s\t", carbonRow.getDecimal(10));
         printf("%f\t", carbonRow.getFloat(11));
         printf("\n");
+        env->DeleteLocalRef(row);
+        env->DeleteLocalRef(array1);
     }
     carbonReaderClass.close();
 }

+/**
+ * test next Row Performance
+ *
+ * @param env jni env
+ * @return
+ */
+bool testNextRowPerformance(JNIEnv *env, char *path, int printNum, char *argv[], int argc) {

--- End diff --

Same comment: this is an internal performance comparison; no need to add it to the example test cases.

---
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228214955

--- Diff: store/CSDK/main.cpp ---

@@ -224,8 +406,81 @@ bool readFromS3(JNIEnv *env, char *argv[]) {
         printf("%s\t", carbonRow.getDecimal(10));
         printf("%f\t", carbonRow.getFloat(11));
         printf("\n");
+        env->DeleteLocalRef(row);
+        env->DeleteLocalRef(array1);
+    }
+    gettimeofday(&read, NULL);
+    time = 1000000 * (read.tv_sec - start.tv_sec) + read.tv_usec - start.tv_usec;
+    printf("total lines is %d: build time: %lf, read time is %lf s, average speed is %lf records/s\n",
+        i, buildTime, time / 1000000.0, i / (time / 1000000.0));
+
+    reader.close();
+}
+
+/**
+ * read data from S3
+ * parameter is ak sk endpoint
+ *
+ * @param env jni env
+ * @param argv argument vector
+ * @return
+ */
+bool readPerformanceFromS3(JNIEnv *env, char *argv[]) {

--- End diff --

None of the internal performance-comparison test cases are needed.

---
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228220014

--- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---

@@ -1723,4 +1723,93 @@ public void testReadNextCarbonRowWithProjection() {
     }
   }

+  @Test
+  public void testReadNextBatchRow() {
+    String path = "./carbondata";
+    try {
+      FileUtils.deleteDirectory(new File(path));
+
+      Field[] fields = new Field[12];
+      fields[0] = new Field("stringField", DataTypes.STRING);
+      fields[1] = new Field("shortField", DataTypes.SHORT);
+      fields[2] = new Field("intField", DataTypes.INT);
+      fields[3] = new Field("longField", DataTypes.LONG);
+      fields[4] = new Field("doubleField", DataTypes.DOUBLE);
+      fields[5] = new Field("boolField", DataTypes.BOOLEAN);
+      fields[6] = new Field("dateField", DataTypes.DATE);
+      fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
+      fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
+      fields[9] = new Field("varcharField", DataTypes.VARCHAR);
+      fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
+      fields[11] = new Field("floatField", DataTypes.FLOAT);
+      Map<String, String> map = new HashMap<>();
+      map.put("complex_delimiter_level_1", "#");
+      CarbonWriter writer = CarbonWriter.builder()
+          .outputPath(path)
+          .withLoadOptions(map)
+          .withCsvInput(new Schema(fields)).build();
+
+      for (int i = 0; i < 10; i++) {
+        String[] row2 = new String[]{
+            "robot" + (i % 10),
+            String.valueOf(i % 10000),
+            String.valueOf(i),
+            String.valueOf(Long.MAX_VALUE - i),
+            String.valueOf((double) i / 2),
+            String.valueOf(true),
+            "2019-03-02",
+            "2019-02-12 03:03:34",
+            "12.345",
+            "varchar",
+            "Hello#World#From#Carbon",
+            "1.23"
+        };
+        writer.write(row2);
+      }
+      writer.close();
+
+      // Read data
+      CarbonReader reader = CarbonReader
+          .builder(path, "_temp")

--- End diff --

Shouldn't we set the batch size via withBatch(int) and test it?

---
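For illustration, a hedged sketch of such a test (using withBatch(3) — an illustrative value — and the parameterless readNextBatchRow() agreed later in this thread):

    // Sketch: set an explicit batch size, then verify each call returns
    // at most that many rows; 10 rows were written above.
    CarbonReader reader = CarbonReader
        .builder(path, "_temp")
        .withBatch(3)  // illustrative batch size
        .build();

    int total = 0;
    while (reader.hasNext()) {
      Object[] batch = reader.readNextBatchRow();
      assert (batch.length <= 3);  // the final batch may be smaller
      total += batch.length;
    }
    assert (total == 10);
    reader.close();

---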
Github user ajantha-bhat commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228221442

--- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---

@@ -90,6 +91,18 @@ public T readNextRow() throws IOException, InterruptedException {
     return currentReader.getCurrentValue();
   }

+  /**
+   * Read and return next batch row objects
+   */
+  public Object[] readNextBatchRow(int batch) throws Exception {

--- End diff --

Why do we need to pass the batch size here? It should return a batch of the same size as set via withBatch(int).

---
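A minimal sketch of the API shape being suggested, assuming build() captures the value from withBatch(int) into a field (batchSize is an assumed name, not the actual SDK code):

    /**
     * Read and return the next batch of rows; the batch size is the one
     * configured via CarbonReaderBuilder.withBatch(int), not a per-call argument.
     */
    public Object[] readNextBatchRow() throws Exception {
      // batchSize is an assumed field populated at build() time
      List<Object[]> rows =
          ((CarbonRecordReader) currentReader).getBatchValue(batchSize);
      return rows.toArray();
    }

---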
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228386483

--- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---

@@ -1616,7 +1616,6 @@ public boolean accept(File dir, String name) {
     assertEquals(RowUtil.getDouble(data, 4), ((double) i) / 2);
     assert (RowUtil.getBoolean(data, 5));
     assertEquals(RowUtil.getInt(data, 6), 17957);
-    assertEquals(RowUtil.getLong(data, 7), 1549911814000000L);

--- End diff --

The value differs between the local machine and the CI machine.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228390686

--- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/RowBatch.java ---

@@ -100,4 +100,24 @@ public int getSize() {
     counter++;
     return row;
   }
+
+  /**
+   * read next batch
+   *
+   * @param batch batch size
+   * @return rows
+   */
+  public List<Object[]> nextBatch(int batch) {
+    if (!hasNext()) {
+      throw new NoSuchElementException();
+    }
+    List<Object[]> row;
+    if (counter + batch > rows.size()) {
+      row = rows.subList(counter, rows.size());
+    } else {
+      row = rows.subList(counter, counter + batch);
+    }
+    counter = counter + batch;

--- End diff --

OK, optimized.

---
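The acknowledged optimization presumably caps the counter at rows.size(); a sketch of one way to write it, using the same fields as the diff above:

    // Sketch: never advance the counter past rows.size(), so a short
    // final batch leaves the iterator state consistent.
    public List<Object[]> nextBatch(int batch) {
      if (!hasNext()) {
        throw new NoSuchElementException();
      }
      int end = Math.min(counter + batch, rows.size());
      List<Object[]> row = rows.subList(counter, end);
      counter = end;  // advance by the rows actually handed out
      return row;
    }

---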
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228391972

--- Diff: examples/spark2/src/main/java/org/apache/carbondata/benchmark/SDKReaderBenchmark.java ---

@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.benchmark;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.sql.Timestamp;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException;
+import org.apache.carbondata.core.constants.CarbonCommonConstants;
+import org.apache.carbondata.core.util.CarbonProperties;
+import org.apache.carbondata.sdk.file.*;
+
+/**
+ * Test SDK read performance
+ */
+public class SDKReaderBenchmark {

--- End diff --

We need to know the SDK reader performance, and then we can measure the JNI performance loss by comparing it with the CSDK performance. And @KanakaKumar said we should test the SDK reader performance in 1.5.1. The examples module currently has two benchmarks, ConcurrentQueryBenchmark and SimpleQueryBenchmark. If some tests are missing, or more depth (IO time, pages scanned, and so on) or granularity is needed, we can add it.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228392301

--- Diff: hadoop/src/main/java/org/apache/carbondata/hadoop/CarbonRecordReader.java ---

@@ -116,6 +116,20 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext context)
     return readSupport.readRow(carbonIterator.next());
   }

+  /**
+   * get batch result
+   *
+   * @param batch batch size
+   * @return rows
+   */
+  public List<Object[]> getBatchValue(int batch) {
+    rowCount += batch;

--- End diff --

OK, optimized.

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2816

Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1036/

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2816

Build Success with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9301/

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2816

Build Success with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1249/

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228457234

--- Diff: store/CSDK/CarbonReader.cpp ---

@@ -74,27 +75,41 @@ jobject CarbonReader::withHadoopConf(char *key, char *value) {
     return carbonReaderBuilderObject;
 }

+jobject CarbonReader::withBatch(int batch) {
+    jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
+    jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "withBatch",

--- End diff --

GetObjectClass shouldn't return null; we should check whether carbonReaderBuilderObject is null instead. GetMethodID is now checked. CallObjectMethodA of withBatch won't throw an exception.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228458636

--- Diff: store/CSDK/CarbonReader.cpp ---

@@ -74,27 +75,41 @@ jobject CarbonReader::withHadoopConf(char *key, char *value) {
     return carbonReaderBuilderObject;
 }

+jobject CarbonReader::withBatch(int batch) {
+    jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
+    jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "withBatch",
+        "(I)Lorg/apache/carbondata/sdk/file/CarbonReaderBuilder;");
+
+    jvalue args[1];
+    args[0].i = batch;
+    carbonReaderBuilderObject = jniEnv->CallObjectMethodA(carbonReaderBuilderObject, buildID, args);
+    return carbonReaderBuilderObject;
+}
+
 jobject CarbonReader::build() {
     jclass carbonReaderBuilderClass = jniEnv->GetObjectClass(carbonReaderBuilderObject);
     jmethodID buildID = jniEnv->GetMethodID(carbonReaderBuilderClass, "build",
         "()Lorg/apache/carbondata/sdk/file/CarbonReader;");
     carbonReaderObject = jniEnv->CallObjectMethod(carbonReaderBuilderObject, buildID);
+    carbonReader = jniEnv->GetObjectClass(carbonReaderObject);
+    hasNextID = jniEnv->GetMethodID(carbonReader, "hasNext", "()Z");

--- End diff --

This has been done in PR #2792.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228459548

--- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---

@@ -1723,4 +1723,93 @@ public void testReadNextCarbonRowWithProjection() {
     }
   }

+  @Test
+  public void testReadNextBatchRow() {
+    String path = "./carbondata";
+    try {
+      FileUtils.deleteDirectory(new File(path));
+
+      Field[] fields = new Field[12];
+      fields[0] = new Field("stringField", DataTypes.STRING);
+      fields[1] = new Field("shortField", DataTypes.SHORT);
+      fields[2] = new Field("intField", DataTypes.INT);
+      fields[3] = new Field("longField", DataTypes.LONG);
+      fields[4] = new Field("doubleField", DataTypes.DOUBLE);
+      fields[5] = new Field("boolField", DataTypes.BOOLEAN);
+      fields[6] = new Field("dateField", DataTypes.DATE);
+      fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
+      fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
+      fields[9] = new Field("varcharField", DataTypes.VARCHAR);
+      fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
+      fields[11] = new Field("floatField", DataTypes.FLOAT);
+      Map<String, String> map = new HashMap<>();
+      map.put("complex_delimiter_level_1", "#");
+      CarbonWriter writer = CarbonWriter.builder()
+          .outputPath(path)
+          .withLoadOptions(map)
+          .withCsvInput(new Schema(fields)).build();
+
+      for (int i = 0; i < 10; i++) {
+        String[] row2 = new String[]{
+            "robot" + (i % 10),
+            String.valueOf(i % 10000),
+            String.valueOf(i),
+            String.valueOf(Long.MAX_VALUE - i),
+            String.valueOf((double) i / 2),
+            String.valueOf(true),
+            "2019-03-02",
+            "2019-02-12 03:03:34",
+            "12.345",
+            "varchar",
+            "Hello#World#From#Carbon",
+            "1.23"
+        };
+        writer.write(row2);
+      }
+      writer.close();
+
+      // Read data
+      CarbonReader reader = CarbonReader
+          .builder(path, "_temp")

--- End diff --

OK, done.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228461385

--- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---

@@ -90,6 +91,18 @@ public T readNextRow() throws IOException, InterruptedException {
     return currentReader.getCurrentValue();
   }

+  /**
+   * Read and return next batch row objects
+   */
+  public Object[] readNextBatchRow(int batch) throws Exception {

--- End diff --

OK, optimized: removed the batch parameter from readNextBatchRow and kept it in withBatch.

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228469192

--- Diff: store/CSDK/main.cpp ---

@@ -224,8 +406,81 @@ bool readFromS3(JNIEnv *env, char *argv[]) {
         printf("%s\t", carbonRow.getDecimal(10));
        printf("%f\t", carbonRow.getFloat(11));
         printf("\n");
+        env->DeleteLocalRef(row);
+        env->DeleteLocalRef(array1);
+    }
+    gettimeofday(&read, NULL);
+    time = 1000000 * (read.tv_sec - start.tv_sec) + read.tv_usec - start.tv_usec;
+    printf("total lines is %d: build time: %lf, read time is %lf s, average speed is %lf records/s\n",
+        i, buildTime, time / 1000000.0, i / (time / 1000000.0));
+
+    reader.close();
+}
+
+/**
+ * read data from S3
+ * parameter is ak sk endpoint
+ *
+ * @param env jni env
+ * @param argv argument vector
+ * @return
+ */
+bool readPerformanceFromS3(JNIEnv *env, char *argv[]) {

--- End diff --

I want to test with big data (more than 100 million rows). I have changed the test case name; is that OK?

---
Github user xubo245 commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2816#discussion_r228469216

--- Diff: store/CSDK/main.cpp ---

@@ -99,21 +102,187 @@ bool readFromLocalWithoutProjection(JNIEnv *env) {
         printf("%s\t", carbonRow.getDecimal(10));
         printf("%f\t", carbonRow.getFloat(11));
         printf("\n");
+        env->DeleteLocalRef(row);
+        env->DeleteLocalRef(array1);
     }
     carbonReaderClass.close();
 }

+/**
+ * test next Row Performance
+ *
+ * @param env jni env
+ * @return
+ */
+bool testNextRowPerformance(JNIEnv *env, char *path, int printNum, char *argv[], int argc) {

--- End diff --

I want to test with big data (more than 100 million rows). I have changed the test case name; is that OK?

---