[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CSD...


[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1041/



---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user ajantha-bhat commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r228477978
 
    --- Diff: examples/spark2/src/main/java/org/apache/carbondata/benchmark/SDKReaderExampleForBigData.java ---
    @@ -0,0 +1,262 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.carbondata.benchmark;
    +
    +import java.io.File;
    +import java.io.FilenameFilter;
    +import java.io.IOException;
    +import java.sql.Timestamp;
    +import java.util.HashMap;
    +import java.util.Map;
    +import java.util.Random;
    +
    +import org.apache.hadoop.conf.Configuration;
    +
    +import org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException;
    +import org.apache.carbondata.core.constants.CarbonCommonConstants;
    +import org.apache.carbondata.core.util.CarbonProperties;
    +import org.apache.carbondata.sdk.file.*;
    +
    +/**
    + * Test SDK read example for big data
    + */
    +public class SDKReaderExampleForBigData {
    +  public static void main(String[] args) throws InterruptedException, InvalidLoadOptionException, IOException {
    +    System.out.println("start to read data");
    +    String path = "../../../../Downloads/carbon-data-big";
    +    if (args.length > 0) {
    +      path = args[0];
    +    }
    +    double num = 1000000000.0;
    +    String originPath = "../../../../Downloads/carbon-data";
    +    String newPath = "../../../../Downloads/carbon-data-big";
    +    boolean writeNewData = false;
    +    if (writeNewData) {
    +      extendData(originPath, newPath);
    +    }
    +
    +    Configuration conf = new Configuration();
    +    if (args.length > 3) {
    +      conf.set("fs.s3a.access.key", args[1]);
    +      conf.set("fs.s3a.secret.key", args[2]);
    +      conf.set("fs.s3a.endpoint", args[3]);
    +    }
    +    readNextBatchRow(path, num, conf, 100000, 100000);
    +    readNextRow(path, num, conf, 100000);
    +  }
    +
    +  public static void readNextRow(String path, double num, Configuration conf, int printNum) {
    +    System.out.println("readNextRow");
    +    try {
    +      // Read data
    +      Long startTime = System.nanoTime();
    +      CarbonReader reader = CarbonReader
    +          .builder(path, "_temp")
    +          .withHadoopConf(conf)
    +          .build();
    +
    +      Long startReadTime = System.nanoTime();
    +      System.out.println("build time is " + (startReadTime - startTime) / num);
    +      int i = 0;
    +      while (reader.hasNext()) {
    +        Object[] data = (Object[]) reader.readNextRow();
    +        i++;
    +        if (i % printNum == 0) {
    +          Long point = System.nanoTime();
    +          System.out.print(i + ": time is " + (point - startReadTime) / num
    +              + " s, speed is " + (i / ((point - startReadTime) / num)) + " records/s \t");
    +          for (int j = 0; j < data.length; j++) {
    +            System.out.print(data[j] + "\t\t");
    +          }
    +          System.out.println();
    +        }
    +      }
    +      Long endReadTime = System.nanoTime();
    +      System.out.println("total lines is " + i + ", build time is " + (startReadTime - startTime) / num
    +          + " s, \ttotal read time is " + (endReadTime - startReadTime) / num
    +          + " s, \taverage speed is " + (i / ((endReadTime - startReadTime) / num))
    +          + " records/s.");
    +      reader.close();
    +    } catch (Throwable e) {
    +      e.printStackTrace();
    +    }
    +  }
    +
    +  /**
    +   * read next batch row
    +   *
    +   * @param path     data path
    +   * @param num      number for time
    +   * @param conf     configuration
    +   * @param batch    batch size
    +   * @param printNum print number for each batch
    +   */
    +  public static void readNextBatchRow(String path, double num, Configuration conf, int batch, int printNum) {
    +    System.out.println("readNextBatchRow");
    +    try {
    +      // Read data
    +      Long startTime = System.nanoTime();
    +      CarbonReader reader = CarbonReader
    +          .builder(path, "_temp")
    +          .withHadoopConf(conf)
    +          .withBatch(batch)
    +          .build();
    +
    +      Long startReadTime = System.nanoTime();
    +      Long startBatchReadTime = startReadTime;
    +      System.out.println("build time is " + (startBatchReadTime - startTime) / num);
    +      int i = 0;
    +      long startHasNext = System.nanoTime();
    +      while (reader.hasNext()) {
    +        Long endHasNext = System.nanoTime();
    +
    +        Object[] batchRow = reader.readNextBatchRow();
    +        for (int k = 0; k < batchRow.length; k++) {
    +          Object[] data = (Object[]) batchRow[k];
    +          i++;
    +          if (i > 0 && i % printNum == 0) {
    +            Long point = System.nanoTime();
    +            System.out.print(i + ": time is " + (point - startBatchReadTime) / num
    +                + " s, \tspeed is " + (printNum / ((point - startBatchReadTime) / num))
    +                + " records/s, \thasNext time is " + (endHasNext - startHasNext) / num + " s\t");
    +            for (int j = 0; j < data.length; j++) {
    +              System.out.print(data[j] + "\t\t");
    +            }
    +            System.out.println();
    +            startBatchReadTime = System.nanoTime();
    +          }
    +        }
    +        startHasNext = System.nanoTime();
    +      }
    +      Long endReadTime = System.nanoTime();
    +      System.out.println("total lines is " + i + ", build time is " + (startReadTime - startTime) / num
    +          + " s, \ttotal read time is " + (endReadTime - startReadTime) / num
    +          + " s, \taverage speed is " + (i / ((endReadTime - startReadTime) / num))
    +          + " records/s.");
    +      reader.close();
    +    } catch (Throwable e) {
    +      e.printStackTrace();
    +    }
    +  }
    +
    +  public static Schema readSchema(String path) throws IOException {
    +    File[] dataFiles = new File(path).listFiles(new FilenameFilter() {
    +      @Override
    +      public boolean accept(File dir, String name) {
    +        if (name == null) {
    +          return false;
    +        }
    +        return name.endsWith("carbondata");
    +      }
    +    });
    +    if (dataFiles == null || dataFiles.length < 1) {
    +      throw new RuntimeException("Carbon index file not exists.");
    +    }
    +    Schema schema = CarbonSchemaReader
    +        .readSchemaInDataFile(dataFiles[0].getAbsolutePath())
    +        .asOriginOrder();
    +    return schema;
    +  }
    +
    +  /**
    +   * extend data
    +   * read origin path data and generate new data in new path,
    +   * the new data is bigger than origin data
    +   *
    +   * @param originPath origin path of data
    +   * @param newPath    new path of data
    +   * @throws IOException
    +   * @throws InterruptedException
    +   * @throws InvalidLoadOptionException
    +   */
    +  public static void extendData(String originPath, String newPath)
    +      throws IOException, InterruptedException, InvalidLoadOptionException {
    --- End diff --
   
    We don't need this.
    1) If you want to benchmark read performance, just have a simple example that takes a path of files, reads them, and prints the time. Why is extendData required?
    2) Also, readSchema(originPath) below is called for writing carbon files; we should not write any files.

    Just keep an example that reads from a path and prints the time. That's all.
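
    For illustration, a minimal sketch of what such an example could look like (the class name and argument handling are placeholders; it uses only the CarbonReader calls already shown in this diff):

        import java.io.IOException;

        import org.apache.carbondata.sdk.file.CarbonReader;

        public class SimpleReadBenchmark {
          public static void main(String[] args) throws IOException, InterruptedException {
            String path = args[0];  // directory containing the carbondata files
            long start = System.nanoTime();
            CarbonReader reader = CarbonReader.builder(path, "_temp").build();
            long built = System.nanoTime();
            long rows = 0;
            while (reader.hasNext()) {
              reader.readNextRow();
              rows++;
            }
            reader.close();
            double readSeconds = (System.nanoTime() - built) / 1e9;
            System.out.println("read " + rows + " rows, build time "
                + (built - start) / 1e9 + " s, read time " + readSeconds
                + " s, speed " + (rows / readSeconds) + " records/s");
          }
        }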


---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user ajantha-bhat commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r228479559
 
    --- Diff: examples/spark2/src/main/java/org/apache/carbondata/benchmark/SDKReaderBenchmark.java ---
    @@ -0,0 +1,262 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.carbondata.benchmark;
    +
    +import java.io.File;
    +import java.io.FilenameFilter;
    +import java.io.IOException;
    +import java.sql.Timestamp;
    +import java.util.HashMap;
    +import java.util.Map;
    +import java.util.Random;
    +
    +import org.apache.hadoop.conf.Configuration;
    +
    +import org.apache.carbondata.common.exceptions.sql.InvalidLoadOptionException;
    +import org.apache.carbondata.core.constants.CarbonCommonConstants;
    +import org.apache.carbondata.core.util.CarbonProperties;
    +import org.apache.carbondata.sdk.file.*;
    +
    +/**
    + * Test SDK read performance
    + */
    +public class SDKReaderBenchmark {
    --- End diff --
   
    **You can have reader benchmarking code,**

    but it should just be reader code that takes a path and prints the rows and the time taken.
    This example, however, is extending data (writing data); that should not be there.
    Also, a separate S3 example is not required for everything: the current examples only set the conf and it works. So don't add multiple examples either.


---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user ajantha-bhat commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r228480901
 
    --- Diff: store/CSDK/main.cpp ---
    @@ -99,21 +102,187 @@ bool readFromLocalWithoutProjection(JNIEnv *env) {
             printf("%s\t", carbonRow.getDecimal(10));
             printf("%f\t", carbonRow.getFloat(11));
             printf("\n");
    +        env->DeleteLocalRef(row);
    +        env->DeleteLocalRef(array1);
         }
     
         carbonReaderClass.close();
     }
     
    +/**
    + * test next Row Performance
    + *
    + * @param env  jni env
    + * @return
    + */
    +bool testNextRowPerformance(JNIEnv *env, char *path, int printNum, char *argv[], int argc) {
    --- End diff --
   
    Example code must be independent of the data: it should work for small data as well as big data.
    Also, as you know, we cannot keep huge-data test cases in automation; the PR builder would take too long. So we avoid huge-data test cases. That is a DFX scenario.



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Failed with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1254/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Success with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9306/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1046/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1048/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Failed with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1262/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Success with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9314/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder2.1/1055/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Success with Spark 2.2.1, Please check CI http://95.216.28.178:8080/job/ApacheCarbonPRBuilder1/1269/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user CarbonDataQA commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    Build Success with Spark 2.3.1, Please check CI http://136.243.101.176:8080/job/carbondataprbuilder2.3/9321/



---

[GitHub] carbondata issue #2816: [CARBONDATA-3003] Support read batch row in CSDK

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user xubo245 commented on the issue:

    https://github.com/apache/carbondata/pull/2816
 
    @ajantha-bhat Please review again.


---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user ajantha-bhat commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r228926855
 
    --- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/RowBatch.java ---
    @@ -100,4 +100,25 @@ public int getSize() {
         counter++;
         return row;
       }
    +
    +  /**
    +   * read next batch
    +   *
    +   * @param batch batch size
    +   * @return rows
    +   */
    +  public List<Object[]> nextBatch(int batch) {
    +    if (!hasNext()) {
    +      throw new NoSuchElementException();
    +    }
    +    List<Object[]> row;
    +    if (counter + batch > rows.size()) {
    +      row = rows.subList(counter, rows.size());
    +      counter = counter + row.size();
    --- End diff --
   
    Isn't it counter = row.size()?

    Because we are copying rows.size() - counter rows of data, so it is like counter = counter + (row.size() - counter)?


---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user ajantha-bhat commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r228932392
 
    --- Diff: store/CSDK/test/main.cpp ---
    @@ -220,6 +393,86 @@ bool tryCatchException(JNIEnv *env) {
      */
     bool readFromS3(JNIEnv *env, char *argv[]) {
         printf("\nRead data from S3:\n");
    +    struct timeval start, build, read;
    --- End diff --
   
    As we discussed in the previous PR, separate S3 test cases are not required. The only thing we do in an S3 test case is call the withHadoopConf API; a user can call it if they need to run in an S3 environment.
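
    For reference, a sketch of how a user would do that with the Java SDK calls shown earlier in this thread (the s3a path, table name, and method name are placeholders):

        import org.apache.hadoop.conf.Configuration;

        import org.apache.carbondata.sdk.file.CarbonReader;

        public class S3ReadSketch {
          // Same builder as the local case; only the Hadoop conf differs.
          static CarbonReader buildS3Reader(String ak, String sk, String endpoint)
              throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.s3a.access.key", ak);
            conf.set("fs.s3a.secret.key", sk);
            conf.set("fs.s3a.endpoint", endpoint);
            return CarbonReader.builder("s3a://bucket/path", "_temp")
                .withHadoopConf(conf)
                .build();
          }
        }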


---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user ajantha-bhat commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r228932584
 
    --- Diff: store/CSDK/test/main.cpp ---
    @@ -220,6 +393,86 @@ bool tryCatchException(JNIEnv *env) {
      */
     bool readFromS3(JNIEnv *env, char *argv[]) {
         printf("\nRead data from S3:\n");
    +    struct timeval start, build, read;
    +    gettimeofday(&start, NULL);
    +
    +    CarbonReader reader;
    +
    +    char *args[3];
    +    // "your access key"
    +    args[0] = argv[1];
    +    // "your secret key"
    +    args[1] = argv[2];
    +    // "your endPoint"
    +    args[2] = argv[3];
    +
    +    reader.builder(env, "s3a://sdk/WriterOutput/carbondata", "test");
    +    reader.withHadoopConf("fs.s3a.access.key", argv[1]);
    +    reader.withHadoopConf("fs.s3a.secret.key", argv[2]);
    +    reader.withHadoopConf("fs.s3a.endpoint", argv[3]);
    +    reader.build();
    +
    +    gettimeofday(&build, NULL);
    +    int time = 1000000 * (build.tv_sec - start.tv_sec) + build.tv_usec - start.tv_usec;
    +    int buildTime = time / 1000000.0;
    +    printf("build time: %lf s\n", time / 1000000.0);
    +
    +    CarbonRow carbonRow(env);
    +    int i = 0;
    +    while (reader.hasNext()) {
    +        jobject row = reader.readNextRow();
    +        i++;
    +        carbonRow.setCarbonRow(row);
    +
    +        printf("%s\t", carbonRow.getString(0));
    +        printf("%d\t", carbonRow.getInt(1));
    +        printf("%ld\t", carbonRow.getLong(2));
    +        printf("%s\t", carbonRow.getVarchar(3));
    +        jobjectArray arr = carbonRow.getArray(4);
    +        jsize length = env->GetArrayLength(arr);
    +        int j = 0;
    +        for (j = 0; j < length; j++) {
    +            jobject element = env->GetObjectArrayElement(arr, j);
    +            char *str = (char *) env->GetStringUTFChars((jstring) element, JNI_FALSE);
    +            printf("%s\t", str);
    +        }
    +        env->DeleteLocalRef(arr);
    +        printf("%d\t", carbonRow.getShort(5));
    +        printf("%d\t", carbonRow.getInt(6));
    +        printf("%ld\t", carbonRow.getLong(7));
    +        printf("%lf\t", carbonRow.getDouble(8));
    +        bool bool1 = carbonRow.getBoolean(9);
    +        if (bool1) {
    +            printf("true\t");
    +        } else {
    +            printf("false\t");
    +        }
    +        printf("%s\t", carbonRow.getDecimal(10));
    +        printf("%f\t", carbonRow.getFloat(11));
    +        printf("\n");
    +        env->DeleteLocalRef(row);
    +    }
    +    gettimeofday(&read, NULL);
    +    time = 1000000 * (read.tv_sec - start.tv_sec) + read.tv_usec - start.tv_usec;
    +    printf("total lines is %d: build time: %lf, read time is %lf s, average speed is %lf records/s\n",
    +           i, buildTime, time / 1000000.0, i / (time / 1000000.0));
    +
    +    reader.close();
    +}
    +
    +/**
    + * read data from S3
    + * parameter is ak sk endpoint
    + *
    + * @param env jni env
    + * @param argv argument vector
    + * @return
    + */
    +bool readFromS3ForBigData(JNIEnv *env, char **argv) {
    --- End diff --
   
    Same as above.


---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user ajantha-bhat commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r228932987
 
    --- Diff: store/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonReader.java ---
    @@ -90,6 +93,20 @@ public T readNextRow() throws IOException, InterruptedException {
         return currentReader.getCurrentValue();
       }
     
    +  /**
    +   * Read and return next batch row objects
    +   */
    +  public Object[] readNextBatchRow() throws Exception {
    +    validateReader();
    +    int batch = Integer.parseInt(CarbonProperties.getInstance()
    --- End diff --
   
    What if this property is not set? We would get an NPE here.
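
    A null-safe variant would fall back to a default when the property is unset, for example (the property key and the default of 100 below are assumptions for illustration, not necessarily what the PR uses):

        import org.apache.carbondata.core.util.CarbonProperties;

        // Sketch: read the batch size with a fallback so parseInt never sees null.
        static int resolveBatchSize() {
          String value = CarbonProperties.getInstance()
              .getProperty("carbon.detail.batch.size", "100");  // assumed key and default
          return Integer.parseInt(value);
        }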


---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user ajantha-bhat commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r228935418
 
    --- Diff: store/sdk/src/test/java/org/apache/carbondata/sdk/file/CarbonReaderTest.java ---
    @@ -1737,4 +1739,95 @@ public void testReadNextRowWithProjectionAndRowUtil() {
         }
       }
     
    +  @Test
    +  public void testReadNextBatchRow() {
    +    String path = "./carbondata";
    +    try {
    +      FileUtils.deleteDirectory(new File(path));
    +
    +      Field[] fields = new Field[12];
    +      fields[0] = new Field("stringField", DataTypes.STRING);
    +      fields[1] = new Field("shortField", DataTypes.SHORT);
    +      fields[2] = new Field("intField", DataTypes.INT);
    +      fields[3] = new Field("longField", DataTypes.LONG);
    +      fields[4] = new Field("doubleField", DataTypes.DOUBLE);
    +      fields[5] = new Field("boolField", DataTypes.BOOLEAN);
    +      fields[6] = new Field("dateField", DataTypes.DATE);
    +      fields[7] = new Field("timeField", DataTypes.TIMESTAMP);
    +      fields[8] = new Field("decimalField", DataTypes.createDecimalType(8, 2));
    +      fields[9] = new Field("varcharField", DataTypes.VARCHAR);
    +      fields[10] = new Field("arrayField", DataTypes.createArrayType(DataTypes.STRING));
    +      fields[11] = new Field("floatField", DataTypes.FLOAT);
    +      Map<String, String> map = new HashMap<>();
    +      map.put("complex_delimiter_level_1", "#");
    +      CarbonWriter writer = CarbonWriter.builder()
    +          .outputPath(path)
    +          .withLoadOptions(map)
    +          .withCsvInput(new Schema(fields))
    +          .writtenBy("CarbonReaderTest")
    +          .build();
    +
    +      for (int i = 0; i < 10; i++) {
    +        String[] row2 = new String[]{
    +            "robot" + (i % 10),
    +            String.valueOf(i % 10000),
    +            String.valueOf(i),
    +            String.valueOf(Long.MAX_VALUE - i),
    +            String.valueOf((double) i / 2),
    +            String.valueOf(true),
    +            "2019-03-02",
    +            "2019-02-12 03:03:34",
    +            "12.345",
    +            "varchar",
    +            "Hello#World#From#Carbon",
    +            "1.23"
    +        };
    +        writer.write(row2);
    +      }
    +      writer.close();
    +
    +      // Read data
    +      CarbonReader reader = CarbonReader
    +          .builder(path, "_temp")
    +          .withBatch(3)
    +          .build();
    +
    +      int i = 0;
    +      while (reader.hasNext()) {
    +        Object[] batch = reader.readNextBatchRow();
    +
    +        for (int j = 0; j < batch.length; j++) {
    --- End diff --
   
    We need to validate that each batch size is the same as the one we set, or smaller (for the last batch). It must never be greater than the configured batch size.
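
    A sketch of that check (the helper name is hypothetical; it assumes a reader built with .withBatch(batchSize) as in the test above):

        import org.junit.Assert;

        import org.apache.carbondata.sdk.file.CarbonReader;

        // No batch may exceed the configured size, and the total row count must match.
        static void assertBatchSizes(CarbonReader reader, int batchSize, int expectedRows)
            throws Exception {
          int total = 0;
          while (reader.hasNext()) {
            Object[] batch = reader.readNextBatchRow();
            Assert.assertTrue("batch exceeds configured size", batch.length <= batchSize);
            total += batch.length;
          }
          Assert.assertEquals(expectedRows, total);
        }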


---

[GitHub] carbondata pull request #2816: [CARBONDATA-3003] Support read batch row in CS...

qiuchenjian-2
In reply to this post by qiuchenjian-2
Github user xubo245 commented on a diff in the pull request:

    https://github.com/apache/carbondata/pull/2816#discussion_r229155198
 
    --- Diff: core/src/main/java/org/apache/carbondata/core/scan/result/RowBatch.java ---
    @@ -100,4 +100,25 @@ public int getSize() {
         counter++;
         return row;
       }
    +
    +  /**
    +   * read next batch
    +   *
    +   * @param batch batch size
    +   * @return rows
    +   */
    +  public List<Object[]> nextBatch(int batch) {
    +    if (!hasNext()) {
    +      throw new NoSuchElementException();
    +    }
    +    List<Object[]> row;
    +    if (counter + batch > rows.size()) {
    +      row = rows.subList(counter, rows.size());
    +      counter = counter + row.size();
    --- End diff --
   
    counter != row.size() before the change from readNextBatchRow(batch) to readNextBatchRow(): previously, the batch passed to readNextBatchRow(batch) could differ from the one set by withBatch(batch). After the change they are the same, so counter = row.size().
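
    For concreteness, a small worked example of the tail-batch arithmetic with hypothetical numbers:

        import java.util.ArrayList;
        import java.util.List;

        public class NextBatchCounterDemo {
          public static void main(String[] args) {
            List<Object[]> rows = new ArrayList<>();
            for (int n = 0; n < 10; n++) {
              rows.add(new Object[]{n});  // rows.size() == 10
            }
            int counter = 8;  // 8 rows already consumed
            int batch = 5;    // counter + batch > rows.size(): tail-batch case
            List<Object[]> row = rows.subList(counter, rows.size());  // 2 rows left
            counter = counter + row.size();  // 8 + 2 == 10 == rows.size()
            System.out.println("row.size() = " + row.size() + ", counter = " + counter);
            // "counter = row.size()" would instead reset the cursor to 2
          }
        }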


---