[GitHub] [carbondata] VenuReddy2103 commented on a change in pull request #3819: [CARBONDATA-3855]support carbon SDK to load data from different files

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[GitHub] [carbondata] VenuReddy2103 commented on a change in pull request #3819: [CARBONDATA-3855]support carbon SDK to load data from different files

GitBox

VenuReddy2103 commented on a change in pull request #3819:
URL: https://github.com/apache/carbondata/pull/3819#discussion_r479962896



##########
File path: sdk/sdk/src/main/java/org/apache/carbondata/sdk/file/CarbonWriterBuilder.java
##########
@@ -594,6 +608,227 @@ public CarbonWriterBuilder withJsonInput(Schema carbonSchema) {
     return this;
   }
 
+  /**
+   * to build a {@link CarbonWriter}, which accepts loading CSV files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withCsvPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.withCsvInput();
+    this.dataFiles = this.extractDataFiles(CarbonCommonConstants.CSV_FILE_EXTENSION);
+    return this;
+  }
+
+  /**
+   * to build a {@link CarbonWriter}, which accepts CSV files directory and
+   * list of file which has to be loaded.
+   *
+   * @param filePath directory where the CSV file exists.
+   * @param fileList list of files which has to be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withCsvPath(String filePath, List<String> fileList)
+      throws IOException {
+    this.fileList = fileList;
+    this.withCsvPath(filePath);
+    return this;
+  }
+
+  /**
+   * to build a {@link CarbonWriter}, which accepts loading JSON files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withJsonPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.withJsonInput();
+    this.dataFiles = this.extractDataFiles(CarbonCommonConstants.JSON_FILE_EXTENSION);
+    return this;
+  }
+
+  /**
+   * to build a {@link CarbonWriter}, which accepts JSON file directory and
+   * list of file which has to be loaded.
+   *
+   * @param filePath directory where the json file exists.
+   * @param fileList list of files which has to be loaded.
+   * @return CarbonWriterBuilder
+   * @throws IOException
+   */
+  public CarbonWriterBuilder withJsonPath(String filePath, List<String> fileList)
+      throws IOException {
+    this.fileList = fileList;
+    this.withJsonPath(filePath);
+    return this;
+  }
+
+  private void validateFilePath(String filePath) {
+    if (StringUtils.isEmpty(filePath)) {
+      throw new IllegalArgumentException("filePath can not be empty");
+    }
+  }
+
+  /**
+   * to build a {@link CarbonWriter}, which accepts loading Parquet files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withParquetPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.writerType = WRITER_TYPE.PARQUET;
+    CarbonFile[] dataFiles = this.extractDataFiles(CarbonCommonConstants.PARQUET_FILE_EXT);
+    org.apache.avro.Schema parquetSchema = ParquetCarbonWriter
+        .extractParquetSchema(dataFiles[0], this.hadoopConf);
+    this.dataFiles = dataFiles;
+    this.avroSchema = parquetSchema;
+    this.schema = AvroCarbonWriter.getCarbonSchemaFromAvroSchema(this.avroSchema);
+    return this;
+  }
+
+  private void setIsDirectory(String filePath) {
+    if (this.hadoopConf == null) {
+      this.hadoopConf = new Configuration(FileFactory.getConfiguration());
+    }
+    CarbonFile carbonFile = FileFactory.getCarbonFile(filePath, hadoopConf);
+    this.isDirectory = carbonFile.isDirectory();
+  }
+
+  /**
+   * to build a {@link CarbonWriter}, which accepts parquet files directory and
+   * list of file which has to be loaded.
+   *
+   * @param filePath directory where the parquet file exists.
+   * @param fileList list of files which has to be loaded.
+   * @return CarbonWriterBuilder
+   * @throws IOException
+   */
+  public CarbonWriterBuilder withParquetPath(String filePath, List<String> fileList)
+      throws IOException {
+    this.fileList = fileList;
+    this.withParquetPath(filePath);
+    return this;
+  }
+
+  private CarbonFile[] extractDataFiles(String suf) {
+    List<CarbonFile> dataFiles;
+    if (this.isDirectory) {
+      if (CollectionUtils.isEmpty(this.fileList)) {
+        dataFiles = SDKUtil.extractFilesFromFolder(this.filePath, suf, this.hadoopConf);
+      } else {
+        dataFiles = this.appendFileListWithPath();
+      }
+    } else {
+      dataFiles = new ArrayList<>();
+      dataFiles.add(FileFactory.getCarbonFile(this.filePath, this.hadoopConf));
+    }
+    if (CollectionUtils.isEmpty(dataFiles)) {
+      throw new RuntimeException("Data files can't be empty.");
+    }
+    return dataFiles.toArray(new CarbonFile[0]);
+  }
+
+  /**
+   * to build a {@link CarbonWriter}, which accepts loading ORC files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withOrcPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.writerType = WRITER_TYPE.ORC;
+    Map<String, String> options = new HashMap<>();
+    options.put("complex_delimiter_level_1",
+        CarbonCommonConstants.COMPLEX_DELIMITERS_LEVEL_1_DEFAULT);
+    options.put("complex_delimiter_level_2",
+        CarbonCommonConstants.COMPLEX_DELIMITERS_LEVEL_2_DEFAULT);
+    options.put("complex_delimiter_level_3",
+        CarbonCommonConstants.COMPLEX_DELIMITERS_LEVEL_3_DEFAULT);
+    this.withLoadOptions(options);
+    CarbonFile[] dataFiles = this.extractDataFiles(CarbonCommonConstants.ORC_FILE_EXTENSION);
+    this.dataFiles = dataFiles;
+    this.schema = ORCCarbonWriter.extractOrcFileSchema(dataFiles[0], this.hadoopConf);
+    return this;
+  }
+
+  /**
+   * to build a {@link CarbonWriter}, which accepts orc files directory and
+   * list of file which has to be loaded.
+   *
+   * @param filePath directory where the orc file exists.
+   * @param fileList list of files which has to be loaded.
+   * @return CarbonWriterBuilder
+   * @throws IOException
+   */
+  public CarbonWriterBuilder withOrcPath(String filePath, List<String> fileList)
+      throws IOException {
+    this.fileList = fileList;
+    this.withOrcPath(filePath);
+    return this;
+  }
+
+  private List<CarbonFile> appendFileListWithPath() {
+    List<CarbonFile> dataFiles = new ArrayList<>();
+    for (String file : this.fileList) {
+      dataFiles.add(FileFactory.getCarbonFile(this.filePath +
+          CarbonCommonConstants.FILE_SEPARATOR + file, this.hadoopConf));
+    }
+    return dataFiles;
+  }
+
+  /**
+   * to build a {@link CarbonWriter}, which accepts loading AVRO files.
+   *
+   * @param filePath absolute path under which files should be loaded.
+   * @return CarbonWriterBuilder
+   */
+  public CarbonWriterBuilder withAvroPath(String filePath) throws IOException {
+    this.validateFilePath(filePath);
+    this.filePath = filePath;
+    this.setIsDirectory(filePath);
+    this.writerType = WRITER_TYPE.AVRO;
+    CarbonFile[] dataFiles = this.extractDataFiles(CarbonCommonConstants.AVRO_FILE_EXTENSION);
+    DataFileStream<GenericData.Record> avroReader = null;
+    try {
+      avroReader = AvroCarbonWriter.buildAvroReader(dataFiles[0], this.hadoopConf);

Review comment:
       Move this functionality into AvroCarbonWriter, so that it can just return the schema.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[hidden email]