    [ https://issues.apache.org/jira/browse/CARBONDATA-296?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15588762#comment-15588762 ]

ASF GitHub Bot commented on CARBONDATA-296:
-------------------------------------------

Github user QiangCai commented on a diff in the pull request:

    https://github.com/apache/incubator-carbondata/pull/233#discussion_r84068389

    --- Diff: hadoop/src/test/java/org/apache/carbondata/hadoop/csv/CSVInputFormatTest.java ---
    @@ -0,0 +1,171 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements. See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership. The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License. You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing,
    + * software distributed under the License is distributed on an
    + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    + * KIND, either express or implied. See the License for the
    + * specific language governing permissions and limitations
    + * under the License.
    + */
    +package org.apache.carbondata.hadoop.csv;
    +
    +import java.io.File;
    +import java.io.FileInputStream;
    +import java.io.FileOutputStream;
    +import java.io.IOException;
    +
    +import org.apache.carbondata.hadoop.io.StringArrayWritable;
    +
    +import junit.framework.TestCase;
    +import org.junit.Assert;
    +import org.junit.Test;
    +import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.fs.Path;
    +import org.apache.hadoop.io.NullWritable;
    +import org.apache.hadoop.io.compress.BZip2Codec;
    +import org.apache.hadoop.io.compress.CompressionOutputStream;
    +import org.apache.hadoop.io.compress.GzipCodec;
    +import org.apache.hadoop.io.compress.Lz4Codec;
    +import org.apache.hadoop.io.compress.SnappyCodec;
    +import org.apache.hadoop.mapreduce.Job;
    +import org.apache.hadoop.mapreduce.Mapper;
    +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    +
    +public class CSVInputFormatTest extends TestCase {
    +
    +  /**
    +   * generate compressed files, no need to call this method.
    +   * @throws Exception
    +   */
    +  public void testGenerateCompressFiles() throws Exception {
    +    String pwd = new File("src/test/resources").getCanonicalPath();
    +    String inputFile = pwd + "/data.csv";
    +    FileInputStream input = new FileInputStream(inputFile);
    +    Configuration conf = new Configuration();
    +
    +    // .gz
    +    String outputFile = pwd + "/data.csv.gz";
    +    FileOutputStream output = new FileOutputStream(outputFile);
    +    GzipCodec gzip = new GzipCodec();
    +    gzip.setConf(conf);
    +    CompressionOutputStream outputStream = gzip.createOutputStream(output);
    +    int i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    // .bz2
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.bz2";
    +    output = new FileOutputStream(outputFile);
    +    BZip2Codec bzip2 = new BZip2Codec();
    +    bzip2.setConf(conf);
    +    outputStream = bzip2.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    // .snappy
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.snappy";
    +    output = new FileOutputStream(outputFile);
    +    SnappyCodec snappy = new SnappyCodec();
    +    snappy.setConf(conf);
    +    outputStream = snappy.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    // .lz4
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.lz4";
    +    output = new FileOutputStream(outputFile);
    +    Lz4Codec lz4 = new Lz4Codec();
    +    lz4.setConf(conf);
    +    outputStream = lz4.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +  }
    +
    +  /**
    +   * CSVCheckMapper check the content of csv files.
    +   */
    +  public static class CSVCheckMapper extends Mapper<NullWritable, StringArrayWritable, NullWritable,
    +      NullWritable> {
    +    @Override
    +    protected void map(NullWritable key, StringArrayWritable value, Context context)
    +        throws IOException, InterruptedException {
    +      String[] columns = value.get();
    +      int id = Integer.parseInt(columns[0]);
    +      int salary = Integer.parseInt(columns[6]);
    +      Assert.assertEquals(id - 1, salary - 15000);
    --- End diff --

    I think it is enough to check two columns.
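The four codec sections in testGenerateCompressFiles above repeat the same byte-at-a-time copy loop. A minimal sketch of how they could be collapsed, assuming only the hadoop-common utilities IOUtils and ReflectionUtils already on the test classpath; the compressFile helper and its enclosing class name are hypothetical, not code from the PR:

    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionOutputStream;
    import org.apache.hadoop.util.ReflectionUtils;

    public final class CompressTestFiles {

      // Compresses inputFile to outputFile with the given codec class.
      // ReflectionUtils.newInstance injects the Configuration into codecs
      // that implement Configurable, replacing the explicit setConf calls.
      static void compressFile(Configuration conf, Class<? extends CompressionCodec> codecClass,
          String inputFile, String outputFile) throws IOException {
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        FileInputStream input = new FileInputStream(inputFile);
        CompressionOutputStream output = codec.createOutputStream(new FileOutputStream(outputFile));
        try {
          // Buffered copy instead of the byte-at-a-time loop; 'false' leaves
          // both streams open so the finally block stays responsible for them.
          IOUtils.copyBytes(input, output, 4096, false);
        } finally {
          IOUtils.closeStream(output);
          IOUtils.closeStream(input);
        }
      }
    }

Each section of the test method would then shrink to one call per codec, e.g. compressFile(conf, GzipCodec.class, inputFile, pwd + "/data.csv.gz").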
> 1. Add CSVInputFormat to read CSV files.
> ----------------------------------------
>
>                 Key: CARBONDATA-296
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-296
>             Project: CarbonData
>          Issue Type: Sub-task
>            Reporter: Ravindra Pesala
>            Assignee: QiangCai
>             Fix For: 0.2.0-incubating
>
>
> Add CSVInputFormat to read CSV files; it should use the Univocity parser for optimal performance.
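For the Univocity parser named in the description, a minimal sketch of the parsing API such an input format's record reader would sit on top of, assuming the univocity-parsers dependency is available; the sample rows and column layout here are illustrative only, not taken from the PR's data.csv:

    import java.io.StringReader;
    import java.util.List;

    import com.univocity.parsers.csv.CsvParser;
    import com.univocity.parsers.csv.CsvParserSettings;

    public class UnivocitySketch {
      public static void main(String[] args) {
        CsvParserSettings settings = new CsvParserSettings();
        settings.getFormat().setDelimiter(',');     // ',' is the default; shown for clarity
        settings.setHeaderExtractionEnabled(true);  // skip the header row

        CsvParser parser = new CsvParser(settings);

        // Illustrative rows following the relation CSVCheckMapper asserts:
        // salary - 15000 == id - 1.
        String csv = "id,name,salary\n1,alice,15000\n2,bob,15001\n";
        List<String[]> rows = parser.parseAll(new StringReader(csv));
        for (String[] row : rows) {
          System.out.println(row[0] + " -> " + row[row.length - 1]);
        }
      }
    }

Univocity hands back each line as a String[], which maps naturally onto the StringArrayWritable value type the test's mapper consumes.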