[jira] [Commented] (CARBONDATA-296) 1.Add CSVInputFormat to read csv files.

Classic list | Threaded view
1 message Options
Reply | Threaded
Open this post in threaded view
|

[jira] [Commented] (CARBONDATA-296) 1.Add CSVInputFormat to read csv files.

Akash R Nilugal (Jira)

    [ https://issues.apache.org/jira/browse/CARBONDATA-296?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15575358#comment-15575358 ]

ASF GitHub Bot commented on CARBONDATA-296:
-------------------------------------------

Github user jackylk commented on a diff in the pull request:

    https://github.com/apache/incubator-carbondata/pull/233#discussion_r83422823
 
    --- Diff: hadoop/src/test/java/org/apache/carbondata/hadoop/csv/CSVInputFormatTest.java ---
    @@ -0,0 +1,171 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing,
    + * software distributed under the License is distributed on an
    + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    + * KIND, either express or implied.  See the License for the
    + * specific language governing permissions and limitations
    + * under the License.
    + */
    +package org.apache.carbondata.hadoop.csv;
    +
    +import java.io.File;
    +import java.io.FileInputStream;
    +import java.io.FileOutputStream;
    +import java.io.IOException;
    +
    +import org.apache.carbondata.hadoop.io.StringArrayWritable;
    +
    +import junit.framework.TestCase;
    +import org.junit.Assert;
    +import org.junit.Test;
    +import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.fs.Path;
    +import org.apache.hadoop.io.NullWritable;
    +import org.apache.hadoop.io.compress.BZip2Codec;
    +import org.apache.hadoop.io.compress.CompressionOutputStream;
    +import org.apache.hadoop.io.compress.GzipCodec;
    +import org.apache.hadoop.io.compress.Lz4Codec;
    +import org.apache.hadoop.io.compress.SnappyCodec;
    +import org.apache.hadoop.mapreduce.Job;
    +import org.apache.hadoop.mapreduce.Mapper;
    +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    +
    +public class CSVInputFormatTest extends TestCase {
    +
    +  /**
    +   * generate compressed files, no need to call this method.
    +   * @throws Exception
    +   */
    +  public void testGenerateCompressFiles() throws Exception {
    +    String pwd = new File("src/test/resources").getCanonicalPath();
    +    String inputFile = pwd + "/data.csv";
    +    FileInputStream input = new FileInputStream(inputFile);
    +    Configuration conf = new Configuration();
    +
    +    // .gz
    +    String outputFile = pwd + "/data.csv.gz";
    +    FileOutputStream output = new FileOutputStream(outputFile);
    +    GzipCodec gzip = new GzipCodec();
    +    gzip.setConf(conf);
    +    CompressionOutputStream outputStream = gzip.createOutputStream(output);
    +    int i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    // .bz2
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.bz2";
    +    output = new FileOutputStream(outputFile);
    +    BZip2Codec bzip2 = new BZip2Codec();
    +    bzip2.setConf(conf);
    +    outputStream = bzip2.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    // .snappy
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.snappy";
    +    output = new FileOutputStream(outputFile);
    +    SnappyCodec snappy = new SnappyCodec();
    +    snappy.setConf(conf);
    +    outputStream = snappy.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    //.lz4
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.lz4";
    +    output = new FileOutputStream(outputFile);
    +    Lz4Codec lz4 = new Lz4Codec();
    +    lz4.setConf(conf);
    +    outputStream = lz4.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +  }
    +
    +  /**
    +   * CSVCheckMapper check the content of csv files.
    +   */
    +  public static class CSVCheckMapper extends Mapper<NullWritable, StringArrayWritable, NullWritable,
    +      NullWritable> {
    +    @Override
    +    protected void map(NullWritable key, StringArrayWritable value, Context context)
    +        throws IOException, InterruptedException {
    +      String[] columns = value.get();
    +      int id = Integer.parseInt(columns[0]);
    +      int salary = Integer.parseInt(columns[6]);
    +      Assert.assertEquals(id - 1, salary - 15000);
    --- End diff --
   
    Why not assert for each one?


> 1.Add CSVInputFormat to read csv files.
> ---------------------------------------
>
>                 Key: CARBONDATA-296
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-296
>             Project: CarbonData
>          Issue Type: Sub-task
>            Reporter: Ravindra Pesala
>            Assignee: QiangCai
>             Fix For: 0.2.0-incubating
>
>
> Add CSVInputFormat to read csv files; it should use the Univocity parser to read csv files for optimal performance.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)