    [ https://issues.apache.org/jira/browse/CARBONDATA-296?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15588762#comment-15588762 ]

ASF GitHub Bot commented on CARBONDATA-296:
-------------------------------------------

Github user QiangCai commented on a diff in the pull request:

    https://github.com/apache/incubator-carbondata/pull/233#discussion_r84068389

    --- Diff: hadoop/src/test/java/org/apache/carbondata/hadoop/csv/CSVInputFormatTest.java ---
    @@ -0,0 +1,171 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements. See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership. The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License. You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing,
    + * software distributed under the License is distributed on an
    + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    + * KIND, either express or implied. See the License for the
    + * specific language governing permissions and limitations
    + * under the License.
    + */
    +package org.apache.carbondata.hadoop.csv;
    +
    +import java.io.File;
    +import java.io.FileInputStream;
    +import java.io.FileOutputStream;
    +import java.io.IOException;
    +
    +import org.apache.carbondata.hadoop.io.StringArrayWritable;
    +
    +import junit.framework.TestCase;
    +import org.junit.Assert;
    +import org.junit.Test;
    +import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.fs.Path;
    +import org.apache.hadoop.io.NullWritable;
    +import org.apache.hadoop.io.compress.BZip2Codec;
    +import org.apache.hadoop.io.compress.CompressionOutputStream;
    +import org.apache.hadoop.io.compress.GzipCodec;
    +import org.apache.hadoop.io.compress.Lz4Codec;
    +import org.apache.hadoop.io.compress.SnappyCodec;
    +import org.apache.hadoop.mapreduce.Job;
    +import org.apache.hadoop.mapreduce.Mapper;
    +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    +
    +public class CSVInputFormatTest extends TestCase {
    +
    +  /**
    +   * generate compressed files, no need to call this method.
    +   * @throws Exception
    +   */
    +  public void testGenerateCompressFiles() throws Exception {
    +    String pwd = new File("src/test/resources").getCanonicalPath();
    +    String inputFile = pwd + "/data.csv";
    +    FileInputStream input = new FileInputStream(inputFile);
    +    Configuration conf = new Configuration();
    +
    +    // .gz
    +    String outputFile = pwd + "/data.csv.gz";
    +    FileOutputStream output = new FileOutputStream(outputFile);
    +    GzipCodec gzip = new GzipCodec();
    +    gzip.setConf(conf);
    +    CompressionOutputStream outputStream = gzip.createOutputStream(output);
    +    int i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    // .bz2
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.bz2";
    +    output = new FileOutputStream(outputFile);
    +    BZip2Codec bzip2 = new BZip2Codec();
    +    bzip2.setConf(conf);
    +    outputStream = bzip2.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    // .snappy
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.snappy";
    +    output = new FileOutputStream(outputFile);
    +    SnappyCodec snappy = new SnappyCodec();
    +    snappy.setConf(conf);
    +    outputStream = snappy.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +    // .lz4
    +    input = new FileInputStream(inputFile);
    +    outputFile = pwd + "/data.csv.lz4";
    +    output = new FileOutputStream(outputFile);
    +    Lz4Codec lz4 = new Lz4Codec();
    +    lz4.setConf(conf);
    +    outputStream = lz4.createOutputStream(output);
    +    i = -1;
    +    while ((i = input.read()) != -1) {
    +      outputStream.write(i);
    +    }
    +    outputStream.close();
    +    input.close();
    +
    +  }
    +
    +  /**
    +   * CSVCheckMapper check the content of csv files.
    +   */
    +  public static class CSVCheckMapper extends Mapper<NullWritable, StringArrayWritable, NullWritable,
    +      NullWritable> {
    +    @Override
    +    protected void map(NullWritable key, StringArrayWritable value, Context context)
    +        throws IOException, InterruptedException {
    +      String[] columns = value.get();
    +      int id = Integer.parseInt(columns[0]);
    +      int salary = Integer.parseInt(columns[6]);
    +      Assert.assertEquals(id - 1, salary - 15000);
    --- End diff --

    I think it is enough to check two columns.
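The four codec sections in testGenerateCompressFiles above repeat the same byte-at-a-time copy loop. A minimal sketch of how they could be collapsed, assuming only the hadoop-common utilities IOUtils and ReflectionUtils already on the test classpath; the compressFile helper and its enclosing class name are hypothetical, not code from the PR:

    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.IOUtils;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionOutputStream;
    import org.apache.hadoop.util.ReflectionUtils;

    public final class CompressTestFiles {

      // Compresses inputFile to outputFile with the given codec class.
      // ReflectionUtils.newInstance injects the Configuration into codecs
      // that implement Configurable, replacing the explicit setConf calls.
      static void compressFile(Configuration conf, Class<? extends CompressionCodec> codecClass,
          String inputFile, String outputFile) throws IOException {
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        FileInputStream input = new FileInputStream(inputFile);
        CompressionOutputStream output = codec.createOutputStream(new FileOutputStream(outputFile));
        try {
          // Buffered copy instead of the byte-at-a-time loop; 'false' leaves
          // both streams open so the finally block stays responsible for them.
          IOUtils.copyBytes(input, output, 4096, false);
        } finally {
          IOUtils.closeStream(output);
          IOUtils.closeStream(input);
        }
      }
    }

Each section of the test method would then shrink to one call per codec, e.g. compressFile(conf, GzipCodec.class, inputFile, pwd + "/data.csv.gz").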
> 1. Add CSVInputFormat to read CSV files.
> ----------------------------------------
>
>                 Key: CARBONDATA-296
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-296
>             Project: CarbonData
>          Issue Type: Sub-task
>            Reporter: Ravindra Pesala
>            Assignee: QiangCai
>             Fix For: 0.2.0-incubating
>
>
> Add CSVInputFormat to read CSV files; it should use the Univocity parser for optimal performance.
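For the Univocity parser named in the description, a minimal sketch of the parsing API such an input format's record reader would sit on top of, assuming the univocity-parsers dependency is available; the sample rows and column layout here are illustrative only, not taken from the PR's data.csv:

    import java.io.StringReader;
    import java.util.List;

    import com.univocity.parsers.csv.CsvParser;
    import com.univocity.parsers.csv.CsvParserSettings;

    public class UnivocitySketch {
      public static void main(String[] args) {
        CsvParserSettings settings = new CsvParserSettings();
        settings.getFormat().setDelimiter(',');     // ',' is the default; shown for clarity
        settings.setHeaderExtractionEnabled(true);  // skip the header row

        CsvParser parser = new CsvParser(settings);

        // Illustrative rows following the relation CSVCheckMapper asserts:
        // salary - 15000 == id - 1.
        String csv = "id,name,salary\n1,alice,15000\n2,bob,15001\n";
        List<String[]> rows = parser.parseAll(new StringReader(csv));
        for (String[] row : rows) {
          System.out.println(row[0] + " -> " + row[row.length - 1]);
        }
      }
    }

Univocity hands back each line as a String[], which maps naturally onto the StringArrayWritable value type the test's mapper consumes.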