Question about dimension sort order at the blocklet level

Posted by Liang Chen on
URL: http://apache-carbondata-dev-mailing-list-archive.168.s1.nabble.com/Re-DISCUSSION-Initiating-Apache-CarbonData-1-1-0-incubating-Release-tp9672p9748.html

Hi

Could you present this information in a single table? It is hard to read as shown.

Columns with high cardinality (>1000000) are not dictionary encoded.

Regards
Liang

2017-03-27 14:32 GMT+05:30 马云 <simafengyun1984@163.com>:

> Hi DEV,
>
>  I created a table using the SQL below:
>
>     cc.sql("""
>
>            CREATE TABLE IF NOT EXISTS t3
>
>            (ID Int,
>
> date Timestamp,
>
> country String,
>
> name String,
>
> phonetype String,
>
> serialname String,
>
> salary Int,
>
> name1 String,
>
> name2 String,
>
> name3 String,
>
> name4 String,
>
> name5 String,
>
> name6 String,
>
> name7 String,
>
> name8 String
>
>            )
>
>            STORED BY 'carbondata'
>
>            """)
>
>
>
> The data cardinality is as below.
>
> column cardinality
>
> | name     | name1    | name2    | name3    | name4    | name5    | name6    | name7    | name8    |
> | 10000000 | 10000000 | 10000000 | 10000000 | 10000000 | 10000000 | 10000000 | 10000000 | 10000000 |
>
>
>
> After loading data into this table, I found that the dimension columns "name" and
> "name7" both have no dictionary encoding.
>
> However, column "name" has no inverted index, while column "name7" has an inverted
> index.
>
> questions:
>
> 1. The dimension column "name" has dictionary encoding but no inverted
> index; is its data still ordered in the DataChunk2 blocklet?
>
> 2. Is there any document that describes these loading strategies?
>
>
> 3. If a dimension column has no dictionary encoding and no inverted
> index, and the user did not specify the column as having no inverted index when
> creating the table,
>     is its data still ordered in the DataChunk2 blocklet?
>
> 4. As far as I know, by default all dimension column data is sorted and stored
> in the DataChunk2 blocklet, unless the user specifies the column with no inverted
> index. Is that right?
>
> 5. As far as I know, the first dimension column of the MDK key is always sorted in
> the DataChunk2 blocklet; why is isExplicitSorted not set to true?
>
>
>
>  The attached code is used to generate data.csv:
>
> package test;
>
>
>
>
> import java.io.BufferedOutputStream;
>
> import java.io.File;
>
> import java.io.FileOutputStream;
>
> import java.io.FileWriter;
>
> import java.util.HashMap;
>
> import java.util.Map;
>
>
>
>
> publicclass CreateData {
>
>
>
>
>   public CreateData() {
>
>
>
>
>   }
>
>
>
>
>   publicstaticvoid main(String[] args) {
>
>
>
>
>     FileOutputStream out = null;
>
>
>
>
>     FileOutputStream outSTr = null;
>
>
>
>
>     BufferedOutputStream Buff = null;
>
>
>
>
>     FileWriter fw = null;
>
>
>
>
>     intcount = 1000;// 写文件行数
>
>
>
>
>     try {
>
>
>
>
>       outSTr = new FileOutputStream(new File("data.csv"));
>
>
>
>
>       Buff = new BufferedOutputStream(outSTr);
>
>
>
>
>       longbegin0 = System.currentTimeMillis();
>
>       Buff.write(
>
>           "ID,date,country,name,phonetype,serialname,salary,
> name1,name2,name3,name4,name5,name6,name7,name8\n"
>
>               .getBytes());
>
>
>
>
>       intidcount = 10000000;
>
>       intdatecount = 30;
>
>       intcountrycount = 5;
>
>       // intnamecount =5000000;
>
>       intphonetypecount = 10000;
>
>       intserialnamecount = 50000;
>
>       // intsalarycount = 200000;
>
>       Map<Integer, String> countryMap = new HashMap<Integer, String>();
>
>       countryMap.put(1, "usa");
>
>       countryMap.put(2, "uk");
>
>       countryMap.put(3, "china");
>
>       countryMap.put(4, "indian");
>
>       countryMap.put(0, "canada");
>
>
>
>
>       StringBuilder sb = null;
>
>       for (inti = idcount; i >= 0; i--) {
>
>
>
>
>         sb = new StringBuilder();
>
>         sb.append(4000000 + i).append(",");// id
>
>         sb.append("2015/8/" + (i % datecount + 1)).append(",");
>
>         sb.append(countryMap.get(i % countrycount)).append(",");
>
>         sb.append("name" + (1600000 - i)).append(",");// name
>
>         sb.append("phone" + i % phonetypecount).append(",");
>
>         sb.append("serialname" + (100000 + i %
> serialnamecount)).append(",");// serialname
>
>         sb.append(i + 500000).append(",");
>
>         sb.append("name1" + (i + 100000)).append(",");// name
>
>         sb.append("name2" + (i + 200000)).append(",");// name
>
>         sb.append("name3" + (i + 300000)).append(",");// name
>
>         sb.append("name4" + (i + 400000)).append(",");// name
>
>         sb.append("name5" + (i + 500000)).append(",");// name
>
>         sb.append("name6" + (i + 600000)).append(",");// name
>
>         sb.append("name7" + (i + 700000)).append(",");// name
>
>         sb.append("name8" + (i + 800000)).append(",").append('\n');
>
>
>
>
>         Buff.write(sb.toString().getBytes());
>
>
>
>
>       }
>
>
>
>
>       Buff.flush();
>
>
>
>
>       Buff.close();
>
>       System.out.println("sb.toString():" + sb.toString());
>
>       longend0 = System.currentTimeMillis();
>
>
>
>
>       System.out.println("BufferedOutputStream执行耗时:" + (end0 - begin0) +
> " 豪秒");
>
>
>
>
>     } catch (Exception e) {
>
>
>
>
>       e.printStackTrace();
>
>
>
>
>     }
>
>
>
>
>     finally {
>
>
>
>
>       try {
>
>
>
>
>         // fw.close();
>
>
>
>
>         Buff.close();
>
>
>
>
>         outSTr.close();
>
>
>
>
>         // out.close();
>
>
>
>
>       } catch (Exception e) {
>
>
>
>
>         e.printStackTrace();
>
>
>
>
>       }
>
>
>
>
>     }
>
>
>
>
>   }
>
>
>
>
> }




--
Regards
Liang