[ https://issues.apache.org/jira/browse/CARBONDATA-1655?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

cen yuhai updated CARBONDATA-1655:
----------------------------------
Description:

I have a table with 4 billion records, and the getSplits function is very slow: a single getSplits call takes about 20 seconds.

{code}
"main" #1 prio=5 os_prio=0 tid=0x00007fcc94013000 nid=0x5ed5 runnable [0x00007fcc992b6000]
   java.lang.Thread.State: RUNNABLE
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:155)
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170)
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getLengthInBytes(UnsafeDataMapRow.java:61)
    at org.apache.carbondata.core.indexstore.row.DataMapRow.getSizeInBytes(DataMapRow.java:80)
    at org.apache.carbondata.core.indexstore.row.DataMapRow.getTotalSizeInBytes(DataMapRow.java:70)
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:161)
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170)
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getRow(UnsafeDataMapRow.java:89)
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:161)
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170)
    at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getByteArray(UnsafeDataMapRow.java:43)
    at org.apache.carbondata.core.indexstore.blockletindex.BlockletDataMap.createBlocklet(BlockletDataMap.java:310)
    at org.apache.carbondata.core.indexstore.blockletindex.BlockletDataMap.prune(BlockletDataMap.java:268)
    at org.apache.carbondata.core.datamap.TableDataMap.prune(TableDataMap.java:66)
    at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getDataBlocksOfSegment(CarbonTableInputFormat.java:524)
    at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getSplits(CarbonTableInputFormat.java:453)
    at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getSplits(CarbonTableInputFormat.java:324)
    at org.apache.carbondata.spark.rdd.CarbonScanRDD.getPartitions(CarbonScanRDD.scala:84)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:260)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:258)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:258)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:260)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:258)
    at scala.Option.getOrElse(Option.scala:121)
{code}

spark-sql> select dt from dm_test.table_carbondata limit 1;
NULL
Time taken: 20.94 seconds, Fetched 1 row(s)

was:

I have a table with 4 billion records, and the getSplits function is very slow: a single getSplits call takes about 20 seconds.
{code} "main" #1 prio=5 os_prio=0 tid=0x00007fcc94013000 nid=0x5ed5 runnable [0x00007fcc992b6000] java.lang.Thread.State: RUNNABLE at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:155) at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170) at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getLengthInBytes(UnsafeDataMapRow.java:61) at org.apache.carbondata.core.indexstore.row.DataMapRow.getSizeInBytes(DataMapRow.java:80) at org.apache.carbondata.core.indexstore.row.DataMapRow.getTotalSizeInBytes(DataMapRow.java:70) at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:161) at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170) at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getRow(UnsafeDataMapRow.java:89) at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:161) at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170) at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getByteArray(UnsafeDataMapRow.java:43) at org.apache.carbondata.core.indexstore.blockletindex.BlockletDataMap.createBlocklet(BlockletDataMap.java:310) at org.apache.carbondata.core.indexstore.blockletindex.BlockletDataMap.prune(BlockletDataMap.java:268) at org.apache.carbondata.core.datamap.TableDataMap.prune(TableDataMap.java:66) at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getDataBlocksOfSegment(CarbonTableInputFormat.java:524) at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getSplits(CarbonTableInputFormat.java:453) at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getSplits(CarbonTableInputFormat.java:324) at org.apache.carbondata.spark.rdd.CarbonScanRDD.getPartitions(CarbonScanRDD.scala:84) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:260) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:258) at scala.Option.getOrElse(Option.scala:121) at org.apache.spark.rdd.RDD.partitions(RDD.scala:258) at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:260) at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:258) at scala.Option.getOrElse(Option.scala:121) ``` {code} spark-sql> select dt from dm_test.dm_trd_order_wide_carbondata limit 1; NULL Time taken: 20.94 seconds, Fetched 1 row(s) > getSplits function is very slow !!! > ----------------------------------- > > Key: CARBONDATA-1655 > URL: https://issues.apache.org/jira/browse/CARBONDATA-1655 > Project: CarbonData > Issue Type: Bug > Components: data-query > Reporter: cen yuhai > > I have a table which has 4 billion records, I find that the getSplits function is too slow! > getSplit spent 20s!!! 
> {code} > "main" #1 prio=5 os_prio=0 tid=0x00007fcc94013000 nid=0x5ed5 runnable [0x00007fcc992b6000] > java.lang.Thread.State: RUNNABLE > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:155) > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170) > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getLengthInBytes(UnsafeDataMapRow.java:61) > at org.apache.carbondata.core.indexstore.row.DataMapRow.getSizeInBytes(DataMapRow.java:80) > at org.apache.carbondata.core.indexstore.row.DataMapRow.getTotalSizeInBytes(DataMapRow.java:70) > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:161) > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170) > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getRow(UnsafeDataMapRow.java:89) > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getSizeInBytes(UnsafeDataMapRow.java:161) > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getPosition(UnsafeDataMapRow.java:170) > at org.apache.carbondata.core.indexstore.row.UnsafeDataMapRow.getByteArray(UnsafeDataMapRow.java:43) > at org.apache.carbondata.core.indexstore.blockletindex.BlockletDataMap.createBlocklet(BlockletDataMap.java:310) > at org.apache.carbondata.core.indexstore.blockletindex.BlockletDataMap.prune(BlockletDataMap.java:268) > at org.apache.carbondata.core.datamap.TableDataMap.prune(TableDataMap.java:66) > at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getDataBlocksOfSegment(CarbonTableInputFormat.java:524) > at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getSplits(CarbonTableInputFormat.java:453) > at org.apache.carbondata.hadoop.api.CarbonTableInputFormat.getSplits(CarbonTableInputFormat.java:324) > at org.apache.carbondata.spark.rdd.CarbonScanRDD.getPartitions(CarbonScanRDD.scala:84) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:260) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:258) > at scala.Option.getOrElse(Option.scala:121) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:258) > at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:260) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:258) > at scala.Option.getOrElse(Option.scala:121) > ``` > {code} > spark-sql> select dt from dm_test.table_carbondata limit 1; > NULL > Time taken: 20.94 seconds, Fetched 1 row(s) -- This message was sent by Atlassian JIRA (v6.4.14#64029) |