Login  Register

Problem on carbondata quering performance tuning

Posted by Mick Yuan on Apr 02, 2018; 7:30am
URL: http://apache-carbondata-dev-mailing-list-archive.168.s1.nabble.com/Problem-on-carbondata-quering-performance-tuning-tp44031.html

Hi,all
  I have a quering performane tuning case on carbondata.

  *Enviroment is as below:*:
  spark on yarn
  4 nodemanagers
  102G,55 cores each nodemanager

  *Spark properties:*
spark.master yarn
spark.submit.deployMode client
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 2g
spark.executor.memory 90g
spark.executor.cores 54
spark.sql.hive.convertmetastoreParquet false
spark.executor.instances 10
spark.sql.shuffle.partitions 432
spark.speculation true
  *Carbondata properties:*
carbon.storelocation=hdfs://hacluster/Opt/CarbonStore
carbon.ddl.base.hdfs.url=hdfs://hacluster/opt/data
carbon.badRecords.location=/opt/Carbon/Spark/badrecords
carbon.sort.file.buffer.size=80
carbon.graph.rowset.size=100000
carbon.number.of.cores.while.loading=24
carbon.sort.size=1000000
carbon.enableXXHash=true
carbon.number.of.cores.while.alterPartition=2
carbon.number.of.cores.while.compacting=2
carbon.compaction.level.threshold=4,3
carbon.major.compaction.size=1024
carbon.number.of.cores=4
carbon.inmemory.record.size=120000

  *Create table sql:*
CREATE TABLE
    yuan_yuan10_DATE_DIM
    (
        D_YEAR string,
        D_MOY string,
        D_DATE_SK string,
        D_DATE_ID string,
        D_DATE string,
        D_MONTH_SEQ string,
        D_WEEK_SEQ string,
        D_QUARTER_SEQ string,
        D_DOW string,
        D_DOM string,
        D_QOY string,
        D_FY_YEAR string,
        D_FY_QUARTER_SEQ string,
        D_FY_WEEK_SEQ string,
        D_DAY_NAME string,
        D_QUARTER_NAME string,
        D_HOLIDAY string,
        D_WEEKEND string,
        D_FOLLOWING_HOLIDAY string,
        D_FIRST_DOM string,
        D_LAST_DOM string,
        D_SAME_DAY_LY string,
        D_SAME_DAY_LQ string,
        D_CURRENT_DAY string,
        D_CURRENT_WEEK string,
        D_CURRENT_MONTH string,
        D_CURRENT_QUARTER string,
        D_CURRENT_YEAR string,
        REMARK string,
        HYREN_S_DATE string,
        HYREN_E_DATE string,
        HYREN_MD5_VAL string
    )
    stored BY 'carbondata' TBLPROPERTIES
    (
        'TABLE_BLOCKSIZE'='256',
        'SORT_COLUMNS'='D_YEAR,D_MOY,D_DATE_SK',
        'DICTIONARY_INCLUDE'='D_MOY,D_YEAR,D_DATE_SK'
    );



CREATE TABLE
    yuan_yuan10_STORE_SALES
    (
        SS_SOLD_DATE_SK string,
        SS_ITEM_SK string,
        SS_CUSTOMER_SK string,
        SS_STORE_SK string,
        SS_SOLD_TIME_SK string,
        SS_CDEMO_SK string,
        SS_HDEMO_SK string,
        SS_ADDR_SK string,
        SS_PROMO_SK string,
        SS_TICKET_NUMBER string,
        SS_QUANTITY string,
        SS_WHOLESALE_COST string,
        SS_LIST_PRICE string,
        SS_SALES_PRICE string,
        SS_EXT_DISCOUNT_AMT string,
        SS_EXT_SALES_PRICE string,
        SS_EXT_WHOLESALE_COST string,
        SS_EXT_LIST_PRICE string,
        SS_EXT_TAX string,
        SS_COUPON_AMT string,
        SS_NET_PAID string,
        SS_NET_PAID_INC_TAX string,
        SS_NET_PROFIT string,
        REMARK string,
        HYREN_S_DATE string,
        HYREN_E_DATE string,
        HYREN_MD5_VAL string
    )
    stored BY 'carbondata' TBLPROPERTIES
    (
        'TABLE_BLOCKSIZE'='256',
        'SORT_COLUMNS'='SS_SOLD_DATE_SK,SS_ITEM_SK,SS_CUSTOMER_SK',
        'DICTIONARY_INCLUDE'='SS_SOLD_DATE_SK,SS_ITEM_SK,SS_CUSTOMER_SK'
    );


CREATE TABLE
    yuan_yuan10_ITEM
    (
        I_MANAGER_ID string,
        I_ITEM_SK string,
        I_ITEM_ID string,
        I_REC_START_DATE string,
        I_REC_END_DATE string,
        I_ITEM_DESC string,
        I_CURRENT_PRICE string,
        I_WHOLESALE_COST string,
        I_BRAND_ID string,
        I_BRAND string,
        I_CLASS_ID string,
        I_CLASS string,
        I_CATEGORY_ID string,
        I_CATEGORY string,
        I_MANUFACT_ID string,
        I_MANUFACT string,
        I_SIZE string,
        I_FORMULATION string,
        I_COLOR string,
        I_UNITS string,
        I_CONTAINER string,
        I_PRODUCT_NAME string,
        REMARK string,
        HYREN_S_DATE string,
        HYREN_E_DATE string,
        HYREN_MD5_VAL string
    )
    stored BY 'carbondata' TBLPROPERTIES
    (
        'TABLE_BLOCKSIZE'='256',
        'SORT_COLUMNS'='I_MANAGER_ID,I_ITEM_SK',
        'DICTIONARY_INCLUDE'='I_MANAGER_ID,I_ITEM_SK'
    );


CREATE TABLE
    yuan_yuan10_CUSTOMER
    (
        C_CURRENT_ADDR_SK string,
        C_CUSTOMER_SK string,
        C_CUSTOMER_ID string,
        C_CURRENT_CDEMO_SK string,
        C_CURRENT_HDEMO_SK string,
        C_FIRST_SHIPTO_DATE_SK string,
        C_FIRST_SALES_DATE_SK string,
        C_SALUTATION string,
        C_FIRST_NAME string,
        C_LAST_NAME string,
        C_PREFERRED_CUST_FLAG string,
        C_BIRTH_DAY string,
        C_BIRTH_MONTH string,
        C_BIRTH_YEAR string,
        C_BIRTH_COUNTRY string,
        C_LOGIN string,
        C_EMAIL_ADDRESS string,
        C_LAST_REVIEW_DATE string,
        REMRK string,
        HYREN_S_DATE string,
        HYREN_E_DATE string,
        HYREN_MD5_VAL string
    )
    stored BY 'carbondata' TBLPROPERTIES
    (
        'TABLE_BLOCKSIZE'='256',
        'SORT_COLUMNS'='C_CURRENT_ADDR_SK,C_CUSTOMER_SK',
        'DICTIONARY_INCLUDE'='C_CURRENT_ADDR_SK,C_CUSTOMER_SK'
    );



CREATE TABLE
    yuan_yuan10_CUSTOMER_ADDRESS
    (
        CA_ZIP string,
        CA_ADDRESS_SK string,
        CA_ADDRESS_ID string,
        CA_STREET_NUMBER string,
        CA_STREET_NAME string,
        CA_STREET_TYPE string,
        CA_SUITE_NUMBER string,
        CA_CITY string,
        CA_COUNTY string,
        CA_STATE string,
        CA_COUNTRY string,
        CA_GMT_OFFSET string,
        CA_LOCATION_TYPE string,
        REMARK string,
        HYREN_S_DATE string,
        HYREN_E_DATE string,
        HYREN_MD5_VAL string
    )
    stored BY 'carbondata' TBLPROPERTIES
    (
        'TABLE_BLOCKSIZE'='256',
        'SORT_COLUMNS'='CA_ZIP,CA_ADDRESS_SK',
        'DICTIONARY_INCLUDE'='CA_ZIP,CA_ADDRESS_SK'
    );


CREATE TABLE
    yuan_yuan10_STORE
    (
        S_ZIP string,
        S_STORE_SK string,
        S_STORE_ID string,
        S_REC_START_DATE string,
        S_REC_END_DATE string,
        S_CLOSED_DATE_SK string,
        S_STORE_NAME string,
        S_NUMBER_EMPLOYEES string,
        S_FLOOR_SPACE string,
        S_HOURS string,
        S_MANAGER string,
        S_MARKET_ID string,
        S_GEOGRAPHY_CLASS string,
        S_MARKET_DESC string,
        S_MARKET_MANAGER string,
        S_DIVISION_ID string,
        S_DIVISION_NAME string,
        S_COMPANY_ID string,
        S_COMPANY_NAME string,
        S_STREET_NUMBER string,
        S_STREET_NAME string,
        S_STREET_TYPE string,
        S_SUITE_NUMBER string,
        S_CITY string,
        S_COUNTY string,
        S_STATE string,
        S_COUNTRY string,
        S_GMT_OFFSET string,
        S_TAX_PRECENTAGE string,
        REMRK string,
        HYREN_S_DATE string,
        HYREN_E_DATE string,
        HYREN_MD5_VAL string
    )
    stored BY 'carbondata' TBLPROPERTIES
    (
        'TABLE_BLOCKSIZE'='256',
        'SORT_COLUMNS'='S_ZIP,S_STORE_SK',
        'DICTIONARY_INCLUDE'='S_ZIP,S_STORE_SK'
    );

*Query sql:*
SELECT
    i_brand_id brand_id,
    i_brand    brand,
    i_manufact_id,
    i_manufact,
    SUM(ss_ext_sales_price) ext_price
FROM
    yuan_yuan10_date_dim,
    yuan_yuan10_store_sales,
    yuan_yuan10_item,
    yuan_yuan10_customer,
    yuan_yuan10_customer_address,
    yuan_yuan10_store
WHERE
    d_date_sk = ss_sold_date_sk
AND ss_item_sk = i_item_sk
AND i_manager_id = 7
AND d_moy = 11
AND d_year = 1999
AND ss_customer_sk = c_customer_sk
AND c_current_addr_sk = ca_address_sk
AND SUBSTR(ca_zip, 1, 5) <> SUBSTR(s_zip, 1, 5)
AND ss_store_sk = s_store_sk
AND ss_sold_date_sk BETWEEN 2451484 AND 2451513 -- partition key filter
GROUP BY
    i_brand,
    i_brand_id,
    i_manufact_id,
    i_manufact
ORDER BY
    ext_price DESC,
    i_brand,
    i_brand_id,
    i_manufact_id,
    i_manufact limit 100;

*Table records:*
all about 171G
yuan_yuan10_date_dim
 73049
 yuan_yuan10_store_sales
 1439977468
 yuan_yuan10_item
 28000
 yuan_yuan10_customer
 2100000
 yuan_yuan10_customer_address
 1050000
 yuan_yuan10_store
 68

The query statement takes at least 6s,how can I reduce the time to 2s?




 
 



--
Sent from: http://apache-carbondata-dev-mailing-list-archive.1130556.n5.nabble.com/