Posted by
Mick Yuan on
Apr 02, 2018; 7:30am
URL: http://apache-carbondata-dev-mailing-list-archive.168.s1.nabble.com/Problem-on-carbondata-quering-performance-tuning-tp44031.html
Hi,all
I have a quering performane tuning case on carbondata.
*Enviroment is as below:*:
spark on yarn
4 nodemanagers
102G,55 cores each nodemanager
*Spark properties:*
spark.master yarn
spark.submit.deployMode client
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 2g
spark.executor.memory 90g
spark.executor.cores 54
spark.sql.hive.convertmetastoreParquet false
spark.executor.instances 10
spark.sql.shuffle.partitions 432
spark.speculation true
*Carbondata properties:*
carbon.storelocation=hdfs://hacluster/Opt/CarbonStore
carbon.ddl.base.hdfs.url=hdfs://hacluster/opt/data
carbon.badRecords.location=/opt/Carbon/Spark/badrecords
carbon.sort.file.buffer.size=80
carbon.graph.rowset.size=100000
carbon.number.of.cores.while.loading=24
carbon.sort.size=1000000
carbon.enableXXHash=true
carbon.number.of.cores.while.alterPartition=2
carbon.number.of.cores.while.compacting=2
carbon.compaction.level.threshold=4,3
carbon.major.compaction.size=1024
carbon.number.of.cores=4
carbon.inmemory.record.size=120000
*Create table sql:*
CREATE TABLE
yuan_yuan10_DATE_DIM
(
D_YEAR string,
D_MOY string,
D_DATE_SK string,
D_DATE_ID string,
D_DATE string,
D_MONTH_SEQ string,
D_WEEK_SEQ string,
D_QUARTER_SEQ string,
D_DOW string,
D_DOM string,
D_QOY string,
D_FY_YEAR string,
D_FY_QUARTER_SEQ string,
D_FY_WEEK_SEQ string,
D_DAY_NAME string,
D_QUARTER_NAME string,
D_HOLIDAY string,
D_WEEKEND string,
D_FOLLOWING_HOLIDAY string,
D_FIRST_DOM string,
D_LAST_DOM string,
D_SAME_DAY_LY string,
D_SAME_DAY_LQ string,
D_CURRENT_DAY string,
D_CURRENT_WEEK string,
D_CURRENT_MONTH string,
D_CURRENT_QUARTER string,
D_CURRENT_YEAR string,
REMARK string,
HYREN_S_DATE string,
HYREN_E_DATE string,
HYREN_MD5_VAL string
)
stored BY 'carbondata' TBLPROPERTIES
(
'TABLE_BLOCKSIZE'='256',
'SORT_COLUMNS'='D_YEAR,D_MOY,D_DATE_SK',
'DICTIONARY_INCLUDE'='D_MOY,D_YEAR,D_DATE_SK'
);
CREATE TABLE
yuan_yuan10_STORE_SALES
(
SS_SOLD_DATE_SK string,
SS_ITEM_SK string,
SS_CUSTOMER_SK string,
SS_STORE_SK string,
SS_SOLD_TIME_SK string,
SS_CDEMO_SK string,
SS_HDEMO_SK string,
SS_ADDR_SK string,
SS_PROMO_SK string,
SS_TICKET_NUMBER string,
SS_QUANTITY string,
SS_WHOLESALE_COST string,
SS_LIST_PRICE string,
SS_SALES_PRICE string,
SS_EXT_DISCOUNT_AMT string,
SS_EXT_SALES_PRICE string,
SS_EXT_WHOLESALE_COST string,
SS_EXT_LIST_PRICE string,
SS_EXT_TAX string,
SS_COUPON_AMT string,
SS_NET_PAID string,
SS_NET_PAID_INC_TAX string,
SS_NET_PROFIT string,
REMARK string,
HYREN_S_DATE string,
HYREN_E_DATE string,
HYREN_MD5_VAL string
)
stored BY 'carbondata' TBLPROPERTIES
(
'TABLE_BLOCKSIZE'='256',
'SORT_COLUMNS'='SS_SOLD_DATE_SK,SS_ITEM_SK,SS_CUSTOMER_SK',
'DICTIONARY_INCLUDE'='SS_SOLD_DATE_SK,SS_ITEM_SK,SS_CUSTOMER_SK'
);
CREATE TABLE
yuan_yuan10_ITEM
(
I_MANAGER_ID string,
I_ITEM_SK string,
I_ITEM_ID string,
I_REC_START_DATE string,
I_REC_END_DATE string,
I_ITEM_DESC string,
I_CURRENT_PRICE string,
I_WHOLESALE_COST string,
I_BRAND_ID string,
I_BRAND string,
I_CLASS_ID string,
I_CLASS string,
I_CATEGORY_ID string,
I_CATEGORY string,
I_MANUFACT_ID string,
I_MANUFACT string,
I_SIZE string,
I_FORMULATION string,
I_COLOR string,
I_UNITS string,
I_CONTAINER string,
I_PRODUCT_NAME string,
REMARK string,
HYREN_S_DATE string,
HYREN_E_DATE string,
HYREN_MD5_VAL string
)
stored BY 'carbondata' TBLPROPERTIES
(
'TABLE_BLOCKSIZE'='256',
'SORT_COLUMNS'='I_MANAGER_ID,I_ITEM_SK',
'DICTIONARY_INCLUDE'='I_MANAGER_ID,I_ITEM_SK'
);
CREATE TABLE
yuan_yuan10_CUSTOMER
(
C_CURRENT_ADDR_SK string,
C_CUSTOMER_SK string,
C_CUSTOMER_ID string,
C_CURRENT_CDEMO_SK string,
C_CURRENT_HDEMO_SK string,
C_FIRST_SHIPTO_DATE_SK string,
C_FIRST_SALES_DATE_SK string,
C_SALUTATION string,
C_FIRST_NAME string,
C_LAST_NAME string,
C_PREFERRED_CUST_FLAG string,
C_BIRTH_DAY string,
C_BIRTH_MONTH string,
C_BIRTH_YEAR string,
C_BIRTH_COUNTRY string,
C_LOGIN string,
C_EMAIL_ADDRESS string,
C_LAST_REVIEW_DATE string,
REMRK string,
HYREN_S_DATE string,
HYREN_E_DATE string,
HYREN_MD5_VAL string
)
stored BY 'carbondata' TBLPROPERTIES
(
'TABLE_BLOCKSIZE'='256',
'SORT_COLUMNS'='C_CURRENT_ADDR_SK,C_CUSTOMER_SK',
'DICTIONARY_INCLUDE'='C_CURRENT_ADDR_SK,C_CUSTOMER_SK'
);
CREATE TABLE
yuan_yuan10_CUSTOMER_ADDRESS
(
CA_ZIP string,
CA_ADDRESS_SK string,
CA_ADDRESS_ID string,
CA_STREET_NUMBER string,
CA_STREET_NAME string,
CA_STREET_TYPE string,
CA_SUITE_NUMBER string,
CA_CITY string,
CA_COUNTY string,
CA_STATE string,
CA_COUNTRY string,
CA_GMT_OFFSET string,
CA_LOCATION_TYPE string,
REMARK string,
HYREN_S_DATE string,
HYREN_E_DATE string,
HYREN_MD5_VAL string
)
stored BY 'carbondata' TBLPROPERTIES
(
'TABLE_BLOCKSIZE'='256',
'SORT_COLUMNS'='CA_ZIP,CA_ADDRESS_SK',
'DICTIONARY_INCLUDE'='CA_ZIP,CA_ADDRESS_SK'
);
CREATE TABLE
yuan_yuan10_STORE
(
S_ZIP string,
S_STORE_SK string,
S_STORE_ID string,
S_REC_START_DATE string,
S_REC_END_DATE string,
S_CLOSED_DATE_SK string,
S_STORE_NAME string,
S_NUMBER_EMPLOYEES string,
S_FLOOR_SPACE string,
S_HOURS string,
S_MANAGER string,
S_MARKET_ID string,
S_GEOGRAPHY_CLASS string,
S_MARKET_DESC string,
S_MARKET_MANAGER string,
S_DIVISION_ID string,
S_DIVISION_NAME string,
S_COMPANY_ID string,
S_COMPANY_NAME string,
S_STREET_NUMBER string,
S_STREET_NAME string,
S_STREET_TYPE string,
S_SUITE_NUMBER string,
S_CITY string,
S_COUNTY string,
S_STATE string,
S_COUNTRY string,
S_GMT_OFFSET string,
S_TAX_PRECENTAGE string,
REMRK string,
HYREN_S_DATE string,
HYREN_E_DATE string,
HYREN_MD5_VAL string
)
stored BY 'carbondata' TBLPROPERTIES
(
'TABLE_BLOCKSIZE'='256',
'SORT_COLUMNS'='S_ZIP,S_STORE_SK',
'DICTIONARY_INCLUDE'='S_ZIP,S_STORE_SK'
);
*Query sql:*
SELECT
i_brand_id brand_id,
i_brand brand,
i_manufact_id,
i_manufact,
SUM(ss_ext_sales_price) ext_price
FROM
yuan_yuan10_date_dim,
yuan_yuan10_store_sales,
yuan_yuan10_item,
yuan_yuan10_customer,
yuan_yuan10_customer_address,
yuan_yuan10_store
WHERE
d_date_sk = ss_sold_date_sk
AND ss_item_sk = i_item_sk
AND i_manager_id = 7
AND d_moy = 11
AND d_year = 1999
AND ss_customer_sk = c_customer_sk
AND c_current_addr_sk = ca_address_sk
AND SUBSTR(ca_zip, 1, 5) <> SUBSTR(s_zip, 1, 5)
AND ss_store_sk = s_store_sk
AND ss_sold_date_sk BETWEEN 2451484 AND 2451513 -- partition key filter
GROUP BY
i_brand,
i_brand_id,
i_manufact_id,
i_manufact
ORDER BY
ext_price DESC,
i_brand,
i_brand_id,
i_manufact_id,
i_manufact limit 100;
*Table records:*
all about 171G
yuan_yuan10_date_dim
73049
yuan_yuan10_store_sales
1439977468
yuan_yuan10_item
28000
yuan_yuan10_customer
2100000
yuan_yuan10_customer_address
1050000
yuan_yuan10_store
68
The query statement takes at least 6s,how can I reduce the time to 2s?
--
Sent from:
http://apache-carbondata-dev-mailing-list-archive.1130556.n5.nabble.com/