delete Invalid tuple id datablock lost


yixu2001
dev 
 
Spark 2.1 + CarbonData 1.1.1

When executing the SQL "delete from e_carbon.offer_inst_c where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)",
the following error occurs:
"Delete data operation is failed: [[java.io.IOException: java.lang.Exception: Invalid tuple id 0/927.37/0-11_batchno0-0-1514560063689"

After that, I found that more records were deleted than intended: not only the rows matching table cache_offer_inst_U, but also the whole data block "0-11_batchno0-0-1514560063689".
For example, there are 100,000 records in the table "cache_offer_inst_U", but it actually deleted 2,000,000 records.
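If anyone wants to reproduce this, here is a minimal sketch (not my exact production job; the master URL and store path are placeholders, and it assumes the CarbonSession API of CarbonData 1.1.x). It compares how many rows the DELETE should remove with how many it actually removed:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.CarbonSession._ // adds getOrCreateCarbonSession to the builder

val spark = SparkSession.builder()
  .master("local[*]")                            // placeholder master
  .appName("delete-overrun-check")
  .getOrCreateCarbonSession("/tmp/carbon.store") // placeholder store path

// Rows the DELETE is supposed to remove.
val expected = spark.sql(
  """select count(*) from e_carbon.offer_inst_c
    |where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)""".stripMargin)
  .head().getLong(0)

val before = spark.sql("select count(*) from e_carbon.offer_inst_c").head().getLong(0)

spark.sql(
  """delete from e_carbon.offer_inst_c
    |where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)""".stripMargin)

val after = spark.sql("select count(*) from e_carbon.offer_inst_c").head().getLong(0)

// before - after should equal expected; in my case it was far larger,
// consistent with a whole data block being dropped.
println(s"expected=$expected, actually deleted=${before - after}")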
 
Detailed log:
java.lang.Exception: ===outer exception:[[[java.lang.Exception: step2.6 1 times failed, error executing sql:[ delete from e_carbon.offer_inst_c where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)] causeerror:[[[NULL]]] error:[[[java.lang.RuntimeException: Delete data operation is failed: [[java.io.IOException: java.lang.Exception: Invalid tuple id 0/927.37/0-11_batchno0-0-1514560063689
        at org.apache.carbondata.hadoop.CarbonInputFormat.getSplits(CarbonInputFormat.java:351)
        at org.apache.carbondata.hadoop.CarbonInputFormat.getSplits(CarbonInputFormat.java:269)
        at org.apache.carbondata.spark.rdd.CarbonScanRDD.getPartitions(CarbonScanRDD.scala:81)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
        at org.apache.spark.rdd.ShuffledRDD.getDependencies(ShuffledRDD.scala:91)
        at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.dependencies(RDD.scala:237)
        at org.apache.spark.scheduler.DAGScheduler.getShuffleDependencies(DAGScheduler.scala:424)
        at org.apache.spark.scheduler.DAGScheduler.getOrCreateParentStages(DAGScheduler.scala:373)
        at org.apache.spark.scheduler.DAGScheduler.createResultStage(DAGScheduler.scala:360)
        at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:838)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1613)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
        at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
        at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
        at org.apache.spark.sql.execution.command.deleteExecution$.deleteDeltaExecution(IUDCommands.scala:606)
        at org.apache.spark.sql.execution.command.ProjectForDeleteCommand.run(IUDCommands.scala:98)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
        at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
        at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
        at org.apache.spark.sql.Dataset.<init>(Dataset.scala:185)
        at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
        at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:592)
        at cn.ffcs.carbon.clienttouch.ProcessOracle6$.exeSqlWithRetry(ProcessOracle6.scala:564)
        at cn.ffcs.carbon.clienttouch.ProcessOracle6$$anonfun$triggeredAppendBatch$2.apply(ProcessOracle6.scala:281)
        at cn.ffcs.carbon.clienttouch.ProcessOracle6$$anonfun$triggeredAppendBatch$2.apply(ProcessOracle6.scala:277)
        at scala.collection.IndexedSeqO

yixu2001

Re: delete Invalid tuple id datablock lost

sounak
Can you please re-verify this issue in the latest code base?
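A quick way to do that, as a rough sketch (the build command follows the project's standard Maven profiles; the jar and store paths are placeholders for whatever your build produces):

// Build the latest code:  mvn -DskipTests -Pspark-2.1 clean package
// Launch:                 spark-shell --jars <path-to-freshly-built-carbondata-jar>
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.CarbonSession._

val spark = SparkSession.builder().getOrCreateCarbonSession("/tmp/carbon.store") // placeholder store path
spark.sql("delete from e_carbon.offer_inst_c where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)")
// Then compare count(*) on e_carbon.offer_inst_c against the expected number, as in the first message.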

On Tue, Jan 2, 2018 at 2:50 PM, yixu2001 <[hidden email]> wrote:

> dev
>
> Spark 2.1 + CarbonData 1.1.1
>
> When executing the SQL "delete from e_carbon.offer_inst_c where
> offer_inst_id in (select offer_inst_id from cache_offer_inst_U)",
> the following error occurs:
> "Delete data operation is failed: [[java.io.IOException:
> java.lang.Exception: Invalid tuple id 0/927.37/0-11_batchno0-0-1514560063689"
>
> After that, I found that more records were deleted than intended: not only
> the rows matching table cache_offer_inst_U, but also the whole data block
> "0-11_batchno0-0-1514560063689".
> For example, there are 100,000 records in the table "cache_offer_inst_U",
> but it actually deleted 2,000,000 records.
>
> [detailed log snipped]
> ------------------------------
> yixu2001



--
Thanks
Sounak