delete Invalid tuple id datablock lost


yixu2001
dev 
 
Spark 2.1 + CarbonData 1.1.1

When executing the SQL "delete from e_carbon.offer_inst_c where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)",
the following error occurs:
"Delete data operation is failed: [[java.io.IOException: java.lang.Exception: Invalid tuple id 0/927.37/0-11_batchno0-0-1514560063689"

After that, I found that more records were deleted than intended: not only the rows matching table cache_offer_inst_U, but also the whole data block "0-11_batchno0-0-1514560063689".
For example, there are 100,000 records in the table "cache_offer_inst_U", but it actually deleted 2,000,000 records.
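If anyone wants to reproduce this, here is a minimal sketch (not my exact production job; the master URL and store path are placeholders, and it assumes the CarbonSession API of CarbonData 1.1.x). It compares how many rows the DELETE should remove with how many it actually removed:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.CarbonSession._ // adds getOrCreateCarbonSession to the builder

val spark = SparkSession.builder()
  .master("local[*]")                            // placeholder master
  .appName("delete-overrun-check")
  .getOrCreateCarbonSession("/tmp/carbon.store") // placeholder store path

// Rows the DELETE is supposed to remove.
val expected = spark.sql(
  """select count(*) from e_carbon.offer_inst_c
    |where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)""".stripMargin)
  .head().getLong(0)

val before = spark.sql("select count(*) from e_carbon.offer_inst_c").head().getLong(0)

spark.sql(
  """delete from e_carbon.offer_inst_c
    |where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)""".stripMargin)

val after = spark.sql("select count(*) from e_carbon.offer_inst_c").head().getLong(0)

// before - after should equal expected; in my case it was far larger,
// consistent with a whole data block being dropped.
println(s"expected=$expected, actually deleted=${before - after}")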
 
Detailed log:
java.lang.Exception: ===outer exception:[[[java.lang.Exception: step2.6 1 times failed, error executing sql:[ delete from e_carbon.offer_inst_c where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)] causeerror:[[[NULL]]] error:[[[java.lang.RuntimeException: Delete data operation is failed: [[java.io.IOException: java.lang.Exception: Invalid tuple id 0/927.37/0-11_batchno0-0-1514560063689
        at org.apache.carbondata.hadoop.CarbonInputFormat.getSplits(CarbonInputFormat.java:351)
        at org.apache.carbondata.hadoop.CarbonInputFormat.getSplits(CarbonInputFormat.java:269)
        at org.apache.carbondata.spark.rdd.CarbonScanRDD.getPartitions(CarbonScanRDD.scala:81)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
        at org.apache.spark.rdd.ShuffledRDD.getDependencies(ShuffledRDD.scala:91)
        at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:239)
        at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:237)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.dependencies(RDD.scala:237)
        at org.apache.spark.scheduler.DAGScheduler.getShuffleDependencies(DAGScheduler.scala:424)
        at org.apache.spark.scheduler.DAGScheduler.getOrCreateParentStages(DAGScheduler.scala:373)
        at org.apache.spark.scheduler.DAGScheduler.createResultStage(DAGScheduler.scala:360)
        at org.apache.spark.scheduler.DAGScheduler.handleJobSubmitted(DAGScheduler.scala:838)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1613)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
        at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
        at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
        at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
        at org.apache.spark.sql.execution.command.deleteExecution$.deleteDeltaExecution(IUDCommands.scala:606)
        at org.apache.spark.sql.execution.command.ProjectForDeleteCommand.run(IUDCommands.scala:98)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
        at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
        at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
        at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
        at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
        at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
        at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
        at org.apache.spark.sql.Dataset.<init>(Dataset.scala:185)
        at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
        at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:592)
        at cn.ffcs.carbon.clienttouch.ProcessOracle6$.exeSqlWithRetry(ProcessOracle6.scala:564)
        at cn.ffcs.carbon.clienttouch.ProcessOracle6$$anonfun$triggeredAppendBatch$2.apply(ProcessOracle6.scala:281)
        at cn.ffcs.carbon.clienttouch.ProcessOracle6$$anonfun$triggeredAppendBatch$2.apply(ProcessOracle6.scala:277)
        at scala.collection.IndexedSeqO

yixu2001

Re: delete Invalid tuple id datablock lost

sounak
Can you please re-verify this issue in the latest code base?
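A quick way to do that, as a rough sketch (the build command follows the project's standard Maven profiles; the jar and store paths are placeholders for whatever your build produces):

// Build the latest code:  mvn -DskipTests -Pspark-2.1 clean package
// Launch:                 spark-shell --jars <path-to-freshly-built-carbondata-jar>
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.CarbonSession._

val spark = SparkSession.builder().getOrCreateCarbonSession("/tmp/carbon.store") // placeholder store path
spark.sql("delete from e_carbon.offer_inst_c where offer_inst_id in (select offer_inst_id from cache_offer_inst_U)")
// Then compare count(*) on e_carbon.offer_inst_c against the expected number, as in the first message.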

On Tue, Jan 2, 2018 at 2:50 PM, yixu2001 <[hidden email]> wrote:

> dev
>
> Spark 2.1 + CarbonData 1.1.1
>
> When executing the SQL "delete from e_carbon.offer_inst_c where
> offer_inst_id in (select offer_inst_id from cache_offer_inst_U)",
> the following error occurs:
> "Delete data operation is failed: [[java.io.IOException:
> java.lang.Exception: Invalid tuple id 0/927.37/0-11_batchno0-0-1514560063689"
>
> After that, I found that more records were deleted than intended: not only
> the rows matching table cache_offer_inst_U, but also the whole data block
> "0-11_batchno0-0-1514560063689".
> For example, there are 100,000 records in the table "cache_offer_inst_U",
> but it actually deleted 2,000,000 records.
>
> [detailed log snipped]
> ------------------------------
> yixu2001



--
Thanks
Sounak