vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579996496 ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean Review comment: Yeah, good idea ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean Review comment: Yeah, good idea. Edited ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579996841 ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { + showStats = false + } + + override def output: Seq[AttributeReference] = { + if (isDryRun) { + // dry run operation + Seq( + AttributeReference("Size Freed", StringType, nullable = false)(), + AttributeReference("Trash Data Remaining", StringType, nullable = false)()) + } else if (!isDryRun && showStats) { Review comment: Yes, it was a redundant check, removed it. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ydvpankaj99 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-783105607 retest this please ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580001530 ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -87,13 +104,51 @@ object DataTrashManager { } } - private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean): Unit = { + /** + * Checks the size of the segment files as well as datafiles, this method is used before and after + * clean files operation to check how much space is actually freed, during the operation. + */ + def getSizeScreenshot(carbonTable: CarbonTable): Long = { + val metadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) + var size: Long = 0 + val segmentFileLocation = CarbonTablePath.getSegmentFilesLocation(carbonTable.getTablePath) + if (FileFactory.isFileExist(segmentFileLocation)) { + size += FileFactory.getDirectorySize(segmentFileLocation) + } + metadataDetails.foreach(oneLoad => + if (oneLoad.getVisibility.toBoolean) { + size += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, metadataDetails) + } + ) + size + } + + /** + * Method to handle the Clean files dry run operation + */ + def cleanFilesDryRunOperation ( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean, + showStats: Boolean): Seq[Long] = { + // get size freed from the trash folder + val trashFolderSizeStats = checkAndCleanTrashFolder(carbonTable, isForceDelete, + isDryRun = true, showStats) + // get size that will be deleted (MFD, COmpacted, Inprogress segments) + val expiredSegmentsSizeStats = dryRunOnExpiredSegments(carbonTable, isForceDelete, + cleanStaleInProgress) + Seq(trashFolderSizeStats.head + expiredSegmentsSizeStats.head, trashFolderSizeStats(1) + + expiredSegmentsSizeStats(1)) + } + + private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean, + isDryRun: Boolean, showStats: Boolean): Seq[Long] = { 
Review comment: done ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -121,6 +176,78 @@ object DataTrashManager { } } + /** + * Does Clean files dry run operation on the expired segments. Returns the size freed + * during that clean files operation and also shows the remaining trash size, which can be + * cleaned after those segments are expired + */ + private def dryRunOnExpiredSegments( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean): Seq[Long] = { Review comment: done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-783109607 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12444/job/ApacheCarbon_PR_Builder_2.4.5/3734/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-783110117 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12444/job/ApacheCarbonPRBuilder2.3/5499/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-783125434 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12444/job/ApacheCarbonPRBuilder2.3/5500/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ajantha-bhat commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580011816 ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -72,11 +74,26 @@ object DataTrashManager { carbonDeleteSegmentLock = CarbonLockUtil.getLockObject(carbonTable .getAbsoluteTableIdentifier, LockUsage.DELETE_SEGMENT_LOCK, deleteSegmentErrorMsg) // step 1: check and clean trash folder - checkAndCleanTrashFolder(carbonTable, isForceDelete) + // trashFolderSizeStats(0) contains the size that is freed/or can be freed and + // trashFolderSizeStats(1) contains the size of remaining data in the trash folder + val trashFolderSizeStats = checkAndCleanTrashFolder(carbonTable, isForceDelete, + isDryRun = false, showStatistics) // step 2: move stale segments which are not exists in metadata into .Trash moveStaleSegmentsToTrash(carbonTable) // step 3: clean expired segments(MARKED_FOR_DELETE, Compacted, In Progress) - checkAndCleanExpiredSegments(carbonTable, isForceDelete, cleanStaleInProgress, partitionSpecs) + // Since calculating the the size before and after clean files can be a costly operation + // have exposed an option where user can change this behaviour. + if (showStatistics) { + val sizeBeforeCleaning = getSizeScreenshot(carbonTable) Review comment: *snapshot screenshot --> snapshot ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-783127326 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12444/job/ApacheCarbon_PR_Builder_2.4.5/3735/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ajantha-bhat commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580012643 ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { + showStats = false + } + + override def output: Seq[AttributeReference] = { + if (isDryRun) { + // dry run operation + Seq( + AttributeReference("Size Freed", StringType, nullable = false)(), Review comment: better to mention is it in MB/GB ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ajantha-bhat commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580012643 ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { + showStats = false + } + + override def output: Seq[AttributeReference] = { + if (isDryRun) { + // dry run operation + Seq( + AttributeReference("Size Freed", StringType, nullable = false)(), Review comment: better to mention is it in MB/GB like Size Freed in MB : 512 ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ajantha-bhat commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580013092 ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { + showStats = false + } + + override def output: Seq[AttributeReference] = { + if (isDryRun) { + // dry run operation + Seq( + AttributeReference("Size Freed", StringType, nullable = false)(), Review comment: is it combining outside trash and inside trash ? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ajantha-bhat commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580013774 ########## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala ########## @@ -466,6 +485,39 @@ class TestCleanFileCommand extends QueryTest with BeforeAndAfterAll { CarbonCommonConstants.CARBON_CLEAN_FILES_FORCE_ALLOWED_DEFAULT) } + test("Test clean files after delete command") { + CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_CLEAN_FILES_FORCE_ALLOWED, "true") + sql("drop table if exists cleantest") + sql( + """ + | CREATE TABLE cleantest (empname String, designation String, doj Timestamp, + | workgroupcategory int, workgroupcategoryname String, deptno int, deptname String, + | projectcode int, projectjoindate Timestamp, projectenddate Date,attendance int, + | utilization int,salary int, empno int) + | STORED AS carbondata + """.stripMargin) + sql( + s"""LOAD DATA local inpath '$resourcesPath/data.csv' INTO TABLE cleantest OPTIONS + |('DELIMITER'= ',', 'QUOTECHAR'= '"')""".stripMargin) + val table = CarbonEnv.getCarbonTable(None, "cleantest") (sqlContext.sparkSession) + sql("delete from cleantest where deptno='10'") + sql(s"""Delete from table cleantest where segment.id in(0)""") + + var dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('dryrun'='true')").collect() + var cleanFiles = sql(s"CLEAN FILES FOR TABLE cleantest").collect() + assert(cleanFiles(0).get(0) == dryRun(0).get(0)) + dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('dryrun'='true','force'='true')") + .collect() + cleanFiles = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('force'='true')").collect() + assert(cleanFiles(0).get(0) == dryRun(0).get(0)) Review comment: can you add one validation by doing getSegmentSize() and compare it with clean files output. 
If there is some problem in the statistics, the current validations may not catch it ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ajantha-bhat commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580013774 ########## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala ########## @@ -466,6 +485,39 @@ class TestCleanFileCommand extends QueryTest with BeforeAndAfterAll { CarbonCommonConstants.CARBON_CLEAN_FILES_FORCE_ALLOWED_DEFAULT) } + test("Test clean files after delete command") { + CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_CLEAN_FILES_FORCE_ALLOWED, "true") + sql("drop table if exists cleantest") + sql( + """ + | CREATE TABLE cleantest (empname String, designation String, doj Timestamp, + | workgroupcategory int, workgroupcategoryname String, deptno int, deptname String, + | projectcode int, projectjoindate Timestamp, projectenddate Date,attendance int, + | utilization int,salary int, empno int) + | STORED AS carbondata + """.stripMargin) + sql( + s"""LOAD DATA local inpath '$resourcesPath/data.csv' INTO TABLE cleantest OPTIONS + |('DELIMITER'= ',', 'QUOTECHAR'= '"')""".stripMargin) + val table = CarbonEnv.getCarbonTable(None, "cleantest") (sqlContext.sparkSession) + sql("delete from cleantest where deptno='10'") + sql(s"""Delete from table cleantest where segment.id in(0)""") + + var dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('dryrun'='true')").collect() + var cleanFiles = sql(s"CLEAN FILES FOR TABLE cleantest").collect() + assert(cleanFiles(0).get(0) == dryRun(0).get(0)) + dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('dryrun'='true','force'='true')") + .collect() + cleanFiles = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('force'='true')").collect() + assert(cleanFiles(0).get(0) == dryRun(0).get(0)) Review comment: can you add one validation by doing getSegmentSize() and compare it with clean files output ? 
If there is some problem in the statistics, the current validations may not catch it ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ydvpankaj99 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-783135423 retest this please ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580044175 ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { + showStats = false + } + + override def output: Seq[AttributeReference] = { + if (isDryRun) { + // dry run operation + Seq( + AttributeReference("Size Freed", StringType, nullable = false)(), Review comment: I am using convertBytesToReadable before printing it, in that it will add the necessary size value(Kb/Mb etc) in the result itself, so no need to add it in the Attribute Reference. Yes this is both inside and outside trash folder ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580044526 ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -72,11 +74,26 @@ object DataTrashManager { carbonDeleteSegmentLock = CarbonLockUtil.getLockObject(carbonTable .getAbsoluteTableIdentifier, LockUsage.DELETE_SEGMENT_LOCK, deleteSegmentErrorMsg) // step 1: check and clean trash folder - checkAndCleanTrashFolder(carbonTable, isForceDelete) + // trashFolderSizeStats(0) contains the size that is freed/or can be freed and + // trashFolderSizeStats(1) contains the size of remaining data in the trash folder + val trashFolderSizeStats = checkAndCleanTrashFolder(carbonTable, isForceDelete, + isDryRun = false, showStatistics) // step 2: move stale segments which are not exists in metadata into .Trash moveStaleSegmentsToTrash(carbonTable) // step 3: clean expired segments(MARKED_FOR_DELETE, Compacted, In Progress) - checkAndCleanExpiredSegments(carbonTable, isForceDelete, cleanStaleInProgress, partitionSpecs) + // Since calculating the the size before and after clean files can be a costly operation + // have exposed an option where user can change this behaviour. + if (showStatistics) { + val sizeBeforeCleaning = getSizeScreenshot(carbonTable) Review comment: done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r580051926 ########## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala ########## @@ -466,6 +485,39 @@ class TestCleanFileCommand extends QueryTest with BeforeAndAfterAll { CarbonCommonConstants.CARBON_CLEAN_FILES_FORCE_ALLOWED_DEFAULT) } + test("Test clean files after delete command") { + CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_CLEAN_FILES_FORCE_ALLOWED, "true") + sql("drop table if exists cleantest") + sql( + """ + | CREATE TABLE cleantest (empname String, designation String, doj Timestamp, + | workgroupcategory int, workgroupcategoryname String, deptno int, deptname String, + | projectcode int, projectjoindate Timestamp, projectenddate Date,attendance int, + | utilization int,salary int, empno int) + | STORED AS carbondata + """.stripMargin) + sql( + s"""LOAD DATA local inpath '$resourcesPath/data.csv' INTO TABLE cleantest OPTIONS + |('DELIMITER'= ',', 'QUOTECHAR'= '"')""".stripMargin) + val table = CarbonEnv.getCarbonTable(None, "cleantest") (sqlContext.sparkSession) + sql("delete from cleantest where deptno='10'") + sql(s"""Delete from table cleantest where segment.id in(0)""") + + var dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('dryrun'='true')").collect() + var cleanFiles = sql(s"CLEAN FILES FOR TABLE cleantest").collect() + assert(cleanFiles(0).get(0) == dryRun(0).get(0)) + dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('dryrun'='true','force'='true')") + .collect() + cleanFiles = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('force'='true')").collect() + assert(cleanFiles(0).get(0) == dryRun(0).get(0)) Review comment: done added ---------------------------------------------------------------- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-783213702 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12444/job/ApacheCarbonPRBuilder2.3/5501/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-783213957 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12444/job/ApacheCarbon_PR_Builder_2.4.5/3736/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
Free forum by Nabble | Edit this page |