akashrn5 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579051553 ########## File path: docs/clean-files.md ########## @@ -64,4 +64,41 @@ The stale_inprogress option with force option will delete Marked for delete, Com ``` CLEAN FILES FOR TABLE TABLE_NAME options('stale_inprogress'='true', 'force'='true') - ``` \ No newline at end of file + ``` +### DRY RUN OPTION +Clean files also support a dry run option which will let the user know how much space fill we freed +during the actual clean files operation. The dry run operation will not delete any data but will just give +size based statistics on the data which will be cleaned in clean files. Dry run operation will return two columns where the first will +show how much space will be freed by that clean files operation and the second column will show the +remaining stale data(data which can be deleted but has not yet expired as per the ```max.query.execution.time``` and ``` carbon.trash.retention.days``` values +). By default the value of ```dryrun``` option is ```false```. + +Dry Run Operation is supported with four types of commands: + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('dryrun'='true') + ``` + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('force'='true', 'dryrun'='true') + ``` + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('stale_inprogress'='true','dryrun'='true') + ``` + + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('stale_inprogress'='true', 'force'='true','dryrun'='true') + ``` + +**NOTE**: + * Since the dry run operation will calculate size and will access File level API's, the operation can + be a costly and a time consuming operation in case of tables with large number of segments. + * When dry run is true, the statistics option will not matter. + +### SHOW STATISTICS +Clean files operation tells how much size is freed during that operation to the user. By default, the clean files operation +will show the size freed statistics. Since calculating and showing statistics can be a costly operation and reduce the performance of the +clean files operation, the user can disable that option by using ```statistics = false``` in the clean files options. + + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('statistics`='false') Review comment: ```suggestion CLEAN FILES FOR TABLE TABLE_NAME options('statistics'='false') ``` ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -87,13 +104,48 @@ object DataTrashManager { } } - private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean): Unit = { + /** + * Checks the size of the segment files as well as datafiles, this method is used before and after + * clean files operation to check how much space is actually freed, during the operation. + */ + def getSizeScreenshot(carbonTable: CarbonTable): Long = { + val metadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) + var size: Long = FileFactory.getDirectorySize(CarbonTablePath.getSegmentFilesLocation( Review comment: ```suggestion var segmentFilesLocationSize: Long = FileFactory.getDirectorySize(CarbonTablePath.getSegmentFilesLocation( ``` ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +42,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { + showStats = false + } + + override def output: Seq[AttributeReference] = { + if (isDryRun) { + // dry run operation + Seq( + AttributeReference("Size Freed", LongType, nullable = false)(), + AttributeReference("Trash Data Remaining", LongType, nullable = false)()) Review comment: please return the size values in a readable format; if giving them in MB, include the unit in the column header as (MB) ########## File path: docs/clean-files.md ########## @@ -64,4 +64,41 @@ The stale_inprogress option with force option will delete Marked for delete, Com ``` CLEAN FILES FOR TABLE TABLE_NAME options('stale_inprogress'='true', 'force'='true') - ``` \ No newline at end of file + ``` +### DRY RUN OPTION +Clean files also support a dry run option which will let the user know how much space fill we freed Review comment: ```suggestion Clean files also support a dry run option which will let the user know how much space will we freed ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
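The doc comment on getSizeScreenshot above implies a before/after usage pattern. A minimal sketch of that pattern, assuming a `carbonTable` handle is in scope (illustrative only, not the PR's actual call site):

```scala
import org.apache.carbondata.trash.DataTrashManager

// Snapshot the table size before and after cleaning; the difference is the
// space actually freed by the operation (names are illustrative).
val sizeBeforeCleaning = DataTrashManager.getSizeScreenshot(carbonTable)
// ... run the actual clean files operation here ...
val sizeAfterCleaning = DataTrashManager.getSizeScreenshot(carbonTable)
val sizeFreed = sizeBeforeCleaning - sizeAfterCleaning
```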
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579070718 ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +42,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { + showStats = false + } + + override def output: Seq[AttributeReference] = { + if (isDryRun) { + // dry run operation + Seq( + AttributeReference("Size Freed", LongType, nullable = false)(), + AttributeReference("Trash Data Remaining", LongType, nullable = false)()) Review comment: done, using ByteUtil.convertByteToReadable; it will convert the result to a string and add the necessary unit to it, either KB or MB ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
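ByteUtil.convertByteToReadable is named here but not shown in the thread; a minimal sketch of the kind of conversion it performs, as an illustrative stand-in rather than the actual CarbonData implementation:

```scala
// Illustrative stand-in: render a raw byte count with the largest fitting
// unit (B, KB, or MB), which is what the review asks the output to show.
def convertByteToReadable(bytes: Long): String = {
  val kb = 1024L
  val mb = kb * 1024
  if (bytes >= mb) s"${bytes / mb} MB"
  else if (bytes >= kb) s"${bytes / kb} KB"
  else s"$bytes B"
}
```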
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579082607 ########## File path: docs/clean-files.md ########## @@ -64,4 +64,41 @@ The stale_inprogress option with force option will delete Marked for delete, Com ``` CLEAN FILES FOR TABLE TABLE_NAME options('stale_inprogress'='true', 'force'='true') - ``` \ No newline at end of file + ``` +### DRY RUN OPTION +Clean files also support a dry run option which will let the user know how much space fill we freed +during the actual clean files operation. The dry run operation will not delete any data but will just give +size based statistics on the data which will be cleaned in clean files. Dry run operation will return two columns where the first will +show how much space will be freed by that clean files operation and the second column will show the +remaining stale data(data which can be deleted but has not yet expired as per the ```max.query.execution.time``` and ``` carbon.trash.retention.days``` values +). By default the value of ```dryrun``` option is ```false```. + +Dry Run Operation is supported with four types of commands: + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('dryrun'='true') + ``` + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('force'='true', 'dryrun'='true') + ``` + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('stale_inprogress'='true','dryrun'='true') + ``` + + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('stale_inprogress'='true', 'force'='true','dryrun'='true') + ``` + +**NOTE**: + * Since the dry run operation will calculate size and will access File level API's, the operation can + be a costly and a time consuming operation in case of tables with large number of segments. + * When dry run is true, the statistics option will not matter. + +### SHOW STATISTICS +Clean files operation tells how much size is freed during that operation to the user. By default, the clean files operation +will show the size freed statistics. Since calculating and showing statistics can be a costly operation and reduce the performance of the +clean files operation, the user can disable that option by using ```statistics = false``` in the clean files options. + + ``` + CLEAN FILES FOR TABLE TABLE_NAME options('statistics`='false') Review comment: done ########## File path: docs/clean-files.md ########## @@ -64,4 +64,41 @@ The stale_inprogress option with force option will delete Marked for delete, Com ``` CLEAN FILES FOR TABLE TABLE_NAME options('stale_inprogress'='true', 'force'='true') - ``` \ No newline at end of file + ``` +### DRY RUN OPTION +Clean files also support a dry run option which will let the user know how much space fill we freed Review comment: done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579083355 ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -87,13 +104,48 @@ object DataTrashManager { } } - private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean): Unit = { + /** + * Checks the size of the segment files as well as datafiles, this method is used before and after + * clean files operation to check how much space is actually freed, during the operation. + */ + def getSizeScreenshot(carbonTable: CarbonTable): Long = { + val metadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) + var size: Long = FileFactory.getDirectorySize(CarbonTablePath.getSegmentFilesLocation( Review comment: changed logic ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
akashrn5 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579089632 ########## File path: core/src/main/java/org/apache/carbondata/core/util/CleanFilesUtil.java ########## @@ -74,13 +74,19 @@ public static void cleanStaleSegments(CarbonTable carbonTable) // delete the segment file as well FileFactory.deleteFile(CarbonTablePath.getSegmentFilePath(carbonTable.getTablePath(), staleSegmentFile)); + StringBuilder deletedFiles = new StringBuilder(); Review comment: same as above ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1133,17 +1133,23 @@ public static void deleteSegment(String tablePath, Segment segment, List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); + StringBuilder deletedFiles = new StringBuilder(); Review comment: do not use StringBuilder, use Java functional APIs like below `String deletedFiles = files.stream().map(file -> file.getAbsolutePath()).collect(Collectors.joining(","));` try this ########## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ########## @@ -1895,6 +1895,11 @@ private CarbonCommonConstants() { */ public static final String COMMA = ","; + /** + * SINGLE SPACE + */ + public static final String SPACE = " "; Review comment: revert this ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579106425 ########## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ########## @@ -1895,6 +1895,11 @@ private CarbonCommonConstants() { */ public static final String COMMA = ","; + /** + * SINGLE SPACE + */ + public static final String SPACE = " "; Review comment: done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579107479 ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1133,17 +1133,23 @@ public static void deleteSegment(String tablePath, Segment segment, List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); + StringBuilder deletedFiles = new StringBuilder(); Review comment: used string.join with delimiter ########## File path: core/src/main/java/org/apache/carbondata/core/util/CleanFilesUtil.java ########## @@ -74,13 +74,19 @@ public static void cleanStaleSegments(CarbonTable carbonTable) // delete the segment file as well FileFactory.deleteFile(CarbonTablePath.getSegmentFilePath(carbonTable.getTablePath(), staleSegmentFile)); + StringBuilder deletedFiles = new StringBuilder(); Review comment: done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-782051474 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/5095/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-782053003 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/3336/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
akashrn5 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-782063137 LGTM for test, need a couple more reviews to merge ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-782112467 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/5096/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-782114580 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/3337/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
VenuReddy2103 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579243982 ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1133,17 +1133,23 @@ public static void deleteSegment(String tablePath, Segment segment, List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); + List<String> deletedFiles = new ArrayList<>(); Review comment: Instead of building this array list and joining it irrespective of whether logging (info) is enabled or not, probably deferring this string manipulation until the point logging is enabled is better? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
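A minimal sketch of the deferral being suggested, written in Scala for consistency with the other sketches here (the file under review is Java; `logger` and `deletedFiles` are stand-in names):

```scala
import scala.collection.JavaConverters._
import org.apache.log4j.Logger

// Guard the join behind the log level so the list-to-string work is only
// done when the INFO line will actually be emitted.
def logDeletedFiles(logger: Logger, deletedFiles: java.util.List[String]): Unit = {
  if (logger.isInfoEnabled) {
    logger.info("Deleted files: " + deletedFiles.asScala.mkString(","))
  }
}
```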
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-782214356 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12444/job/ApacheCarbon_PR_Builder_2.4.5/3732/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
CarbonDataQA2 commented on pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#issuecomment-782214759 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12444/job/ApacheCarbonPRBuilder2.3/5496/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
VenuReddy2103 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579263541 ########## File path: core/src/main/java/org/apache/carbondata/core/util/TrashUtil.java ########## @@ -150,48 +151,92 @@ public static void copyFilesToTrash(List<String> filesToCopy, /** * The below method deletes timestamp subdirectories in the trash folder which have expired as - * per the user defined retention time + * per the user defined retention time. It return an array where the first element has the size + * freed from the trash folder and the second element has the remaining size in the trash folder */ - public static void deleteExpiredDataFromTrash(String tablePath) { + public static long[] deleteExpiredDataFromTrash(String tablePath, Boolean isDryRun, + Boolean showStats) { CarbonFile trashFolder = FileFactory.getCarbonFile(CarbonTablePath .getTrashFolderPath(tablePath)); + long sizeFreed = 0; + long trashFolderSize = 0; // Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. try { if (trashFolder.isFileExist()) { + if (isDryRun || showStats) { + trashFolderSize = FileFactory.getDirectorySize(trashFolder.getAbsolutePath()); + } CarbonFile[] timestampFolderList = trashFolder.listFiles(); + List<CarbonFile> filesToDelete = new ArrayList<>(); for (CarbonFile timestampFolder : timestampFolderList) { // If the timeStamp at which the timeStamp subdirectory has expired as per the user // defined value, delete the complete timeStamp subdirectory - if (timestampFolder.isDirectory() && isTrashRetentionTimeoutExceeded(Long - .parseLong(timestampFolder.getName()))) { - FileFactory.deleteAllCarbonFilesOfDir(timestampFolder); - LOGGER.info("Timestamp subfolder from the Trash folder deleted: " + timestampFolder + if (isTrashRetentionTimeoutExceeded(Long.parseLong(timestampFolder.getName()))) { + if (timestampFolder.isDirectory()) { Review comment: would `if (timestampFolder.isDirectory())` ever be false? ########## File path: core/src/main/java/org/apache/carbondata/core/util/TrashUtil.java ########## @@ -150,48 +151,92 @@ public static void copyFilesToTrash(List<String> filesToCopy, /** * The below method deletes timestamp subdirectories in the trash folder which have expired as - * per the user defined retention time + * per the user defined retention time. It return an array where the first element has the size + * freed from the trash folder and the second element has the remaining size in the trash folder */ - public static void deleteExpiredDataFromTrash(String tablePath) { + public static long[] deleteExpiredDataFromTrash(String tablePath, Boolean isDryRun, + Boolean showStats) { CarbonFile trashFolder = FileFactory.getCarbonFile(CarbonTablePath .getTrashFolderPath(tablePath)); + long sizeFreed = 0; + long trashFolderSize = 0; // Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. try { if (trashFolder.isFileExist()) { + if (isDryRun || showStats) { + trashFolderSize = FileFactory.getDirectorySize(trashFolder.getAbsolutePath()); + } CarbonFile[] timestampFolderList = trashFolder.listFiles(); + List<CarbonFile> filesToDelete = new ArrayList<>(); for (CarbonFile timestampFolder : timestampFolderList) { // If the timeStamp at which the timeStamp subdirectory has expired as per the user // defined value, delete the complete timeStamp subdirectory - if (timestampFolder.isDirectory() && isTrashRetentionTimeoutExceeded(Long - .parseLong(timestampFolder.getName()))) { - FileFactory.deleteAllCarbonFilesOfDir(timestampFolder); - LOGGER.info("Timestamp subfolder from the Trash folder deleted: " + timestampFolder + if (isTrashRetentionTimeoutExceeded(Long.parseLong(timestampFolder.getName()))) { + if (timestampFolder.isDirectory()) { + // only calculate size in case of dry run or in case clean files is with show stats + if (isDryRun || showStats) { + sizeFreed += FileFactory.getDirectorySize(timestampFolder.getAbsolutePath()); + } + filesToDelete.add(timestampFolder); + } + } + } + if (!isDryRun) { + for (CarbonFile carbonFile : filesToDelete) { + LOGGER.info("Timestamp subfolder from the Trash folder deleted: " + carbonFile .getAbsolutePath()); + FileFactory.deleteAllCarbonFilesOfDir(carbonFile); } } } } catch (IOException e) { LOGGER.error("Error during deleting expired timestamp folder from the trash folder", e); } + return new long[] {sizeFreed, trashFolderSize - sizeFreed}; } /** * The below method deletes all the files and folders in the trash folder of a carbon table. + * Returns an array in which the first element contains the size freed in case of clean files + * operation or size that can be freed in case of dry run and the second element contains the + * remaining size. */ - public static void emptyTrash(String tablePath) { + public static long[] emptyTrash(String tablePath, Boolean isDryRun, Boolean showStats) { CarbonFile trashFolder = FileFactory.getCarbonFile(CarbonTablePath .getTrashFolderPath(tablePath)); // if the trash folder exists delete the contents of the trash folder + long sizeFreed = 0; + long[] sizeStatistics = new long[]{0, 0}; try { if (trashFolder.isFileExist()) { CarbonFile[] carbonFileList = trashFolder.listFiles(); + List<CarbonFile> filesToDelete = new ArrayList<>(); for (CarbonFile carbonFile : carbonFileList) { - FileFactory.deleteAllCarbonFilesOfDir(carbonFile); + //Only calculate size when it is dry run operation or when show statistics is + // true with actual operation + if (isDryRun || showStats) { + sizeFreed += FileFactory.getDirectorySize(carbonFile.getAbsolutePath()); + } + filesToDelete.add(carbonFile); + } + sizeStatistics[0] = sizeFreed; + if (!isDryRun) { + for (CarbonFile carbonFile : filesToDelete) { + FileFactory.deleteAllCarbonFilesOfDir(carbonFile); + } + LOGGER.info("Trash Folder has been emptied for table: " + tablePath); + if (showStats) { + sizeStatistics[1] = FileFactory.getDirectorySize(trashFolder.getAbsolutePath()); + } + } else { + sizeStatistics[1] = FileFactory.getDirectorySize(trashFolder.getAbsolutePath()) - Review comment: do we need sizeStatistics[1] when isDryRun is true and showStats is false? ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -121,6 +176,78 @@ object DataTrashManager { } } + /** + * Does Clean files dry run operation on the expired segments. Returns the size freed + * during that clean files operation and also shows the remaining trash size, which can be + * cleaned after those segments are expired + */ + private def dryRunOnExpiredSegments( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean): Seq[Long] = { + var sizeFreed: Long = 0 + var trashSizeRemaining: Long = 0 + val loadMetadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) + if (SegmentStatusManager.isLoadDeletionRequired(loadMetadataDetails)) { + loadMetadataDetails.foreach { oneLoad => + val segmentFilePath = CarbonTablePath.getSegmentFilePath(carbonTable.getTablePath, + oneLoad.getSegmentFile) + if (DeleteLoadFolders.canDeleteThisLoad(oneLoad, isForceDelete, cleanStaleInProgress)) { + // No need to consider physical data for external segments, only consider metadata. + if (oneLoad.getPath() == null || oneLoad.getPath().equalsIgnoreCase("NA")) { + sizeFreed += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, loadMetadataDetails) + } + sizeFreed += FileFactory.getCarbonFile(segmentFilePath).getSize + } else { + if (SegmentStatusManager.isExpiredSegment(oneLoad, carbonTable + .getAbsoluteTableIdentifier)) { + trashSizeRemaining += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, + loadMetadataDetails) + trashSizeRemaining += FileFactory.getCarbonFile(segmentFilePath).getSize + } + } + } + } + Seq(sizeFreed, trashSizeRemaining) + } + + /** + * calculates the segment size based of a segment + */ + def calculateSegmentSizeForOneLoad( carbonTable: CarbonTable, oneLoad: LoadMetadataDetails, + loadMetadataDetails: Array[LoadMetadataDetails]) : Long = { + var size : Long = 0 + if (oneLoad.getDataSize!= null && !oneLoad.getDataSize.isEmpty) { Review comment: probably a space is missing before `!=`. Same at line 223. ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -87,13 +104,51 @@ object DataTrashManager { } } - private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean): Unit = { + /** + * Checks the size of the segment files as well as datafiles, this method is used before and after + * clean files operation to check how much space is actually freed, during the operation. + */ + def getSizeScreenshot(carbonTable: CarbonTable): Long = { + val metadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) + var size: Long = 0 + val segmentFileLocation = CarbonTablePath.getSegmentFilesLocation(carbonTable.getTablePath) + if (FileFactory.isFileExist(segmentFileLocation)) { + size += FileFactory.getDirectorySize(segmentFileLocation) + } + metadataDetails.foreach(oneLoad => + if (oneLoad.getVisibility.toBoolean) { + size += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, metadataDetails) + } + ) + size + } + + /** + * Method to handle the Clean files dry run operation + */ + def cleanFilesDryRunOperation ( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean, + showStats: Boolean): Seq[Long] = { + // get size freed from the trash folder + val trashFolderSizeStats = checkAndCleanTrashFolder(carbonTable, isForceDelete, + isDryRun = true, showStats) + // get size that will be deleted (MFD, COmpacted, Inprogress segments) + val expiredSegmentsSizeStats = dryRunOnExpiredSegments(carbonTable, isForceDelete, + cleanStaleInProgress) + Seq(trashFolderSizeStats.head + expiredSegmentsSizeStats.head, trashFolderSizeStats(1) + + expiredSegmentsSizeStats(1)) + } + + private def checkAndCleanTrashFolder(carbonTable: CarbonTable, isForceDelete: Boolean, + isDryRun: Boolean, showStats: Boolean): Seq[Long] = { Review comment: Returning an array of 2 (freed size, remaining size). Wouldn't returning a tuple be more meaningful and readable in Scala? ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -121,6 +176,78 @@ object DataTrashManager { } } + /** + * Does Clean files dry run operation on the expired segments. Returns the size freed + * during that clean files operation and also shows the remaining trash size, which can be + * cleaned after those segments are expired + */ + private def dryRunOnExpiredSegments( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean): Seq[Long] = { Review comment: same as above comment regd. return type ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1161,10 +1167,12 @@ private static void deletePhysicalPartition(List<PartitionSpec> partitionSpecs, boolean exists = pathExistsInPartitionSpec(partitionSpecs, location); if (!exists) { FileFactory.deleteAllCarbonFilesOfDir(FileFactory.getCarbonFile(location.toString())); + LOGGER.info("Deleted the mergeindex file: " + location.toString()); Review comment: This could be index or merge index? If so, how about we add `LOGGER.info("Deleting files:");` before the loop and inside the loop just have `LOGGER.info(location.toString());` ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean + if (isInternalCleanCall) { + showStats = false + } + + override def output: Seq[AttributeReference] = { + if (isDryRun) { + // dry run operation + Seq( + AttributeReference("Size Freed", StringType, nullable = false)(), + AttributeReference("Trash Data Remaining", StringType, nullable = false)()) + } else if (!isDryRun && showStats) { Review comment: !isDryRun would always evaluate to true. ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -121,6 +176,78 @@ object DataTrashManager { } } + /** + * Does Clean files dry run operation on the expired segments. Returns the size freed + * during that clean files operation and also shows the remaining trash size, which can be + * cleaned after those segments are expired + */ + private def dryRunOnExpiredSegments( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean): Seq[Long] = { + var sizeFreed: Long = 0 + var trashSizeRemaining: Long = 0 + val loadMetadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) + if (SegmentStatusManager.isLoadDeletionRequired(loadMetadataDetails)) { + loadMetadataDetails.foreach { oneLoad => + val segmentFilePath = CarbonTablePath.getSegmentFilePath(carbonTable.getTablePath, + oneLoad.getSegmentFile) + if (DeleteLoadFolders.canDeleteThisLoad(oneLoad, isForceDelete, cleanStaleInProgress)) { + // No need to consider physical data for external segments, only consider metadata. + if (oneLoad.getPath() == null || oneLoad.getPath().equalsIgnoreCase("NA")) { + sizeFreed += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, loadMetadataDetails) + } + sizeFreed += FileFactory.getCarbonFile(segmentFilePath).getSize + } else { + if (SegmentStatusManager.isExpiredSegment(oneLoad, carbonTable + .getAbsoluteTableIdentifier)) { + trashSizeRemaining += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, + loadMetadataDetails) + trashSizeRemaining += FileFactory.getCarbonFile(segmentFilePath).getSize + } + } + } + } + Seq(sizeFreed, trashSizeRemaining) + } + + /** + * calculates the segment size based of a segment + */ + def calculateSegmentSizeForOneLoad( carbonTable: CarbonTable, oneLoad: LoadMetadataDetails, + loadMetadataDetails: Array[LoadMetadataDetails]) : Long = { + var size : Long = 0 + if (oneLoad.getDataSize!= null && !oneLoad.getDataSize.isEmpty) { Review comment: you would want to use !StringUtils.isEmpty(oneLoad.getDataSize)? Can check similar at other places. ########## File path: integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonCleanFilesCommand.scala ########## @@ -41,6 +43,26 @@ case class CarbonCleanFilesCommand( extends DataCommand { val LOGGER: Logger = LogServiceFactory.getLogService(this.getClass.getCanonicalName) + val isDryRun: Boolean = options.getOrElse("dryrun", "false").toBoolean + var showStats: Boolean = options.getOrElse("statistics", "true").toBoolean Review comment: showStats can still be immutable. You might want to do something like this - `val showStats: Boolean = if (isInternalCleanCall) { false } else { options.getOrElse("statistics", "true").toBoolean }` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
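A minimal sketch of the tuple-returning shape suggested above in place of `Seq[Long]`, with stand-in values; destructuring at the call site names the two statistics instead of indexing them:

```scala
// Return (size freed, size remaining) as a tuple rather than a two-element
// sequence, so callers cannot mix up the positions (values are stand-ins).
def trashFolderStats(): (Long, Long) = {
  val sizeFreed: Long = 10L * 1024 * 1024
  val sizeRemaining: Long = 2L * 1024 * 1024
  (sizeFreed, sizeRemaining)
}

val (freed, remaining) = trashFolderStats()
```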
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579995522 ########## File path: core/src/main/java/org/apache/carbondata/core/util/TrashUtil.java ########## @@ -150,48 +151,92 @@ public static void copyFilesToTrash(List<String> filesToCopy, /** * The below method deletes timestamp subdirectories in the trash folder which have expired as - * per the user defined retention time + * per the user defined retention time. It return an array where the first element has the size + * freed from the trash folder and the second element has the remaining size in the trash folder */ - public static void deleteExpiredDataFromTrash(String tablePath) { + public static long[] deleteExpiredDataFromTrash(String tablePath, Boolean isDryRun, + Boolean showStats) { CarbonFile trashFolder = FileFactory.getCarbonFile(CarbonTablePath .getTrashFolderPath(tablePath)); + long sizeFreed = 0; + long trashFolderSize = 0; // Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. try { if (trashFolder.isFileExist()) { + if (isDryRun || showStats) { + trashFolderSize = FileFactory.getDirectorySize(trashFolder.getAbsolutePath()); + } CarbonFile[] timestampFolderList = trashFolder.listFiles(); + List<CarbonFile> filesToDelete = new ArrayList<>(); for (CarbonFile timestampFolder : timestampFolderList) { // If the timeStamp at which the timeStamp subdirectory has expired as per the user // defined value, delete the complete timeStamp subdirectory - if (timestampFolder.isDirectory() && isTrashRetentionTimeoutExceeded(Long - .parseLong(timestampFolder.getName()))) { - FileFactory.deleteAllCarbonFilesOfDir(timestampFolder); - LOGGER.info("Timestamp subfolder from the Trash folder deleted: " + timestampFolder + if (isTrashRetentionTimeoutExceeded(Long.parseLong(timestampFolder.getName()))) { + if (timestampFolder.isDirectory()) { + // only calculate size in case of dry run or in case clean files is with show stats + if (isDryRun || showStats) { + sizeFreed += FileFactory.getDirectorySize(timestampFolder.getAbsolutePath()); + } + filesToDelete.add(timestampFolder); + } + } + } + if (!isDryRun) { + for (CarbonFile carbonFile : filesToDelete) { + LOGGER.info("Timestamp subfolder from the Trash folder deleted: " + carbonFile .getAbsolutePath()); + FileFactory.deleteAllCarbonFilesOfDir(carbonFile); } } } } catch (IOException e) { LOGGER.error("Error during deleting expired timestamp folder from the trash folder", e); } + return new long[] {sizeFreed, trashFolderSize - sizeFreed}; } /** * The below method deletes all the files and folders in the trash folder of a carbon table. + * Returns an array in which the first element contains the size freed in case of clean files + * operation or size that can be freed in case of dry run and the second element contains the + * remaining size. */ - public static void emptyTrash(String tablePath) { + public static long[] emptyTrash(String tablePath, Boolean isDryRun, Boolean showStats) { CarbonFile trashFolder = FileFactory.getCarbonFile(CarbonTablePath .getTrashFolderPath(tablePath)); // if the trash folder exists delete the contents of the trash folder + long sizeFreed = 0; + long[] sizeStatistics = new long[]{0, 0}; try { if (trashFolder.isFileExist()) { CarbonFile[] carbonFileList = trashFolder.listFiles(); + List<CarbonFile> filesToDelete = new ArrayList<>(); for (CarbonFile carbonFile : carbonFileList) { - FileFactory.deleteAllCarbonFilesOfDir(carbonFile); + //Only calculate size when it is dry run operation or when show statistics is + // true with actual operation + if (isDryRun || showStats) { + sizeFreed += FileFactory.getDirectorySize(carbonFile.getAbsolutePath()); + } + filesToDelete.add(carbonFile); + } + sizeStatistics[0] = sizeFreed; + if (!isDryRun) { + for (CarbonFile carbonFile : filesToDelete) { + FileFactory.deleteAllCarbonFilesOfDir(carbonFile); + } + LOGGER.info("Trash Folder has been emptied for table: " + tablePath); + if (showStats) { + sizeStatistics[1] = FileFactory.getDirectorySize(trashFolder.getAbsolutePath()); + } + } else { + sizeStatistics[1] = FileFactory.getDirectorySize(trashFolder.getAbsolutePath()) - Review comment: when dryRun is true, showStats is not taken into account ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
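The rule under discussion (a dry run always reports sizes, so the statistics flag only matters for a real run) reduces to a small predicate; a sketch with assumed flag names:

```scala
// Sizes must be computed whenever they will be shown: always on a dry run,
// otherwise only when the user asked for statistics.
def shouldComputeSizes(isDryRun: Boolean, showStats: Boolean): Boolean =
  isDryRun || showStats
```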
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579995710 ########## File path: core/src/main/java/org/apache/carbondata/core/util/TrashUtil.java ########## @@ -150,48 +151,92 @@ public static void copyFilesToTrash(List<String> filesToCopy, /** * The below method deletes timestamp subdirectories in the trash folder which have expired as - * per the user defined retention time + * per the user defined retention time. It return an array where the first element has the size + * freed from the trash folder and the second element has the remaining size in the trash folder */ - public static void deleteExpiredDataFromTrash(String tablePath) { + public static long[] deleteExpiredDataFromTrash(String tablePath, Boolean isDryRun, + Boolean showStats) { CarbonFile trashFolder = FileFactory.getCarbonFile(CarbonTablePath .getTrashFolderPath(tablePath)); + long sizeFreed = 0; + long trashFolderSize = 0; // Deleting the timestamp based subdirectories in the trashfolder by the given timestamp. try { if (trashFolder.isFileExist()) { + if (isDryRun || showStats) { + trashFolderSize = FileFactory.getDirectorySize(trashFolder.getAbsolutePath()); + } CarbonFile[] timestampFolderList = trashFolder.listFiles(); + List<CarbonFile> filesToDelete = new ArrayList<>(); for (CarbonFile timestampFolder : timestampFolderList) { // If the timeStamp at which the timeStamp subdirectory has expired as per the user // defined value, delete the complete timeStamp subdirectory - if (timestampFolder.isDirectory() && isTrashRetentionTimeoutExceeded(Long - .parseLong(timestampFolder.getName()))) { - FileFactory.deleteAllCarbonFilesOfDir(timestampFolder); - LOGGER.info("Timestamp subfolder from the Trash folder deleted: " + timestampFolder + if (isTrashRetentionTimeoutExceeded(Long.parseLong(timestampFolder.getName()))) { + if (timestampFolder.isDirectory()) { Review comment: technically it won't; just added a fail-safe in case someone changes something in the future. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
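For reference, the retention check used throughout this diff compares the trash subdirectory's timestamp name against the configured retention window. A minimal sketch with an assumed retention value (the real one comes from the `carbon.trash.retention.days` property):

```scala
import java.util.concurrent.TimeUnit

// Assumed retention window for illustration only.
val trashRetentionDays = 7

// Trash subdirectories are named with their creation time in epoch millis;
// one has expired once the retention window has fully elapsed.
def isTrashRetentionTimeoutExceeded(folderTimestampMillis: Long): Boolean =
  System.currentTimeMillis() - folderTimestampMillis >
    TimeUnit.DAYS.toMillis(trashRetentionDays)
```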
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579995778 ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1161,10 +1167,12 @@ private static void deletePhysicalPartition(List<PartitionSpec> partitionSpecs, boolean exists = pathExistsInPartitionSpec(partitionSpecs, location); if (!exists) { FileFactory.deleteAllCarbonFilesOfDir(FileFactory.getCarbonFile(location.toString())); + LOGGER.info("Deleted the mergeindex file: " + location.toString()); Review comment: done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
vikramahuja1001 commented on a change in pull request #4072: URL: https://github.com/apache/carbondata/pull/4072#discussion_r579996248 ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -121,6 +176,78 @@ object DataTrashManager { } } + /** + * Does Clean files dry run operation on the expired segments. Returns the size freed + * during that clean files operation and also shows the remaining trash size, which can be + * cleaned after those segments are expired + */ + private def dryRunOnExpiredSegments( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean): Seq[Long] = { + var sizeFreed: Long = 0 + var trashSizeRemaining: Long = 0 + val loadMetadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) + if (SegmentStatusManager.isLoadDeletionRequired(loadMetadataDetails)) { + loadMetadataDetails.foreach { oneLoad => + val segmentFilePath = CarbonTablePath.getSegmentFilePath(carbonTable.getTablePath, + oneLoad.getSegmentFile) + if (DeleteLoadFolders.canDeleteThisLoad(oneLoad, isForceDelete, cleanStaleInProgress)) { + // No need to consider physical data for external segments, only consider metadata. + if (oneLoad.getPath() == null || oneLoad.getPath().equalsIgnoreCase("NA")) { + sizeFreed += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, loadMetadataDetails) + } + sizeFreed += FileFactory.getCarbonFile(segmentFilePath).getSize + } else { + if (SegmentStatusManager.isExpiredSegment(oneLoad, carbonTable + .getAbsoluteTableIdentifier)) { + trashSizeRemaining += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, + loadMetadataDetails) + trashSizeRemaining += FileFactory.getCarbonFile(segmentFilePath).getSize + } + } + } + } + Seq(sizeFreed, trashSizeRemaining) + } + + /** + * calculates the segment size based of a segment + */ + def calculateSegmentSizeForOneLoad( carbonTable: CarbonTable, oneLoad: LoadMetadataDetails, + loadMetadataDetails: Array[LoadMetadataDetails]) : Long = { + var size : Long = 0 + if (oneLoad.getDataSize!= null && !oneLoad.getDataSize.isEmpty) { Review comment: Okay ########## File path: integration/spark/src/main/scala/org/apache/carbondata/trash/DataTrashManager.scala ########## @@ -121,6 +176,78 @@ object DataTrashManager { } } + /** + * Does Clean files dry run operation on the expired segments. Returns the size freed + * during that clean files operation and also shows the remaining trash size, which can be + * cleaned after those segments are expired + */ + private def dryRunOnExpiredSegments( + carbonTable: CarbonTable, + isForceDelete: Boolean, + cleanStaleInProgress: Boolean): Seq[Long] = { + var sizeFreed: Long = 0 + var trashSizeRemaining: Long = 0 + val loadMetadataDetails = SegmentStatusManager.readLoadMetadata(carbonTable.getMetadataPath) + if (SegmentStatusManager.isLoadDeletionRequired(loadMetadataDetails)) { + loadMetadataDetails.foreach { oneLoad => + val segmentFilePath = CarbonTablePath.getSegmentFilePath(carbonTable.getTablePath, + oneLoad.getSegmentFile) + if (DeleteLoadFolders.canDeleteThisLoad(oneLoad, isForceDelete, cleanStaleInProgress)) { + // No need to consider physical data for external segments, only consider metadata. + if (oneLoad.getPath() == null || oneLoad.getPath().equalsIgnoreCase("NA")) { + sizeFreed += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, loadMetadataDetails) + } + sizeFreed += FileFactory.getCarbonFile(segmentFilePath).getSize + } else { + if (SegmentStatusManager.isExpiredSegment(oneLoad, carbonTable + .getAbsoluteTableIdentifier)) { + trashSizeRemaining += calculateSegmentSizeForOneLoad(carbonTable, oneLoad, + loadMetadataDetails) + trashSizeRemaining += FileFactory.getCarbonFile(segmentFilePath).getSize + } + } + } + } + Seq(sizeFreed, trashSizeRemaining) + } + + /** + * calculates the segment size based of a segment + */ + def calculateSegmentSizeForOneLoad( carbonTable: CarbonTable, oneLoad: LoadMetadataDetails, + loadMetadataDetails: Array[LoadMetadataDetails]) : Long = { + var size : Long = 0 + if (oneLoad.getDataSize!= null && !oneLoad.getDataSize.isEmpty) { Review comment: yeah ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |