chetandb commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r509154365 ########## File path: integration/spark/src/test/scala/org/apache/carbondata/spark/testsuite/cleanfiles/TestCleanFileCommand.scala ########## @@ -0,0 +1,484 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +package org.apache.carbondata.spark.testsuite.cleanfiles + +import java.io.{File, PrintWriter} +import java.util +import java.util.List + +import org.apache.carbondata.cleanfiles.CleanFilesUtil +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.datastore.filesystem.CarbonFile +import org.apache.carbondata.core.datastore.impl.FileFactory +import org.apache.carbondata.core.util.CarbonUtil +import org.apache.spark.sql.{CarbonEnv, Row} +import org.apache.spark.sql.test.util.QueryTest +import org.scalatest.BeforeAndAfterAll + +import scala.io.Source + +class TestCleanFileCommand extends QueryTest with BeforeAndAfterAll { + + var count = 0 + + test("clean up table and test trash folder with In Progress segments") { + sql("""DROP TABLE IF EXISTS CLEANTEST""") + sql("""DROP TABLE IF EXISTS CLEANTEST1""") + sql( + """ + | CREATE TABLE cleantest (name String, id Int) + | STORED AS carbondata + """.stripMargin) + sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") + sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") + sql(s"""INSERT INTO CLEANTEST SELECT "abc", 1""") + // run a select query before deletion + checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(3))) + + val path = CarbonEnv.getCarbonTable(Some("default"), "cleantest")(sqlContext.sparkSession) + .getTablePath + val tableStatusFilePath = path + CarbonCommonConstants.FILE_SEPARATOR + "Metadata" + + CarbonCommonConstants.FILE_SEPARATOR + "tableStatus" + editTableStatusFile(path) + val trashFolderPath = path + CarbonCommonConstants.FILE_SEPARATOR + + CarbonCommonConstants.CARBON_TRASH_FOLDER_NAME + + assert(!FileFactory.isFileExist(trashFolderPath)) + val dryRun = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('isDryRun'='true')").count() + // dry run shows 3 segments to move to trash + assert(dryRun == 3) + + sql(s"CLEAN FILES FOR TABLE cleantest").show + + checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(0))) + 
assert(FileFactory.isFileExist(trashFolderPath)) + var list = getFileCountInTrashFolder(trashFolderPath) + assert(list == 6) + + val dryRun1 = sql(s"CLEAN FILES FOR TABLE cleantest OPTIONS('isDryRun'='true')").count() + sql(s"CLEAN FILES FOR TABLE cleantest").show + + count = 0 + list = getFileCountInTrashFolder(trashFolderPath) + // no carbondata file is added to the trash + assert(list == 6) + + + val timeStamp = getTimestampFolderName(trashFolderPath) + + // recovering data from trash folder + sql( + """ + | CREATE TABLE cleantest1 (name String, id Int) + | STORED AS carbondata + """.stripMargin) + + val segment0Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '0' + val segment1Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '1' + val segment2Path = trashFolderPath + CarbonCommonConstants.FILE_SEPARATOR + timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + '2' + + sql(s"alter table cleantest1 add segment options('path'='$segment0Path'," + + s"'format'='carbon')").show() + sql(s"alter table cleantest1 add segment options('path'='$segment1Path'," + + s"'format'='carbon')").show() + sql(s"alter table cleantest1 add segment options('path'='$segment2Path'," + + s"'format'='carbon')").show() + sql(s"""INSERT INTO CLEANTEST SELECT * from cleantest1""") + + // test after recovering data from trash + checkAnswer(sql(s"""select count(*) from cleantest"""), + Seq(Row(3))) + + sql(s"CLEAN FILES FOR TABLE cleantest options('force'='true')").show + count = 0 + list = getFileCountInTrashFolder(trashFolderPath) + // no carbondata file is added to the trash + assert(list == 0) + sql("""DROP TABLE IF EXISTS CLEANTEST""") + sql("""DROP TABLE IF EXISTS CLEANTEST1""") + } + + + test("clean up maintable table and test trash folder with SI 
with IN PROGRESS segments") { + + sql("""DROP TABLE IF EXISTS CLEANTEST_WITHSI""") + sql("""DROP TABLE IF EXISTS CLEANTEST1""") + sql( + """ + | CREATE TABLE CLEANTEST_WITHSI (id Int, name String, add String ) + | STORED AS carbondata + """.stripMargin) + sql(s"""INSERT INTO CLEANTEST_WITHSI SELECT 1,"abc","def"""") + sql(s"""INSERT INTO CLEANTEST_WITHSI SELECT 2, "abc","def"""") + sql(s"""INSERT INTO CLEANTEST_WITHSI SELECT 3, "abc","def"""") + + sql(s"""CREATE INDEX SI_CLEANTEST on cleantest_withSI(add) as 'carbondata' """) + + checkAnswer(sql(s"""select count(*) from cleantest_withSI"""), + Seq(Row(3))) + checkAnswer(sql(s"""select count(*) from si_cleantest"""), + Seq(Row(3))) + + val mainTablePath = CarbonEnv.getCarbonTable(Some("default"), "cleantest_withsi")(sqlContext + .sparkSession).getTablePath + editTableStatusFile(mainTablePath) + val mainTableTrashFolderPath = mainTablePath + CarbonCommonConstants.FILE_SEPARATOR + + CarbonCommonConstants.CARBON_TRASH_FOLDER_NAME + + val siTablePath = CarbonEnv.getCarbonTable(Some("default"), "si_cleantest")(sqlContext + .sparkSession).getTablePath + editTableStatusFile(siTablePath) + val siTableTrashFolderPath = siTablePath + CarbonCommonConstants.FILE_SEPARATOR + + CarbonCommonConstants.CARBON_TRASH_FOLDER_NAME + + assert(!FileFactory.isFileExist(mainTableTrashFolderPath)) + assert(!FileFactory.isFileExist(siTableTrashFolderPath)) + + val dryRun = sql(s"CLEAN FILES FOR TABLE cleantest_withsi OPTIONS('isDryRun'='true')").count() + // dry run shows 6 segments to move to trash. 3 for main table, 3 for si table + assert(dryRun == 6) + + sql(s"CLEAN FILES FOR TABLE CLEANTEST_WITHSI").show() + Review comment: Add scenario of drop index(SI) and then clean files on main and SI table. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
chetandb commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r509157185 ########## File path: docs/cleanfiles.md ########## @@ -0,0 +1,78 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to you under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete ,In Progress which are stale and Partial(Segments which are missing from the table status file but their data is present) + segments from the store. + + Clean Files Command + ``` + CLEAN FILES ON TABLE TABLE_NAME Review comment: The test cases in TestCleanFileCommand use the syntax "clean files for table tablename", whereas here it is written as "clean files on table tablename". ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
chetandb commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r509160183 ########## File path: docs/cleanfiles.md ########## @@ -0,0 +1,78 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to you under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + + +## CLEAN FILES + +Clean files command is used to remove the Compacted, Marked For Delete ,In Progress which are stale and Partial(Segments which are missing from the table status file but their data is present) + segments from the store. + + Clean Files Command + ``` + CLEAN FILES ON TABLE TABLE_NAME + ``` + + +### TRASH FOLDER + + Carbondata supports a Trash Folder which is used as a redundant folder where all the unnecessary files and folders are moved to during clean files operation. + This trash folder is mantained inside the table path. It is a hidden folder(.Trash). The segments that are moved to the trash folder are mantained under a timestamp + subfolder(timestamp at which clean files operation is called). This helps the user to list down segments by timestamp. 
By default all the timestamp sub-directory have an expiration + time of (3 days since that timestamp) and it can be configured by the user using the following carbon property + ``` + carbon.trash.expiration.time = "Number of days" + ``` + Once the timestamp subdirectory is expired as per the configured expiration day value, the subdirectory is deleted from the trash folder in the subsequent clean files command. + + + + +### DRY RUN + Support for dry run is provided before the actual clean files operation. This dry run operation will list down all the segments which are going to be manipulated during + the clean files operation. The dry run result will show the current location of the segment(it can be in FACT folder, Partition folder or trash folder) and where that segment + will be moved(to the trash folder or deleted from store) once the actual operation will be called. + + + ``` + CLEAN FILES ON TABLE TABLE_NAME options('dry_run'='true') + ``` + +### FORCE DELETE TRASH +The force option with clean files command deletes all the files and folders from the trash folder. + + ``` + CLEAN FILES ON TABLE TABLE_NAME options('force'='true') + ``` + +### DATA RECOVERY FROM THE TRASH FOLDER + +The segments from can be recovered from the trash folder by creating an external table from the desired segment location Review comment: Change "The segments from" to "The segments" ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
PurujitChaugule commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r509201958 ########## File path: docs/dml-of-carbondata.md ########## @@ -552,3 +553,50 @@ CarbonData DML statements are documented here,which includes: ``` CLEAN FILES FOR TABLE carbon_table ``` + +## CLEAN FILES + + Clean files command is used to remove the Compacted and Marked + For Delete Segments from the store. Carbondata also supports Trash + Folder where all the stale data is moved to after clean files + is called + + There are several types of compaction + + ``` + CLEAN FILES ON TABLE TableName + ``` Review comment: The clean files syntax needs to be changed from "clean files on table tablename" to "clean files for table tablename", since the test cases use the latter form. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-713591443 retest this please ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714329770 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2865/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714331613 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/4620/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714374826 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2866/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714375735 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/4621/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
ydvpankaj99 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714376914 retest this please ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
QiangCai commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r509855538 ########## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ########## @@ -1427,6 +1427,25 @@ private CarbonCommonConstants() { public static final String BITSET_PIPE_LINE_DEFAULT = "true"; + public static final String MICROSECONDS_IN_A_DAY = "86400000"; + + /** + * this is the user defined time(in days), when a specific timestamp subdirectory in + * trash folder will expire + */ + @CarbonProperty + public static final String TRASH_EXPIRATION_TIME = "carbon.trash.expiration.time"; Review comment: how about carbon.trash.expiration.days ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List<PartitionSpec> partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List<PartitionSpec> partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List<PartitionSpec> partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); + List<String> filesToDelete = new ArrayList<>(); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry<String, List<String>> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, 
entry.getKey(), timeStamp + Review comment: how about change method name to TrashUtil.copyFileToTrash? ########## File path: core/src/main/java/org/apache/carbondata/core/util/DeleteLoadFolders.java ########## @@ -138,8 +143,19 @@ public boolean accept(CarbonFile file) { if (filesToBeDeleted.length == 0) { status = true; } else { - for (CarbonFile eachFile : filesToBeDeleted) { + // If the file to be deleted is a carbondata file, index file, index merge file + // or a delta file, copy that file to the trash folder. + if ((eachFile.getName().endsWith(CarbonCommonConstants.FACT_FILE_EXT) || Review comment: better to check by LoadMetadataDetails level, not by file level ########## File path: core/src/main/java/org/apache/carbondata/core/util/DeleteLoadFolders.java ########## @@ -192,11 +208,17 @@ private static boolean checkIfLoadCanBeDeleted(LoadMetadataDetails oneLoad, } private static boolean checkIfLoadCanBeDeletedPhysically(LoadMetadataDetails oneLoad, - boolean isForceDelete) { + boolean isForceDelete, AbsoluteTableIdentifier absoluteTableIdentifier) { // Check if the segment is added externally and path is set then do not delete it if ((SegmentStatus.MARKED_FOR_DELETE == oneLoad.getSegmentStatus() - || SegmentStatus.COMPACTED == oneLoad.getSegmentStatus()) && (oneLoad.getPath() == null + || SegmentStatus.COMPACTED == oneLoad.getSegmentStatus() || SegmentStatus + .INSERT_IN_PROGRESS == oneLoad.getSegmentStatus()) && (oneLoad.getPath() == null Review comment: better to keep insert_in_progress segment at origin place for a period (for example 3 days?) also. after expiration days, move it to trash or delete directly. 
########## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ########## @@ -1427,6 +1427,25 @@ private CarbonCommonConstants() { public static final String BITSET_PIPE_LINE_DEFAULT = "true"; + public static final String MICROSECONDS_IN_A_DAY = "86400000"; Review comment: public static final long MILLIS_SECONDS_IN_A_DAY = TimeUnit.DAYS.toMillis(1); ########## File path: processing/src/main/java/org/apache/carbondata/processing/loading/TableProcessingOperations.java ########## @@ -53,12 +52,14 @@ private static final Logger LOGGER = LogServiceFactory.getLogService(CarbonLoaderUtil.class.getName()); + private static List<CarbonFile> filesInTrashFolder = new ArrayList<CarbonFile>(); + /** * delete folder which metadata no exist in tablestatus * this method don't check tablestatus history. */ public static void deletePartialLoadDataIfExist(CarbonTable carbonTable, Review comment: this method should move to clean files. another pr is also changing it. 
https://github.com/apache/carbondata/pull/3935 @Pickupolddriver ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List<PartitionSpec> partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List<PartitionSpec> partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List<PartitionSpec> partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); + List<String> filesToDelete = new ArrayList<>(); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry<String, List<String>> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); + } else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + Review comment: how about do like this: for normal table: timestamp/Fact/Part0/Segment_# for partitable table: timestamp/partition_folder ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List<PartitionSpec> partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment 
segment, - List<PartitionSpec> partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List<PartitionSpec> partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); + List<String> filesToDelete = new ArrayList<>(); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry<String, List<String>> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { Review comment: better to add a separate moveSegmentToTrash method instead of changing deleteSegment so much; that way the segment status needs to be checked only once, not many times. 
########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List<PartitionSpec> partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List<PartitionSpec> partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List<PartitionSpec> partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); + List<String> filesToDelete = new ArrayList<>(); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry<String, List<String>> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); + } else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + entry.getKey().substring( + tablePath.length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); + } + } + // add the file to the filesToDelete map to delete it after the complete segment + // has been copied. 
+ filesToDelete.add(entry.getKey()); for (String file : entry.getValue()) { String[] deltaFilePaths = updateStatusManager.getDeleteDeltaFilePath(file, segment.getSegmentNo()); for (String deltaFilePath : deltaFilePaths) { - FileFactory.deleteFile(deltaFilePath); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, deltaFilePath, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); + } else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, deltaFilePath, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + deltaFilePath.substring( + tablePath.length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); + } + } + filesToDelete.add(deltaFilePath); + } + // If the file to be deleted is a carbondata file, copy that file to the trash folder. 
+ if (file.endsWith(CarbonCommonConstants.FACT_FILE_EXT) && segmentStatus == + SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, file, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); + } else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, file, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + file.substring(tablePath + .length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); + } } - FileFactory.deleteFile(file); + filesToDelete.add(file); } } - deletePhysicalPartition(partitionSpecs, indexFilesMap, indexOrMergeFiles, tablePath); + LoadMetadataDetails loadMetaDataDetail = new LoadMetadataDetails(); + loadMetaDataDetail.setSegmentStatus(segmentStatus); + loadMetaDataDetail.setLoadName(segment.getSegmentNo()); + deletePhysicalPartition(partitionSpecs, indexFilesMap, indexOrMergeFiles, tablePath, + loadMetaDataDetail, filesToDelete, timeStamp); String segmentFilePath = CarbonTablePath.getSegmentFilePath(tablePath, segment.getSegmentFileName()); // Deletes the physical segment file FileFactory.deleteFile(segmentFilePath); Review comment: do we need to move this segment file to trash? ########## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ########## @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. + */ + public static void moveDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { + String trashFolderPath = carbonTablePath + CarbonCommonConstants.FILE_SEPARATOR + Review comment: how about extract the code to CarbonTablePath.getTrashFolder? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714433660 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2871/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714482013 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/4627/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714670301 Build Success with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2881/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714702324 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/4637/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r510647134 ########## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ########## @@ -1427,6 +1427,25 @@ private CarbonCommonConstants() { public static final String BITSET_PIPE_LINE_DEFAULT = "true"; + public static final String MICROSECONDS_IN_A_DAY = "86400000"; Review comment: done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r510647379 ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List<PartitionSpec> partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List<PartitionSpec> partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List<PartitionSpec> partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); + List<String> filesToDelete = new ArrayList<>(); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry<String, List<String>> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); + } else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + entry.getKey().substring( + tablePath.length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); + } + } + // add the file to the filesToDelete map to delete it after the complete segment + // has been copied. 
+ filesToDelete.add(entry.getKey()); for (String file : entry.getValue()) { String[] deltaFilePaths = updateStatusManager.getDeleteDeltaFilePath(file, segment.getSegmentNo()); for (String deltaFilePath : deltaFilePaths) { - FileFactory.deleteFile(deltaFilePath); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, deltaFilePath, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); + } else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, deltaFilePath, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + deltaFilePath.substring( + tablePath.length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); + } + } + filesToDelete.add(deltaFilePath); + } + // If the file to be deleted is a carbondata file, copy that file to the trash folder. 
+ if (file.endsWith(CarbonCommonConstants.FACT_FILE_EXT) && segmentStatus == + SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, file, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo()); + } else { + TrashUtil.moveDataToTrashFolderByFile(tablePath, file, timeStamp + + CarbonCommonConstants.FILE_SEPARATOR + CarbonCommonConstants.LOAD_FOLDER + segment + .getSegmentNo() + CarbonCommonConstants.FILE_SEPARATOR + file.substring(tablePath + .length() + 1).split(CarbonCommonConstants.FILE_SEPARATOR)[0]); + } } - FileFactory.deleteFile(file); + filesToDelete.add(file); } } - deletePhysicalPartition(partitionSpecs, indexFilesMap, indexOrMergeFiles, tablePath); + LoadMetadataDetails loadMetaDataDetail = new LoadMetadataDetails(); + loadMetaDataDetail.setSegmentStatus(segmentStatus); + loadMetaDataDetail.setLoadName(segment.getSegmentNo()); + deletePhysicalPartition(partitionSpecs, indexFilesMap, indexOrMergeFiles, tablePath, + loadMetaDataDetail, filesToDelete, timeStamp); String segmentFilePath = CarbonTablePath.getSegmentFilePath(tablePath, segment.getSegmentFileName()); // Deletes the physical segment file FileFactory.deleteFile(segmentFilePath); Review comment: no, do not need to move segment file to the trash folder ########## File path: core/src/main/java/org/apache/carbondata/core/util/path/TrashUtil.java ########## @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.core.util.path; + +import java.io.File; +import java.io.IOException; +import java.sql.Timestamp; +import java.util.ArrayList; +import java.util.List; + +import org.apache.carbondata.common.logging.LogServiceFactory; +import org.apache.carbondata.core.constants.CarbonCommonConstants; +import org.apache.carbondata.core.datastore.filesystem.CarbonFile; +import org.apache.carbondata.core.datastore.impl.FileFactory; +import org.apache.carbondata.core.exception.CarbonFileException; +import org.apache.carbondata.core.util.CarbonUtil; + +import org.apache.commons.io.FileUtils; + +import org.apache.log4j.Logger; + +public final class TrashUtil { + + private static final Logger LOGGER = + LogServiceFactory.getLogService(CarbonUtil.class.getName()); + + /** + * The below method copies the complete a file to the trash folder. Provide necessary + * timestamp and the segment number in the suffixToAdd variable, so that the proper folder is + * created in the trash folder. 
+ */ + public static void moveDataToTrashFolderByFile(String carbonTablePath, String pathOfFileToCopy, + String suffixToAdd) { + String trashFolderPath = carbonTablePath + CarbonCommonConstants.FILE_SEPARATOR + Review comment: done ########## File path: core/src/main/java/org/apache/carbondata/core/metadata/SegmentFileStore.java ########## @@ -1105,28 +1109,79 @@ public static void cleanSegments(CarbonTable table, List<PartitionSpec> partitio * @throws IOException */ public static void deleteSegment(String tablePath, Segment segment, - List<PartitionSpec> partitionSpecs, - SegmentUpdateStatusManager updateStatusManager) throws Exception { + List<PartitionSpec> partitionSpecs, SegmentUpdateStatusManager updateStatusManager, + SegmentStatus segmentStatus, Boolean isPartitionTable, String timeStamp) + throws Exception { SegmentFileStore fileStore = new SegmentFileStore(tablePath, segment.getSegmentFileName()); List<String> indexOrMergeFiles = fileStore.readIndexFiles(SegmentStatus.SUCCESS, true, FileFactory.getConfiguration()); + List<String> filesToDelete = new ArrayList<>(); Map<String, List<String>> indexFilesMap = fileStore.getIndexFilesMap(); for (Map.Entry<String, List<String>> entry : indexFilesMap.entrySet()) { - FileFactory.deleteFile(entry.getKey()); + // Move the file to the trash folder in case the segment status is insert in progress + if (segmentStatus == SegmentStatus.INSERT_IN_PROGRESS) { + if (!isPartitionTable) { + TrashUtil.moveDataToTrashFolderByFile(tablePath, entry.getKey(), timeStamp + Review comment: done ########## File path: core/src/main/java/org/apache/carbondata/core/constants/CarbonCommonConstants.java ########## @@ -1427,6 +1427,25 @@ private CarbonCommonConstants() { public static final String BITSET_PIPE_LINE_DEFAULT = "true"; + public static final String MICROSECONDS_IN_A_DAY = "86400000"; + + /** + * this is the user defined time(in days), when a specific timestamp subdirectory in + * trash folder will expire + */ + @CarbonProperty + 
public static final String TRASH_EXPIRATION_TIME = "carbon.trash.expiration.time"; Review comment: done ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
vikramahuja1001 commented on a change in pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#discussion_r510658251 ########## File path: processing/src/main/java/org/apache/carbondata/processing/loading/TableProcessingOperations.java ########## @@ -53,12 +52,14 @@ private static final Logger LOGGER = LogServiceFactory.getLogService(CarbonLoaderUtil.class.getName()); + private static List<CarbonFile> filesInTrashFolder = new ArrayList<CarbonFile>(); + /** * delete folder which metadata no exist in tablestatus * this method don't check tablestatus history. */ public static void deletePartialLoadDataIfExist(CarbonTable carbonTable, Review comment: This method is called from the CarbonCleanFilesCommand class ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714943018 Build Failed with Spark 2.4.5, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.5/2898/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
In reply to this post by GitBox
CarbonDataQA1 commented on pull request #3917: URL: https://github.com/apache/carbondata/pull/3917#issuecomment-714944897 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/4654/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] |
Free forum by Nabble | Edit this page |