kevinjmh commented on a change in pull request #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#discussion_r379266709 ########## File path: processing/src/main/java/org/apache/carbondata/processing/sort/sortdata/SortIntermediateFileMerger.java ########## @@ -72,20 +67,11 @@ public void addFileToMerge(File sortTempFile) { // intermediate merging of sort temp files will be triggered synchronized (lockObject) { procFiles.add(sortTempFile); - } - } - - public void startMergingIfPossible() { - File[] fileList; - if (procFiles.size() >= parameters.getNumberOfIntermediateFileToBeMerged()) { - synchronized (lockObject) { - fileList = procFiles.toArray(new File[procFiles.size()]); - this.procFiles = new ArrayList<File>(); + if (procFiles.size() >= parameters.getNumberOfIntermediateFileToBeMerged()) { + File[] fileList = procFiles.toArray(new File[procFiles.size()]); + this.procFiles = new ArrayList<>(); + startIntermediateMerging(fileList); Review comment: that will cause ClassCastException ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
kevinjmh commented on a change in pull request #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#discussion_r379267683 ########## File path: processing/src/main/java/org/apache/carbondata/processing/sort/sortdata/SortDataRows.java ########## @@ -133,62 +94,74 @@ public void addRow(Object[] row) throws CarbonSortKeyAndGroupByException { if (LOGGER.isDebugEnabled()) { LOGGER.debug("************ Writing to temp file ********** "); } - intermediateFileMerger.startMergingIfPossible(); Object[][] recordHolderListLocal = recordHolderList; - try { - semaphore.acquire(); - dataSorterAndWriterExecutorService.execute(new DataSorterAndWriter(recordHolderListLocal)); - } catch (InterruptedException e) { - LOGGER.error("exception occurred while trying to acquire a semaphore lock: ", e); - throw new CarbonSortKeyAndGroupByException(e); - } + handlePreviousPage(recordHolderListLocal); // create the new holder Array this.recordHolderList = new Object[this.sortBufferSize][]; this.entryCount = 0; } recordHolderList[entryCount++] = row; } - /** - * This method will be used to add new row - * - * @param rowBatch new rowBatch - * @throws CarbonSortKeyAndGroupByException problem while writing - */ public void addRowBatch(Object[][] rowBatch, int size) throws CarbonSortKeyAndGroupByException { // if record holder list size is equal to sort buffer size then it will // sort the list and then write current list data to file - synchronized (addRowsLock) { - int sizeLeft = 0; - if (entryCount + size >= sortBufferSize) { - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("************ Writing to temp file ********** "); - } - intermediateFileMerger.startMergingIfPossible(); - Object[][] recordHolderListLocal = recordHolderList; - sizeLeft = sortBufferSize - entryCount; - if (sizeLeft > 0) { - System.arraycopy(rowBatch, 0, recordHolderListLocal, entryCount, sizeLeft); - } - try { - semaphore.acquire(); - dataSorterAndWriterExecutorService - .execute(new DataSorterAndWriter(recordHolderListLocal)); - } catch (Exception e) { - LOGGER.error( - 
"exception occurred while trying to acquire a semaphore lock: " + e.getMessage(), e); - throw new CarbonSortKeyAndGroupByException(e); - } - // create the new holder Array - this.recordHolderList = new Object[this.sortBufferSize][]; - this.entryCount = 0; - size = size - sizeLeft; - if (size == 0) { - return; - } + int sizeLeft = 0; + if (entryCount + size >= sortBufferSize) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("************ Writing to temp file ********** "); } - System.arraycopy(rowBatch, sizeLeft, recordHolderList, entryCount, size); - entryCount += size; + Object[][] recordHolderListLocal = recordHolderList; + sizeLeft = sortBufferSize - entryCount; + if (sizeLeft > 0) { + System.arraycopy(rowBatch, 0, recordHolderListLocal, entryCount, sizeLeft); + } + handlePreviousPage(recordHolderListLocal); + // create the new holder Array + this.recordHolderList = new Object[this.sortBufferSize][]; + this.entryCount = 0; + size = size - sizeLeft; + if (size == 0) { + return; + } + } + System.arraycopy(rowBatch, sizeLeft, recordHolderList, entryCount, size); + entryCount += size; + } + + /** + * sort and write data + * @param recordHolderArray + */ + private void handlePreviousPage(Object[][] recordHolderArray) + throws CarbonSortKeyAndGroupByException { + try { + long startTime = System.currentTimeMillis(); + if (parameters.getNumberOfNoDictSortColumns() > 0) { + Arrays.sort(recordHolderArray, + new NewRowComparator(parameters.getNoDictionarySortColumn(), + parameters.getNoDictDataType())); + } else { + Arrays.sort(recordHolderArray, + new NewRowComparatorForNormalDims(parameters.getNumberOfSortColumns())); + } + + // create a new file and choose folder randomly every time + String[] tmpFileLocation = parameters.getTempFileLocation(); + String locationChosen = tmpFileLocation[new Random().nextInt(tmpFileLocation.length)]; + File sortTempFile = new File( + locationChosen + File.separator + parameters.getTableName() + + '_' + parameters.getRangeId() + '_' + 
System.nanoTime() + + CarbonCommonConstants.SORT_TEMP_FILE_EXT); + writeDataToFile(recordHolderArray, recordHolderArray.length, sortTempFile); + // add sort temp filename to and arrayList. When the list size reaches 20 then Review comment: ok ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586553206 Build Failed with Spark 2.4.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.4/302/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586553316 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/2006/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586555694 Build Success with Spark 2.4.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.4/303/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586556854 Build Success with Spark 2.4.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.4/304/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586560551 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/2008/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586794541 Build Success with Spark 2.4.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.4/311/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586805122 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/2014/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
kevinjmh commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586837885 retest this please ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586843945 Build Success with Spark 2.4.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.4/314/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-586860718 Build Failed with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/2017/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-587317418 Build Success with Spark 2.4.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbon_PR_Builder_2.4.4/330/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
CarbonDataQA1 commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-587354943 Build Success with Spark 2.3.4, Please check CI http://121.244.95.60:12545/job/ApacheCarbonPRBuilder2.3/2032/ ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
ajantha-bhat commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-588047795 LGTM. @Kevinjmh: can you please test once with huge data (so that it spills to disk)? The current test cases do not have huge data, so most of the local sort happens in memory. Please also test concurrent loading with local sort, since removing the sort folder of one load should not clean up the sort folder of another. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
kevinjmh commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-588093661 @ajantha-bhat thanks for your reminder. I did a local test with about 70GB of input, which spilled a lot to disk, and loading was OK. As for concurrent loading (also tested), every load requests temp locations with a UUID in the code below, so the sort temp folders won't be affected. This PR only changes the temp file names under those folders. https://github.com/apache/carbondata/blob/252c789b362a7920663ce177ed7c5c6f27a1e08e/integration/spark-common/src/main/scala/org/apache/carbondata/spark/util/CommonUtil.scala#L776-L783 By the way, I recommend configuring `yarn.nodemanager.local-dirs` to make use of multiple temp folders when spilling sort temp files to disk; see #2824 ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
asfgit closed pull request #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603 ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
In reply to this post by GitBox
ajantha-bhat commented on issue #3603: [CARBONDATA-3679] Optimize local sort performance
URL: https://github.com/apache/carbondata/pull/3603#issuecomment-588618821 @kevinjmh : OK merged. Thanks for your contribution. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [hidden email] With regards, Apache Git Services |
Free forum by Nabble | Edit this page |