Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2200#discussion_r183210908

--- Diff: datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java ---
+  @Override public void onPageAdded(int blockletId, int pageId, ColumnPage[] pages)
+      throws IOException {
+    col2Ordianl.clear();
+    col2DataType.clear();
+    for (int colId = 0; colId < pages.length; colId++) {
+      String columnName = pages[colId].getColumnSpec().getFieldName().toLowerCase();
+      if (indexedColumns.contains(columnName)) {
--- End diff --

This is not needed. The input `pages` contains only the indexed columns.

---
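The reviewer's point is that the membership check in the mapping loop is redundant when `pages` already holds exactly the indexed columns. A minimal sketch of what the loop could look like under that assumption is shown below; it illustrates the suggestion only, is not the code that was merged, and the helper method name is made up.

// Illustrative sketch inside BloomDataMapWriter: assumes onPageAdded only ever
// receives the indexed columns, as stated in the review comment above.
private void mapIndexedColumns(ColumnPage[] pages) {
  col2Ordianl.clear();
  col2DataType.clear();
  for (int colId = 0; colId < pages.length; colId++) {
    String columnName = pages[colId].getColumnSpec().getFieldName().toLowerCase();
    col2Ordianl.put(columnName, colId);
    col2DataType.put(columnName, pages[colId].getColumnSpec().getSchemaDataType());
  }
}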
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2200#discussion_r183210997

--- Diff: datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java ---
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.carbondata.datamap.bloom;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.ObjectOutputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.carbondata.common.annotations.InterfaceAudience;
+import org.apache.carbondata.core.datamap.DataMapMeta;
+import org.apache.carbondata.core.datamap.Segment;
+import org.apache.carbondata.core.datamap.dev.DataMapWriter;
+import org.apache.carbondata.core.datastore.impl.FileFactory;
+import org.apache.carbondata.core.datastore.page.ColumnPage;
+import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
+import org.apache.carbondata.core.metadata.datatype.DataType;
+import org.apache.carbondata.core.metadata.datatype.DataTypes;
+import org.apache.carbondata.core.util.CarbonUtil;
+
+import com.google.common.hash.BloomFilter;
+import com.google.common.hash.Funnels;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+@InterfaceAudience.Internal
+public class BloomDataMapWriter extends DataMapWriter {
+  /**
+   * suppose one blocklet contains 20 page and all the indexed value is distinct.
+   * later we can make it configurable.
+   */
+  private static final int BLOOM_FILTER_SIZE = 32000 * 20;
+  private String dataMapName;
+  private List<String> indexedColumns;
+  // map column name to ordinal in pages
+  private Map<String, Integer> col2Ordianl;
+  private Map<String, DataType> col2DataType;
+  private String currentBlockId;
+  private int currentBlockletId;
+  private List<String> currentDMFiles;
+  private List<DataOutputStream> currentDataOutStreams;
+  private List<ObjectOutputStream> currentObjectOutStreams;
+  private List<BloomFilter<byte[]>> indexBloomFilters;
+
+  public BloomDataMapWriter(AbsoluteTableIdentifier identifier, DataMapMeta dataMapMeta,
+      Segment segment, String writeDirectoryPath) {
+    super(identifier, segment, writeDirectoryPath);
+    dataMapName = dataMapMeta.getDataMapName();
+    indexedColumns = dataMapMeta.getIndexedColumns();
+    col2Ordianl = new HashMap<String, Integer>(indexedColumns.size());
+    col2DataType = new HashMap<String, DataType>(indexedColumns.size());
+
+    currentDMFiles = new ArrayList<String>(indexedColumns.size());
+    currentDataOutStreams = new ArrayList<DataOutputStream>(indexedColumns.size());
+    currentObjectOutStreams = new ArrayList<ObjectOutputStream>(indexedColumns.size());
+
+    indexBloomFilters = new ArrayList<BloomFilter<byte[]>>(indexedColumns.size());
+  }
+
+  @Override
+  public void onBlockStart(String blockId, long taskId) throws IOException {
+    this.currentBlockId = blockId;
+    this.currentBlockletId = 0;
+    currentDMFiles.clear();
+    currentDataOutStreams.clear();
+    currentObjectOutStreams.clear();
+    initDataMapFile();
+  }
+
+  @Override
+  public void onBlockEnd(String blockId) throws IOException {
+    for (int indexColId = 0; indexColId < indexedColumns.size(); indexColId++) {
+      CarbonUtil.closeStreams(this.currentDataOutStreams.get(indexColId),
+          this.currentObjectOutStreams.get(indexColId));
+      commitFile(this.currentDMFiles.get(indexColId));
+    }
+  }
+
+  @Override public void onBlockletStart(int blockletId) {
+    this.currentBlockletId = blockletId;
+    indexBloomFilters.clear();
+    for (int i = 0; i < indexedColumns.size(); i++) {
+      indexBloomFilters.add(BloomFilter.create(Funnels.byteArrayFunnel(),
+          BLOOM_FILTER_SIZE, 0.00001d));
+    }
+  }
+
+  @Override
+  public void onBlockletEnd(int blockletId) {
+    try {
+      writeBloomDataMapFile();
+    } catch (Exception e) {
+      for (ObjectOutputStream objectOutputStream : currentObjectOutStreams) {
+        CarbonUtil.closeStreams(objectOutputStream);
+      }
+      for (DataOutputStream dataOutputStream : currentDataOutStreams) {
+        CarbonUtil.closeStreams(dataOutputStream);
+      }
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override public void onPageAdded(int blockletId, int pageId, ColumnPage[] pages)
+      throws IOException {
+    col2Ordianl.clear();
+    col2DataType.clear();
+    for (int colId = 0; colId < pages.length; colId++) {
+      String columnName = pages[colId].getColumnSpec().getFieldName().toLowerCase();
+      if (indexedColumns.contains(columnName)) {
+        col2Ordianl.put(columnName, colId);
+        DataType columnType = pages[colId].getColumnSpec().getSchemaDataType();
+        col2DataType.put(columnName, columnType);
+      }
+    }
+
+    // for each row
+    for (int rowId = 0; rowId < pages[0].getPageSize(); rowId++) {
+      // for each indexed column
+      for (int indexColId = 0; indexColId < indexedColumns.size(); indexColId++) {
+        String indexedCol = indexedColumns.get(indexColId);
+        byte[] indexValue;
+        if (DataTypes.STRING == col2DataType.get(indexedCol)
+            || DataTypes.BYTE_ARRAY == col2DataType.get(indexedCol)) {
+          byte[] originValue = (byte[])
+              pages[col2Ordianl.get(indexedCol)].getData(rowId);
+          indexValue = new byte[originValue.length - 2];
+          System.arraycopy(originValue, 2, indexValue, 0, originValue.length - 2);
+        } else {
+          Object originValue = pages[col2Ordianl.get(indexedCol)].getData(rowId);
+          indexValue = CarbonUtil.getValueAsBytes(col2DataType.get(indexedCol), originValue);
+        }
+
+        indexBloomFilters.get(indexColId).put(indexValue);
+      }
+    }
+  }
+
+  private void initDataMapFile() throws IOException {
+    String dataMapDir = genDataMapStorePath(this.writeDirectoryPath, this.dataMapName);
+    for (int indexColId = 0; indexColId < indexedColumns.size(); indexColId++) {
+      String dmFile = dataMapDir + File.separator + this.currentBlockId
+          + '.' + indexedColumns.get(indexColId) + BloomCoarseGrainDataMap.BLOOM_INDEX_SUFFIX;
+      DataOutputStream dataOutStream = null;
+      ObjectOutputStream objectOutStream = null;
+      try {
+        FileFactory.createNewFile(dmFile, FileFactory.getFileType(dmFile));
+        dataOutStream = FileFactory.getDataOutputStream(dmFile,
+            FileFactory.getFileType(dmFile));
+        objectOutStream = new ObjectOutputStream(dataOutStream);
+      } catch (IOException e) {
+        CarbonUtil.closeStreams(objectOutStream, dataOutStream);
+        throw new IOException(e);
+      }
+
+      this.currentDMFiles.add(dmFile);
+      this.currentDataOutStreams.add(dataOutStream);
+      this.currentObjectOutStreams.add(objectOutStream);
+    }
+  }
+
+  private void writeBloomDataMapFile() throws IOException {
+    for (int indexColId = 0; indexColId < indexedColumns.size(); indexColId++) {
+      BloomDMModel model = new BloomDMModel(this.currentBlockId, this.currentBlockletId,
+          indexBloomFilters.get(indexColId));
+      // only in higher version of guava-bloom-filter, it provides readFrom/writeTo interface.
--- End diff --

Why can we not use a higher version of guava? Which component prevents us from introducing a higher version, and which version is it?

---
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2200#discussion_r183211031

--- Diff: datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java ---
+  /**
+   * suppose one blocklet contains 20 page and all the indexed value is distinct.
+   * later we can make it configurable.
+   */
+  private static final int BLOOM_FILTER_SIZE = 32000 * 20;
--- End diff --

Can you make it a DMPROPERTY? Does it control the bloom filter size?

---
Github user jackylk commented on the issue:
https://github.com/apache/carbondata/pull/2200

Thanks for working on this, it is a very good feature.

---
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2200#discussion_r183211712

--- Diff: datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java ---
+  private void writeBloomDataMapFile() throws IOException {
+    for (int indexColId = 0; indexColId < indexedColumns.size(); indexColId++) {
+      BloomDMModel model = new BloomDMModel(this.currentBlockId, this.currentBlockletId,
+          indexBloomFilters.get(indexColId));
+      // only in higher version of guava-bloom-filter, it provides readFrom/writeTo interface.
--- End diff --

As shown above, hadoop-common uses guava 11.0.2. I've checked the readFrom/writeTo interface of the BloomFilter provided by guava; it needs version 24.*. See https://github.com/google/guava/commit/62d17005a48e9b1044f1ed2d5de8905426d75299#diff-223d254389aa08fae7876742f8f07e8c for detailed information. Since hadoop-common relies on this package, I am not going to replace it.

---
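For context, the two serialization paths being weighed here can be sketched as below. This is an illustration only: the class name is invented, BloomDMModel is not reproduced, and no minimum guava version is asserted. It contrasts Java object serialization, which works with the guava version pulled in by hadoop-common because BloomFilter is Serializable, with the writeTo/readFrom methods that newer guava releases expose.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class BloomFilterSerDeSketch {

  // The PR writes a wrapper model with ObjectOutputStream; here the filter itself
  // is serialized directly to keep the sketch self-contained.
  static byte[] writeViaObjectStream(BloomFilter<byte[]> filter) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
      oos.writeObject(filter);
    }
    return bos.toByteArray();
  }

  static BloomFilter<byte[]> readViaObjectStream(byte[] bytes)
      throws IOException, ClassNotFoundException {
    try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
      @SuppressWarnings("unchecked")
      BloomFilter<byte[]> filter = (BloomFilter<byte[]>) ois.readObject();
      return filter;
    }
  }

  // Alternative discussed above: compiles only against a guava release that
  // exposes writeTo/readFrom on BloomFilter.
  static byte[] writeViaGuava(BloomFilter<byte[]> filter) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    filter.writeTo(bos);
    return bos.toByteArray();
  }

  static BloomFilter<byte[]> readViaGuava(byte[] bytes) throws IOException {
    return BloomFilter.readFrom(new ByteArrayInputStream(bytes), Funnels.byteArrayFunnel());
  }
}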
Github user xuchuanyin commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2200#discussion_r183211728

--- Diff: datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java ---
+  /**
+   * suppose one blocklet contains 20 page and all the indexed value is distinct.
+   * later we can make it configurable.
+   */
+  private static final int BLOOM_FILTER_SIZE = 32000 * 20;
--- End diff --

Yes, it is used to control the rate. I'll provide a default value for it.

---
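As background on that create() call: guava's BloomFilter.create takes the expected number of insertions and the target false positive probability, and derives the bit array size from those two values. The snippet below only restates the values already used in the diff (32000 rows per page times 20 pages, fpp 0.00001) and is not part of the PR.

import java.nio.charset.StandardCharsets;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class BloomSizeSketch {
  // Assumption carried over from the code comment above: 20 pages per blocklet,
  // 32000 rows per page, all indexed values distinct.
  private static final int EXPECTED_INSERTIONS = 32000 * 20;
  private static final double FPP = 0.00001d;

  public static void main(String[] args) {
    BloomFilter<byte[]> filter =
        BloomFilter.create(Funnels.byteArrayFunnel(), EXPECTED_INSERTIONS, FPP);
    byte[] value = "some-indexed-value".getBytes(StandardCharsets.UTF_8);
    filter.put(value);
    // mightContain never returns false for an inserted value; for absent values it
    // returns true with probability roughly FPP once the filter is filled.
    System.out.println(filter.mightContain(value));
    System.out.println(filter.expectedFpp());
  }
}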
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2200

Build Success with Spark 2.2.1, Please check CI http://88.99.58.216:8080/job/ApacheCarbonPRBuilder/4100/

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2200

Build Failed with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder1/5280/

---
Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/2200#discussion_r183212032

--- Diff: datamap/bloom/src/main/java/org/apache/carbondata/datamap/bloom/BloomDataMapWriter.java ---
+  private void writeBloomDataMapFile() throws IOException {
+    for (int indexColId = 0; indexColId < indexedColumns.size(); indexColId++) {
+      BloomDMModel model = new BloomDMModel(this.currentBlockId, this.currentBlockletId,
+          indexBloomFilters.get(indexColId));
+      // only in higher version of guava-bloom-filter, it provides readFrom/writeTo interface.
--- End diff --

ok

---
Github user xuchuanyin commented on the issue:
https://github.com/apache/carbondata/pull/2200

@jackylk review comments are fixed. Added a dm_property called 'bloom_size' to configure the size of the bloom filter. Since we can have multiple indexed columns and their cardinalities differ, we can support specifying a corresponding bloom_size per column in the future. Besides, more tests are needed, such as exception handling and datamap profiling for this datamap -- I will do that in the future; this version provides the basic correctness test cases.

---
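To make the new property concrete, below is a rough sketch of how a 'bloom_size' DMPROPERTY with a fallback default could be resolved. The class and method names, the property map accessor, and the validation are hypothetical; they are not the exact code added in this PR.

import java.util.Map;

public class BloomSizePropertySketch {
  private static final String BLOOM_SIZE = "bloom_size";
  // Hypothetical default, reusing the hard-coded value discussed earlier.
  private static final int DEFAULT_BLOOM_SIZE = 32000 * 20;

  static int resolveBloomSize(Map<String, String> dmProperties) {
    String configured = dmProperties.get(BLOOM_SIZE);
    if (configured == null || configured.trim().isEmpty()) {
      return DEFAULT_BLOOM_SIZE;
    }
    int size = Integer.parseInt(configured.trim());
    if (size <= 0) {
      throw new IllegalArgumentException("bloom_size must be a positive integer");
    }
    return size;
  }
}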
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2200

Build Success with Spark 2.2.1, Please check CI http://88.99.58.216:8080/job/ApacheCarbonPRBuilder/4101/

---
Github user CarbonDataQA commented on the issue:
https://github.com/apache/carbondata/pull/2200

Build Success with Spark 2.1.0, Please check CI http://136.243.101.176:8080/job/ApacheCarbonPRBuilder1/5281/

---
Github user jackylk commented on the issue:
https://github.com/apache/carbondata/pull/2200

@xuchuanyin Agree. It needs more enhancement in the future. Thanks for working on this. LGTM

---