use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonFileInputFormat method getSplits.
/**
 * Get the list of blocks/blocklets and convert them into CarbonInputSplits.
 * @param job JobContext with Configuration
 * @return list of CarbonInputSplit
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  CarbonTable carbonTable = getOrCreateCarbonTable(job.getConfiguration());
  if (null == carbonTable) {
    throw new IOException("Missing/Corrupt schema file for table.");
  }
  AbsoluteTableIdentifier identifier = carbonTable.getAbsoluteTableIdentifier();
  // get all valid segments and set them into the configuration
  // check for externalTable segment (Segment_null)
  // process and resolve the expression
  ReadCommittedScope readCommittedScope;
  if (carbonTable.isTransactionalTable()) {
    readCommittedScope = new LatestFilesReadCommittedScope(
        identifier.getTablePath() + "/Fact/Part0/Segment_null/", job.getConfiguration());
  } else {
    readCommittedScope = getReadCommittedScope(job.getConfiguration());
    if (readCommittedScope == null) {
      readCommittedScope =
          new LatestFilesReadCommittedScope(identifier.getTablePath(), job.getConfiguration());
    } else {
      readCommittedScope.setConfiguration(job.getConfiguration());
    }
  }
  // this will be null in case of corrupt schema file.
  IndexFilter filter = getFilterPredicates(job.getConfiguration());
  // if external table segments are found, add them to the list
  List<Segment> externalTableSegments = new ArrayList<>();
  Segment seg;
  if (carbonTable.isTransactionalTable()) {
    // In some cases the SDK writes into the segment path ("Fact/Part0/Segment_null")
    // instead of the table path, and the segment is then named "null". Such a table is
    // treated as a transactional table by default and goes through CarbonFileInputFormat;
    // that scenario is handled here.
    seg = new Segment("null", null, readCommittedScope);
    externalTableSegments.add(seg);
  } else {
    LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
    for (LoadMetadataDetails load : loadMetadataDetails) {
      seg = new Segment(load.getLoadName(), null, readCommittedScope);
      if (fileLists != null) {
        for (Object fileList : fileLists) {
          String timestamp =
              CarbonTablePath.DataFileUtil.getTimeStampFromFileName(fileList.toString());
          if (timestamp.equals(seg.getSegmentNo())) {
            externalTableSegments.add(seg);
            break;
          }
        }
      } else {
        externalTableSegments.add(seg);
      }
    }
  }
  List<InputSplit> splits = new ArrayList<>();
  boolean useBlockIndex = job.getConfiguration().getBoolean("filter_blocks", true);
  // when useBlockIndex is false, splits are built directly from the carbondata files
  // without block-level filtering; it is true in all other scenarios.
  if (filter != null) {
    filter.resolve(false);
  }
  if (useBlockIndex) {
    // do block filtering and get splits
    splits = getSplits(job, filter, externalTableSegments);
  } else {
    List<CarbonFile> carbonFiles;
    if (null != this.fileLists) {
      carbonFiles = getAllCarbonDataFiles(this.fileLists);
    } else {
      carbonFiles = getAllCarbonDataFiles(carbonTable.getTablePath());
    }
    List<String> allDeleteDeltaFiles = getAllDeleteDeltaFiles(carbonTable.getTablePath());
    for (CarbonFile carbonFile : carbonFiles) {
      // Segment id is set to null because the SDK does not write carbondata files with
      // respect to segments, so no specific segment name exists for this load.
      CarbonInputSplit split = new CarbonInputSplit("null", carbonFile.getAbsolutePath(), 0,
          carbonFile.getLength(), carbonFile.getLocations(), FileFormat.COLUMNAR_V3);
      split.setVersion(ColumnarFormatVersion.V3);
      BlockletDetailInfo info = new BlockletDetailInfo();
      split.setDetailInfo(info);
      info.setBlockSize(carbonFile.getLength());
      info.setVersionNumber(split.getVersion().number());
      info.setUseMinMaxForPruning(false);
      if (CollectionUtils.isNotEmpty(allDeleteDeltaFiles)) {
        split.setDeleteDeltaFiles(
            getDeleteDeltaFiles(carbonFile.getAbsolutePath(), allDeleteDeltaFiles));
      }
      splits.add(split);
    }
    splits.sort(Comparator.comparing(o -> ((CarbonInputSplit) o).getFilePath()));
  }
  setAllColumnProjectionIfNotConfigured(job, carbonTable);
  return splits;
}
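For orientation, here is a minimal, hypothetical driver sketch showing how these splits might be requested and inspected from client code. It assumes the configuration already carries the table path (and any filter), set elsewhere through the input format's configuration helpers, and it assumes CarbonFileInputFormat lives in org.apache.carbondata.hadoop.api; on the Carbon classes it only uses methods that appear in the snippets on this page (getSplits, getSegment, getSegmentNo, getFilePath).

// Hypothetical driver sketch, not project code: list the CarbonInputSplits produced
// by CarbonFileInputFormat for an already-configured job.
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.hadoop.api.CarbonFileInputFormat;

public class SplitInspector {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // assumption: the table path (and any filter) has already been set on conf
    Job job = Job.getInstance(conf);
    CarbonFileInputFormat<Object> format = new CarbonFileInputFormat<>();
    List<InputSplit> splits = format.getSplits(job);
    for (InputSplit inputSplit : splits) {
      CarbonInputSplit split = (CarbonInputSplit) inputSplit;
      // print segment number and file path of each pruned split
      System.out.println(split.getSegment().getSegmentNo() + " -> " + split.getFilePath());
    }
  }
}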
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonTableInputFormat method updateLoadMetaDataDetailsToSegments.
public void updateLoadMetaDataDetailsToSegments(List<Segment> validSegments,
    List<org.apache.carbondata.hadoop.CarbonInputSplit> prunedSplits) {
  Map<String, Segment> validSegmentsMap = validSegments.stream()
      .collect(Collectors.toMap(Segment::getSegmentNo, segment -> segment, (e1, e2) -> e1));
  for (CarbonInputSplit split : prunedSplits) {
    Segment segment = split.getSegment();
    if (segment.getLoadMetadataDetails() == null || segment.getReadCommittedScope() == null) {
      if (validSegmentsMap.containsKey(segment.getSegmentNo())) {
        segment.setLoadMetadataDetails(
            validSegmentsMap.get(segment.getSegmentNo()).getLoadMetadataDetails());
        segment.setReadCommittedScope(
            validSegmentsMap.get(segment.getSegmentNo()).getReadCommittedScope());
      }
    }
  }
}
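The Collectors.toMap call above passes the merge function (e1, e2) -> e1 so that, if two valid segments ever report the same segment number, the first one wins instead of toMap throwing on the duplicate key. A self-contained sketch of that pattern, using a stand-in class rather than the CarbonData Segment type:

// Standalone illustration of Collectors.toMap with a "keep first" merge function,
// mirroring how validSegmentsMap is built above. Seg is a hypothetical stand-in.
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class ToMapDemo {
  static class Seg {
    final String segmentNo;
    final String label;
    Seg(String segmentNo, String label) { this.segmentNo = segmentNo; this.label = label; }
    public String toString() { return segmentNo + ":" + label; }
  }

  public static void main(String[] args) {
    List<Seg> segments = Arrays.asList(new Seg("0", "first"), new Seg("1", "only"), new Seg("0", "second"));
    Map<String, Seg> bySegmentNo = segments.stream()
        .collect(Collectors.toMap(s -> s.segmentNo, s -> s, (e1, e2) -> e1));
    // the duplicate segment number "0" keeps the first entry
    System.out.println(bySegmentNo.get("0")); // prints 0:first
  }
}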
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonInputFormat method checkAndAddImplicitExpression.
/**
 * This method will create an implicit expression and set it as the right child of the
 * given expression.
 */
private void checkAndAddImplicitExpression(IndexFilter indexFilter, InputSplit inputSplit,
    boolean hasColumnDrift) {
  if (inputSplit instanceof CarbonMultiBlockSplit) {
    CarbonMultiBlockSplit split = (CarbonMultiBlockSplit) inputSplit;
    List<CarbonInputSplit> splits = split.getAllSplits();
    // iterate over all the splits and create a block to blocklet mapping
    Map<String, Set<Integer>> blockIdToBlockletIdMapping = new HashMap<>();
    for (CarbonInputSplit carbonInputSplit : splits) {
      Set<Integer> validBlockletIds = carbonInputSplit.getValidBlockletIds();
      if (null != validBlockletIds && !validBlockletIds.isEmpty()) {
        String uniqueBlockPath = carbonInputSplit.getFilePath();
        String shortBlockPath = CarbonTablePath.getShortBlockId(
            uniqueBlockPath.substring(uniqueBlockPath.lastIndexOf("/Part") + 1));
        blockIdToBlockletIdMapping.put(shortBlockPath, validBlockletIds);
      }
    }
    if (!blockIdToBlockletIdMapping.isEmpty()) {
      // create implicit expression and set it as the right child
      FilterUtil.createImplicitExpressionAndSetAsRightChild(indexFilter.getExpression(),
          blockIdToBlockletIdMapping);
      // the filter expression will then be processed during QueryModelBuilder.build()
      if (hasColumnDrift) {
        indexFilter.processFilterExpression();
      }
    }
  }
}
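The mapping key is a shortened block path taken from the last "/Part" component of the file path (the additional CarbonTablePath.getShortBlockId normalization is omitted below). A self-contained sketch of that substring step with a made-up path:

// Illustrative only: build the block-to-blocklet mapping key from the portion of the
// path starting at the last "/Part" component, as done above. Path and ids are made up.
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class BlockletMappingDemo {
  public static void main(String[] args) {
    // hypothetical block path and valid blocklet ids
    String uniqueBlockPath = "/store/db/tbl/Fact/Part0/Segment_0/part-0-0_batchno0-0-0-123.carbondata";
    Set<Integer> validBlockletIds = Collections.singleton(2);

    String shortBlockPath = uniqueBlockPath.substring(uniqueBlockPath.lastIndexOf("/Part") + 1);
    Map<String, Set<Integer>> blockIdToBlockletIdMapping = new HashMap<>();
    blockIdToBlockletIdMapping.put(shortBlockPath, validBlockletIds);
    // prints {Part0/Segment_0/part-0-0_batchno0-0-0-123.carbondata=[2]}
    System.out.println(blockIdToBlockletIdMapping);
  }
}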
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonInputSplitTaskInfo method getLocations.
@Override
public String[] getLocations() {
  Set<String> locations = new HashSet<>();
  for (CarbonInputSplit splitInfo : carbonBlockInfoList) {
    try {
      locations.addAll(Arrays.asList(splitInfo.getLocations()));
    } catch (IOException e) {
      throw new RuntimeException("Fail to get location of split: " + splitInfo, e);
    }
  }
  // note: the array produced here is discarded; the returned locations come from maxNoNodes
  locations.toArray(new String[locations.size()]);
  List<String> nodes = CarbonInputSplitTaskInfo.maxNoNodes(carbonBlockInfoList);
  return nodes.toArray(new String[nodes.size()]);
}
use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
the class CarbonInputSplitTaskInfo method maxNoNodes.
/**
 * Finds which node(s) hold the maximum number of blocks.
 */
public static List<String> maxNoNodes(List<CarbonInputSplit> splitList) {
  boolean useIndex = true;
  Integer maxOccurrence = 0;
  String maxNode = null;
  Map<String, Integer> nodeAndOccurrenceMapping = new TreeMap<>();
  // populate the map of node and number of occurrences of that node.
  for (CarbonInputSplit split : splitList) {
    try {
      for (String node : split.getLocations()) {
        nodeAndOccurrenceMapping.merge(node, 1, Integer::sum);
      }
    } catch (IOException e) {
      throw new RuntimeException("Fail to get location of split: " + split, e);
    }
  }
  Integer previousValueOccurrence = null;
  // check which node occurs the maximum number of times.
  for (Map.Entry<String, Integer> entry : nodeAndOccurrenceMapping.entrySet()) {
    // finding the maximum node.
    if (entry.getValue() > maxOccurrence) {
      maxOccurrence = entry.getValue();
      maxNode = entry.getKey();
    }
    // first time scenario: initialize the previous value.
    if (null == previousValueOccurrence) {
      previousValueOccurrence = entry.getValue();
    } else {
      // if counts differ, return only the max node instead of the complete list.
      if (!Objects.equals(previousValueOccurrence, entry.getValue())) {
        useIndex = false;
      }
    }
  }
  // if all the nodes have an equal occurrence count, return the complete key set.
  if (useIndex) {
    return new ArrayList<>(nodeAndOccurrenceMapping.keySet());
  }
  // otherwise return the node with the maximum occurrence count.
  List<String> node = new ArrayList<>(1);
  node.add(maxNode);
  return node;
}
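To make the counting and tie-breaking behaviour easy to try out without constructing CarbonInputSplit instances, here is a simplified, self-contained sketch that operates on plain host-name arrays; it mirrors the logic above but is not the project code.

// Simplified sketch of the node-occurrence counting above, using plain host names
// instead of CarbonInputSplit locations.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;

public class MaxNodesDemo {
  static List<String> maxNoNodes(List<String[]> locationsPerSplit) {
    boolean allEqual = true;
    int maxOccurrence = 0;
    String maxNode = null;
    Map<String, Integer> occurrences = new TreeMap<>();
    for (String[] locations : locationsPerSplit) {
      for (String node : locations) {
        occurrences.merge(node, 1, Integer::sum); // count every occurrence of the node
      }
    }
    Integer previous = null;
    for (Map.Entry<String, Integer> entry : occurrences.entrySet()) {
      if (entry.getValue() > maxOccurrence) {
        maxOccurrence = entry.getValue();
        maxNode = entry.getKey();
      }
      if (previous == null) {
        previous = entry.getValue();
      } else if (!Objects.equals(previous, entry.getValue())) {
        allEqual = false;
      }
    }
    if (allEqual) {
      return new ArrayList<>(occurrences.keySet()); // all tied: return every node
    }
    return new ArrayList<>(Arrays.asList(maxNode)); // otherwise the single max node
  }

  public static void main(String[] args) {
    // node2 hosts two blocks, node1 and node3 one each -> prints [node2]
    System.out.println(maxNoNodes(Arrays.asList(
        new String[] {"node1", "node2"}, new String[] {"node2", "node3"})));
    // every node hosts exactly one block -> prints [node1, node2]
    System.out.println(maxNoNodes(Arrays.asList(
        new String[] {"node1"}, new String[] {"node2"})));
  }
}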