Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
The class CarbonTableInputFormat, method getSplitsOfStreaming.
/**
 * Use the file list in the .carbonindex file to get the splits of streaming segments.
 */
public List<InputSplit> getSplitsOfStreaming(JobContext job, List<Segment> streamSegments,
    CarbonTable carbonTable, FilterResolverIntf filterResolverIntf) throws IOException {
  List<InputSplit> splits = new ArrayList<>();
  if (streamSegments != null && !streamSegments.isEmpty()) {
    numStreamSegments = streamSegments.size();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // if no resolved filter was passed in, deserialize the IndexFilter from the
    // job configuration and resolve it against the table
    if (filterResolverIntf == null) {
      if (carbonTable != null) {
        IndexFilter filter = getFilterPredicates(job.getConfiguration());
        if (filter != null) {
          filter.processFilterExpression();
          filterResolverIntf = filter.getResolver();
        }
      }
    }
    StreamPruner streamPruner = new StreamPruner(carbonTable);
    streamPruner.init(filterResolverIntf);
    List<StreamFile> streamFiles = streamPruner.prune(streamSegments);
    // record the hit information of the streaming files
    this.hitStreamFiles = streamFiles.size();
    this.numStreamFiles = streamPruner.getTotalFileNums();
    for (StreamFile streamFile : streamFiles) {
      Path path = new Path(streamFile.getFilePath());
      long length = streamFile.getFileSize();
      if (length != 0) {
        BlockLocation[] blkLocations;
        FileSystem fs = FileFactory.getFileSystem(path);
        FileStatus file = fs.getFileStatus(path);
        blkLocations = fs.getFileBlockLocations(path, 0, length);
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);
        long bytesRemaining = length;
        // allow 10% slop so the tail of the file does not become a very small split
        while (((double) bytesRemaining) / splitSize > 1.1) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(makeSplit(streamFile.getSegmentNo(), streamFile.getFilePath(),
              length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(),
              blkLocations[blkIndex].getCachedHosts(), FileFormat.ROW_V1));
          bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(makeSplit(streamFile.getSegmentNo(), streamFile.getFilePath(),
              length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(),
              blkLocations[blkIndex].getCachedHosts(), FileFormat.ROW_V1));
        }
      }
    }
  }
  return splits;
}
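For intuition, here is a minimal, self-contained sketch of the split-size arithmetic used in the loop above. The 300 MB file length and 128 MB block size are hypothetical, and computeSplitSize is reproduced following Hadoop FileInputFormat's max(minSize, min(maxSize, blockSize)) convention:

import java.util.ArrayList;
import java.util.List;

public class SplitSizeSketch {
  // mirrors Hadoop FileInputFormat#computeSplitSize
  static long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
  }

  public static void main(String[] args) {
    long length = 300L << 20;                     // hypothetical 300 MB stream file
    long splitSize = computeSplitSize(128L << 20, // hypothetical 128 MB block size
        1L, Long.MAX_VALUE);
    long bytesRemaining = length;
    List<long[]> splits = new ArrayList<>();      // {offset, length} pairs
    // same 10% slop rule as above: stop early so the tail is folded into one
    // final split instead of producing a tiny trailing split
    while (((double) bytesRemaining) / splitSize > 1.1) {
      splits.add(new long[] { length - bytesRemaining, splitSize });
      bytesRemaining -= splitSize;
    }
    if (bytesRemaining != 0) {
      splits.add(new long[] { length - bytesRemaining, bytesRemaining });
    }
    // prints three splits of 128 MB, 128 MB, and 44 MB
    for (long[] s : splits) {
      System.out.println("offset=" + s[0] + " length=" + s[1]);
    }
  }
}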
Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
The class CarbonTableInputFormat, method getSplits.
/**
 * Get the list of blocks/blocklets and convert them to CarbonInputSplits.
 * @param job JobContext with Configuration
 * @return list of CarbonInputSplit
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  carbonTable = getOrCreateCarbonTable(job.getConfiguration());
  if (null == carbonTable) {
    throw new IOException("Missing/Corrupt schema file for table.");
  }
  // global dictionary is not supported since 2.0
  if (carbonTable.getTableInfo().getFactTable().getTableProperties()
      .containsKey(CarbonCommonConstants.DICTIONARY_INCLUDE)) {
    DeprecatedFeatureException.globalDictNotSupported();
  }
  List<InputSplit> splits = new LinkedList<>();
  if (CarbonProperties.isQueryStageInputEnabled()) {
    // collect input splits from stage files so that they can be included in the query
    try {
      List<InputSplit> stageInputSplits =
          StageInputCollector.createInputSplits(carbonTable, job.getConfiguration());
      splits.addAll(stageInputSplits);
    } catch (ExecutionException | InterruptedException e) {
      LOG.error("Failed to create input splits from stage files", e);
      throw new IOException(e);
    }
  }
  this.readCommittedScope = getReadCommitted(job, carbonTable.getAbsoluteTableIdentifier());
  LoadMetadataDetails[] loadMetadataDetails = readCommittedScope.getSegmentList();
  String updateDeltaVersion = job.getConfiguration().get(UPDATE_DELTA_VERSION);
  SegmentUpdateStatusManager updateStatusManager;
  if (updateDeltaVersion != null) {
    updateStatusManager =
        new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails, updateDeltaVersion);
  } else {
    updateStatusManager = new SegmentUpdateStatusManager(carbonTable, loadMetadataDetails);
  }
  List<String> invalidSegmentIds = new ArrayList<>();
  List<Segment> streamSegments = null;
  // get all valid segments and set them into the configuration
  SegmentStatusManager segmentStatusManager = new SegmentStatusManager(
      carbonTable.getAbsoluteTableIdentifier(), readCommittedScope.getConfiguration());
  SegmentStatusManager.ValidAndInvalidSegmentsInfo segments = segmentStatusManager
      .getValidAndInvalidSegments(carbonTable.isMV(), loadMetadataDetails, this.readCommittedScope);
  if (getValidateSegmentsToAccess(job.getConfiguration())) {
    List<Segment> validSegments = segments.getValidSegments();
    streamSegments = segments.getStreamSegments();
    streamSegments = getFilteredSegment(job, streamSegments, true, readCommittedScope);
    if (validSegments.size() == 0) {
      // no batch segments: the three-argument overload of getSplitsOfStreaming
      // delegates to the method above with a null filter resolver
      splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
      return splits;
    }
    List<Segment> filteredSegmentToAccess =
        getFilteredSegment(job, segments.getValidSegments(), true, readCommittedScope);
    if (filteredSegmentToAccess.size() == 0) {
      splits.addAll(getSplitsOfStreaming(job, streamSegments, carbonTable));
      return splits;
    } else {
      setSegmentsToAccess(job.getConfiguration(), filteredSegmentToAccess);
    }
    // remove entries from the segment index if there are invalid segments
    for (Segment segment : segments.getInvalidSegments()) {
      invalidSegmentIds.add(segment.getSegmentNo());
    }
    if (invalidSegmentIds.size() > 0) {
      IndexStoreManager.getInstance().clearInvalidSegments(
          getOrCreateCarbonTable(job.getConfiguration()), invalidSegmentIds);
    }
  }
  List<Segment> validAndInProgressSegments = new ArrayList<>(segments.getValidSegments());
  // also add in-progress segments, because a Secondary Index table load
  // reads data from the in-progress table
  validAndInProgressSegments.addAll(segments.getListOfInProgressSegments());
  List<Segment> segmentToAccess =
      getFilteredSegment(job, validAndInProgressSegments, false, readCommittedScope);
  String segmentFileName = job.getConfiguration().get(CarbonCommonConstants.CURRENT_SEGMENTFILE);
  if (segmentFileName != null) {
    // each segment has only one segment file ("current.segment")
    segmentToAccess.get(0).setSegmentFileName(segmentFileName + CarbonTablePath.SEGMENT_EXT);
  }
  // process and resolve the filter expression
  IndexFilter indexFilter = getFilterPredicates(job.getConfiguration());
  if (indexFilter != null) {
    indexFilter.resolve(false);
  }
  // do block filtering and get splits
  List<InputSplit> batchSplits = getSplits(job, indexFilter, segmentToAccess,
      updateStatusManager, segments.getInvalidSegments());
  splits.addAll(batchSplits);
  // add all splits of streaming
  List<InputSplit> splitsOfStreaming = getSplitsOfStreaming(job, streamSegments, carbonTable);
  if (!splitsOfStreaming.isEmpty()) {
    splits.addAll(splitsOfStreaming);
  }
  return splits;
}
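A minimal sketch of driving getSplits from client code, using the same configuration calls the tests below use; the database name, table name, and table path here are placeholders:

// Hypothetical driver; "default", "sales", and the path are placeholders.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.getConfiguration().set("query.id", UUID.randomUUID().toString());
CarbonTableInputFormat.setDatabaseName(job.getConfiguration(), "default");
CarbonTableInputFormat.setTableName(job.getConfiguration(), "sales");
FileInputFormat.addInputPath(job, new Path("/warehouse/default/sales"));
List<InputSplit> splits = new CarbonTableInputFormat<>().getSplits(job);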
Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
The class CarbonInputFormat, method getFilterPredicates.
public IndexFilter getFilterPredicates(Configuration configuration) {
  try {
    String filterExprString = configuration.get(FILTER_PREDICATE);
    if (filterExprString == null) {
      return null;
    }
    IndexFilter filter =
        (IndexFilter) ObjectSerializationUtil.convertStringToObject(filterExprString);
    if (filter != null) {
      // bind the deserialized filter to the current table instance
      CarbonTable carbonTable = getOrCreateCarbonTable(configuration);
      filter.setTable(carbonTable);
    }
    return filter;
  } catch (IOException e) {
    throw new RuntimeException("Error while reading filter expression", e);
  }
}
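The write side of this round trip is the setFilterPredicates setter used in the tests below: the filter is serialized into the configuration under the FILTER_PREDICATE key and deserialized here. A sketch, assuming job, inputFormat, and carbonTable are already in scope:

// serialize an equality filter into the job configuration ...
Expression expr = new EqualToExpression(
    new ColumnExpression("country", DataTypes.STRING),
    new LiteralExpression("china", DataTypes.STRING));
CarbonTableInputFormat.setFilterPredicates(job.getConfiguration(),
    new IndexFilter(carbonTable, expr));
// ... and read it back; getFilterPredicates re-binds the table reference
IndexFilter restored = inputFormat.getFilterPredicates(job.getConfiguration());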
Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
The class CarbonTableInputFormatTest, method runJob.
private void runJob(String outPath, CarbonProjection projection, Expression filter)
    throws Exception {
  Configuration configuration = new Configuration();
  configuration.set("mapreduce.cluster.local.dir", new File(outPath + "1").getCanonicalPath());
  Job job = Job.getInstance(configuration);
  job.setJarByClass(CarbonTableInputFormatTest.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setMapperClass(Map.class);
  job.setInputFormatClass(CarbonTableInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  // 'creator' and 'loadModel' are fixtures initialized elsewhere in the test class
  AbsoluteTableIdentifier abs = creator.getAbsoluteTableIdentifier();
  if (projection != null) {
    CarbonTableInputFormat.setColumnProjection(job.getConfiguration(), projection);
  }
  if (filter != null) {
    CarbonTableInputFormat.setFilterPredicates(job.getConfiguration(),
        new IndexFilter(loadModel.getCarbonDataLoadSchema().getCarbonTable(), filter));
  }
  CarbonTableInputFormat.setDatabaseName(job.getConfiguration(),
      abs.getCarbonTableIdentifier().getDatabaseName());
  CarbonTableInputFormat.setTableName(job.getConfiguration(),
      abs.getCarbonTableIdentifier().getTableName());
  FileInputFormat.addInputPath(job, new Path(abs.getTablePath()));
  CarbonUtil.deleteFoldersAndFiles(new File(outPath + "1"));
  FileOutputFormat.setOutputPath(job, new Path(outPath + "1"));
  job.getConfiguration().set("outpath", outPath);
  job.getConfiguration().set("query.id", String.valueOf(System.nanoTime()));
  boolean status = job.waitForCompletion(true);
}
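The Map mapper class registered on the job is not part of this excerpt. A plausible minimal shape, assuming CarbonTableInputFormat's record reader delivers a Void key and an Object[] row, and matching the Text/IntWritable output classes set above:

// Hypothetical nested mapper inside CarbonTableInputFormatTest; the real Map
// class is not shown in this excerpt. Assumes the Carbon record reader yields
// a Void key and an Object[] row.
public static class Map extends Mapper<Void, Object[], Text, IntWritable> {
  @Override
  protected void map(Void key, Object[] row, Context context)
      throws IOException, InterruptedException {
    // emit the first projected column with a count of 1
    context.write(new Text(String.valueOf(row[0])), new IntWritable(1));
  }
}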
Use of org.apache.carbondata.core.index.IndexFilter in project carbondata by apache.
The class CarbonTableInputFormatTest, method testGetFilteredSplits.
@Test
public void testGetFilteredSplits() throws Exception {
  CarbonTableInputFormat carbonInputFormat = new CarbonTableInputFormat();
  JobConf jobConf = new JobConf(new Configuration());
  Job job = Job.getInstance(jobConf);
  job.getConfiguration().set("query.id", UUID.randomUUID().toString());
  String tblPath = creator.getAbsoluteTableIdentifier().getTablePath();
  FileInputFormat.addInputPath(job, new Path(tblPath));
  CarbonTableInputFormat.setDatabaseName(job.getConfiguration(),
      creator.getAbsoluteTableIdentifier().getDatabaseName());
  CarbonTableInputFormat.setTableName(job.getConfiguration(),
      creator.getAbsoluteTableIdentifier().getTableName());
  Expression expression = new EqualToExpression(
      new ColumnExpression("country", DataTypes.STRING),
      new LiteralExpression("china", DataTypes.STRING));
  CarbonTableInputFormat.setFilterPredicates(job.getConfiguration(),
      new IndexFilter(loadModel.getCarbonDataLoadSchema().getCarbonTable(), expression));
  List<InputSplit> splits = carbonInputFormat.getSplits(job);
  Assert.assertTrue(splits != null);
  Assert.assertTrue(!splits.isEmpty());
}
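For comparison, the runJob helper above can exercise the same filter together with a column projection; a sketch with a placeholder output directory:

// Placeholder output path; the projection and filter mirror the test above.
CarbonProjection projection = new CarbonProjection();
projection.addColumn("country");
Expression filter = new EqualToExpression(
    new ColumnExpression("country", DataTypes.STRING),
    new LiteralExpression("china", DataTypes.STRING));
runJob("/tmp/carbon_output", projection, filter);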