use of org.apache.jena.riot.RDFParserBuilder in project jena by apache.
the class AbstractBlockBasedNodeTupleReader method initialize.
/**
 * Prepares this reader to parse the portion of a file described by the given split and starts
 * the background parser thread.
 * <p>
 * Compressed files are only accepted when the split covers the whole file. For uncompressed
 * files the reader either consumes the stream to its end (when the split ends at the end of the
 * file) or is limited to exactly {@code split.getLength()} bytes.
 *
 * @param genericSplit the split to read; must be a {@code FileSplit}
 * @param context the task attempt context supplying the job configuration
 * @throws IOException if the split is not a {@code FileSplit}, if a compressed file is presented
 *         as anything other than a single whole-file split, or if the file cannot be inspected,
 *         opened or positioned
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit)) {
        throw new IOException("This record reader only supports FileSplit inputs");
    }
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples) {
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    }

    // Figure out what portion of the file to read
    start = split.getStart();
    long end = start + split.getLength();
    final Path file = split.getPath();
    // Resolve the file system once and reuse it for both the status lookup and the open
    FileSystem fs = file.getFileSystem(config);
    long totalLength = fs.getFileStatus(file).getLen();
    boolean readToEnd = end == totalLength;
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", start, split.getLength(), totalLength));
    this.length = split.getLength();

    final boolean compressed = this.compressionCodecs != null;
    if (compressed) {
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and re-verify readToEnd so we can abort correctly if ever
        // given a partial split of a compressed file
        end++;
        readToEnd = end == totalLength;
        // Validate BEFORE opening the file so a rejected split never leaves an open stream behind
        if (start > 0 || !readToEnd) {
            throw new IOException("This record reader can only be used with compressed input where the split is a whole file");
        }
    }

    // Open the file and prepare the input stream
    FSDataInputStream fileIn = fs.open(file);
    try {
        if (start > 0) {
            fileIn.seek(start);
        }
        if (compressed) {
            // Compressed input
            input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
        } else if (readToEnd) {
            input = new TrackedInputStream(fileIn);
        } else {
            // Need to limit the portion of the file we are reading
            input = new BlockInputStream(fileIn, split.getLength());
        }
    } catch (IOException | RuntimeException e) {
        // Wrapping failed, so this.input was not assigned by this call; close the raw stream
        // here, otherwise it would stay open.
        try {
            fileIn.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    // Daemon thread: it must not keep the JVM alive on its own
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
use of org.apache.jena.riot.RDFParserBuilder in project jena by apache.
the class TestIRIxRIOT method testLang.
/**
 * Runs a parse test for {@code iri} with the language forced to {@code lang} and the given base.
 * The strict and checking flags are only applied to the parser builder when a value is present.
 */
private static void testLang(String iri, Lang lang, String base, Optional<Boolean> strict, Optional<Boolean> checking, int numErrors, int numWarnings) {
    final InputStream source = generateSource(iri);
    final RDFParserBuilder parserBuilder = RDFParser.source(source).forceLang(lang);
    parserBuilder.base(base);
    // Absent flags leave the builder's own settings untouched
    strict.ifPresent(flag -> parserBuilder.strict(flag));
    checking.ifPresent(flag -> parserBuilder.checking(flag));
    runTest(parserBuilder, iri, numErrors, numWarnings);
}
use of org.apache.jena.riot.RDFParserBuilder in project jena by apache.
the class AbstractWholeFileNodeTupleReader method initialize.
/**
 * Prepares this reader to parse an entire file and starts the background parser thread.
 * <p>
 * The split must begin at offset 0 and be at least as long as the file; anything else is
 * rejected. Compressed files are read through the codec selected for the file's path.
 *
 * @param genericSplit the split to read; must be a {@code FileSplit} covering the whole file
 * @param context the task attempt context supplying the job configuration
 * @throws IOException if the split is not a {@code FileSplit}, if it does not cover the entire
 *         file, or if the file cannot be inspected or opened
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit)) {
        throw new IOException("This record reader only supports FileSplit inputs");
    }
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples) {
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    }

    // Figure out what portion of the file to read
    if (split.getStart() > 0) {
        throw new IOException("This record reader requires a file split which covers the entire file");
    }
    final Path file = split.getPath();
    // Resolve the file system once and reuse it for both the status lookup and the open
    FileSystem fs = file.getFileSystem(config);
    long totalLength = fs.getFileStatus(file).getLen();
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", split.getStart(), split.getLength(), totalLength));
    if (totalLength > split.getLength()) {
        throw new IOException("This record reader requires a file split which covers the entire file");
    }

    // Open the file and prepare the input stream
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    try {
        if (this.compressionCodecs != null) {
            // Compressed input
            input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
        } else {
            // Uncompressed input
            input = new TrackedInputStream(fileIn);
        }
    } catch (IOException | RuntimeException e) {
        // Wrapping failed, so this.input was not assigned by this call; close the raw stream
        // here, otherwise it would stay open.
        try {
            fileIn.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    // Daemon thread: it must not keep the JVM alive on its own
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
use of org.apache.jena.riot.RDFParserBuilder in project jena by apache.
the class RdfIOUtils method createRDFParserBuilder.
/**
 * Creates a parser builder configured with the label-to-node mapping obtained from
 * {@code createLabelToNode(context, path)} and the standard error handler.
 *
 * @param context the job context passed on to the label mapping factory
 * @param path the path passed on to the label mapping factory
 * @return the configured parser builder
 */
public static RDFParserBuilder createRDFParserBuilder(JobContext context, Path path) {
    // Resolve the label mapping first, before any builder is created
    final LabelToNode labels = createLabelToNode(context, path);
    return RDFParser.create()
            .labelToNode(labels)
            .errorHandler(ErrorHandlerFactory.errorHandlerStd);
}
use of org.apache.jena.riot.RDFParserBuilder in project jena by apache.
the class TestIRIxRIOT method testTTL.
/**
 * Turtle, with resolver: runs a parse test for {@code iri} with the language forced to Turtle
 * and the supplied IRI resolver installed on the parser builder.
 */
private static void testTTL(String iri, IRIxResolver resolver, int numErrors, int numWarnings) {
    final InputStream source = generateSource(iri);
    final RDFParserBuilder turtleParser = RDFParser.source(source)
            .forceLang(Lang.TTL)
            .resolver(resolver);
    runTest(turtleParser, iri, numErrors, numWarnings);
}
Aggregations