Use of org.apache.jena.irix.IRIProvider in project jena by apache.
The class ProcNodeTableDataBuilder, method exec.
public static void exec(Location location, String dataFileTriples, String dataFileQuads, List<String> datafiles, boolean collectStats) {
    // Possible parser speed-up. This has no effect if parsing in parallel
    // because the parser isn't the slowest step in loading at scale.
    IRIProvider provider = SystemIRIx.getProvider();
    // SystemIRIx.setProvider(new IRIProviderAny());
    // This formats the location correctly,
    // but we're not really interested in all of it ...
    DatasetGraphTDB dsg = DatasetBuilderStd.create(location);
    // ... so close the indexes and the prefix table.
    dsg.getTripleTable().getNodeTupleTable().getTupleTable().close();
    dsg.getQuadTable().getNodeTupleTable().getTupleTable().close();
    ProgressMonitor monitor = ProgressMonitorOutput.create(cmdLog, "Data", BulkLoader.DataTickPoint, BulkLoader.superTick);
    // WriteRows does its own buffering and writes directly to its buffer.
    // Do not buffer here.
    OutputStream outputTriples = IO.openOutputFile(dataFileTriples);
    OutputStream outputQuads = IO.openOutputFile(dataFileQuads);
    build(dsg, monitor, outputTriples, outputQuads, datafiles);
    TDBInternal.expel(dsg);
    SystemIRIx.setProvider(provider);
}
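All three methods in this section share the same save/set/restore discipline around the system IRIProvider: capture the current provider, optionally install a non-checking one for the duration of the bulk load (the commented-out IRIProviderAny line), and restore the original afterwards. A minimal sketch of that pattern, assuming org.apache.jena.irix.IRIProviderAny is the no-checking provider the comment refers to; withLaxIRIs is a hypothetical helper name:

import org.apache.jena.irix.IRIProvider;
import org.apache.jena.irix.IRIProviderAny;
import org.apache.jena.irix.SystemIRIx;

public class ProviderSwap {
    // Run an action with IRI checking relaxed, restoring the previous
    // provider afterwards - the same save/restore seen in exec above,
    // but made exception-safe with try/finally.
    static void withLaxIRIs(Runnable action) {
        IRIProvider previous = SystemIRIx.getProvider();
        // Assumption: IRIProviderAny accepts any IRI string without checking.
        SystemIRIx.setProvider(new IRIProviderAny());
        try {
            action.run();
        } finally {
            SystemIRIx.setProvider(previous);
        }
    }
}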
Use of org.apache.jena.irix.IRIProvider in project jena by apache.
The class ProcBuildNodeTableX, method exec2.
/**
 * Returns a Pair<number of triples/quads parsed, number of indexed nodes>.
 * @param sortThreads number of threads for the external sort; defaults to 2 if <= 0.
 */
// [BULK] Output, not return.
private static Pair<Long, Long> exec2(String DB, XLoaderFiles loaderFiles, int sortThreads, String sortNodeTableArgs, List<String> datafiles) {
    // Threads: 1 parser, 1 builder, 2 sort.
    // Steps:
    //   1 - parse and pipe terms to the sort
    //   2 - sort
    //   3 - build the node table from the unique, sorted terms
    IRIProvider provider = SystemIRIx.getProvider();
    // SystemIRIx.setProvider(new IRIProviderAny());
    DatasetGraph dsg = DatabaseMgr.connectDatasetGraph(DB);
    DatasetGraphTDB dsgtdb = TDBInternal.getDatasetGraphTDB(dsg);
    NodeTable nt = dsgtdb.getTripleTable().getNodeTupleTable().getNodeTable();
    NodeTableTRDF nodeTable = (NodeTableTRDF) nt.baseNodeTable();
    OutputStream toSortOutputStream;
    InputStream fromSortInputStream;
    if (sortThreads <= 0)
        sortThreads = 2;
    // ** Step 2: the external sort.
    Process procSort;
    try {
        // LOG.info("Step : external sort");
        // Mutable list.
        List<String> sortCmd = new ArrayList<>(Arrays.asList("sort", "--temporary-directory=" + loaderFiles.TMPDIR, "--buffer-size=50%", "--parallel=" + sortThreads, "--unique", "--key=1,1"));
        if (BulkLoaderX.CompressSortNodeTableFiles)
            sortCmd.add("--compress-program=" + BulkLoaderX.gzipProgram());
        // if ( sortNodeTableArgs != null ) {}
        ProcessBuilder pb2 = new ProcessBuilder(sortCmd);
        pb2.environment().put("LC_ALL", "C");
        procSort = pb2.start();
        // To the sort process.
        // Let the writer close it.
        toSortOutputStream = procSort.getOutputStream();
        // From the sort process to the tree builder.
        // Let the reader side close it.
        fromSortInputStream = procSort.getInputStream();
        // // Debug the sort process.
        // InputStream fromSortErrorStream = procSort.getErrorStream();
        // IOUtils.copy(fromSortErrorStream, System.err);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    // ** Step 1: write the intermediate file (hash, Thrift bytes).
    AtomicLong countParseTicks = new AtomicLong(-1);
    AtomicLong countIndexedNodes = new AtomicLong(-1);
    long tickPoint = BulkLoaderX.DataTick;
    int superTick = BulkLoaderX.DataSuperTick;
    Runnable task1 = () -> {
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Nodes, "Nodes", tickPoint, superTick);
        OutputStream output = IO.ensureBuffered(toSortOutputStream);
        // Counting.
        StreamRDF worker = new NodeHashTmpStream(output);
        ProgressStreamRDF stream = new ProgressStreamRDF(worker, monitor);
        monitor.start();
        String label = monitor.getLabel();
        datafiles.forEach(datafile -> {
            String basename = FileOps.basename(datafile);
            monitor.setLabel(basename);
            stream.start();
            RDFParser.source(datafile).parse(stream);
            stream.finish();
        });
        monitor.finish();
        monitor.setLabel(label);
        IO.flush(output);
        IO.close(output);
        long x = monitor.getTime();
        // long x = timer.endTimer();
        long count = monitor.getTicks();
        countParseTicks.set(count);
        double xSec = x / 1000.0;
        double rate = count / xSec;
        FmtLog.info(BulkLoaderX.LOG_Nodes, "%s Parse (nodes): %s seconds : %,d triples/quads %,.0f TPS", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rate);
    };
    // [BULK] XXX AsyncParser.asyncParse(files, output)
    Thread thread1 = async(task1, "AsyncParser");
    // ** Step 3: build the node table.
    Runnable task3 = () -> {
        Timer timer = new Timer();
        // Don't start the timer until the sort sends something.
        // The process streams are already buffered.
        InputStream input = IO.ensureBuffered(fromSortInputStream);
        FileSet fileSet = new FileSet(dsgtdb.getLocation(), Names.nodeTableBaseName);
        BufferChannel blkState = FileFactory.createBufferChannel(fileSet, Names.extBptState);
        long idxTickPoint = BulkLoaderX.DataTick;
        int idxSuperTick = BulkLoaderX.DataSuperTick;
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Terms, "Index", idxTickPoint, idxSuperTick);
        // Library of tools!
        dsg.executeWrite(() -> {
            BinaryDataFile objectFile = nodeTable.getData();
            Iterator<Record> rIter = records(BulkLoaderX.LOG_Terms, input, objectFile);
            rIter = new ProgressIterator<>(rIter, monitor);
            // Record of (hash, NodeId).
            BPlusTree bpt1 = (BPlusTree) (nodeTable.getIndex());
            BPlusTreeParams bptParams = bpt1.getParams();
            RecordFactory factory = new RecordFactory(SystemTDB.LenNodeHash, NodeId.SIZE);
            // Wait until something has been received from the sort step ...
            rIter.hasNext();
            monitor.start();
            // ... then start the timer. It is stopped after the transaction finishes.
            timer.startTimer();
            BPlusTree bpt2 = BPlusTreeRewriter.packIntoBPlusTree(rIter, bptParams, factory, blkState, bpt1.getNodeManager().getBlockMgr(), bpt1.getRecordsMgr().getBlockMgr());
            bpt2.sync();
            bpt1.sync();
            objectFile.sync();
            monitor.finish();
        });
        blkState.sync();
        IO.close(input);
        long x = timer.endTimer();
        long count = monitor.getTicks();
        countIndexedNodes.set(count);
        String rateStr = BulkLoaderX.rateStr(count, x);
        FmtLog.info(BulkLoaderX.LOG_Terms, "%s Index terms: %s seconds : %,d indexed RDF terms : %s PerSecond", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rateStr);
    };
    Thread thread3 = async(task3, "AsyncBuild");
    try {
        int exitCode = procSort.waitFor();
        if (exitCode != 0) {
            String msg = IO.readWholeFileAsUTF8(procSort.getErrorStream());
            String logMsg = String.format("Sort RC = %d : Error: %s", exitCode, msg);
            Log.error(BulkLoaderX.LOG_Terms, logMsg);
            // ** Exit the whole process.
            System.exit(exitCode);
        } else
            BulkLoaderX.LOG_Terms.info("Sort finished");
        // toSortOutputStream and fromSortInputStream are closed by
        // their users - step 1 and step 3.
    } catch (InterruptedException e) {
        BulkLoaderX.LOG_Nodes.error("Failed to cleanly wait-for the subprocess");
        throw new RuntimeException(e);
    }
    BulkLoaderX.waitFor(thread1);
    BulkLoaderX.waitFor(thread3);
    return Pair.create(countParseTicks.get(), countIndexedNodes.get());
}
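The heart of exec2 is piping the parsed terms through an external GNU sort (run with LC_ALL=C for plain byte-order collation) rather than sorting inside the JVM. A stripped-down sketch of that subprocess pattern, with hypothetical names (ExternalSortPipe, sortUnique) and assuming a POSIX sort is on the PATH:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UncheckedIOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.List;

public class ExternalSortPipe {
    // Pipe lines through "sort --unique" and stream the sorted output.
    // LC_ALL=C forces byte-wise collation, the ordering an index build
    // step like the one above relies on.
    static void sortUnique(List<String> lines, Writer sortedOut) throws Exception {
        ProcessBuilder pb = new ProcessBuilder("sort", "--unique", "--key=1,1");
        pb.environment().put("LC_ALL", "C");
        Process proc = pb.start();
        // Feeder thread writes stdin, then closes it so sort can run to completion.
        Thread feeder = new Thread(() -> {
            try (Writer w = new OutputStreamWriter(proc.getOutputStream(), StandardCharsets.UTF_8)) {
                for (String line : lines)
                    w.write(line + "\n");
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        });
        feeder.start();
        // Read stdout while the process runs - writing and reading concurrently
        // avoids deadlock on full pipe buffers.
        try (BufferedReader r = new BufferedReader(new InputStreamReader(proc.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = r.readLine()) != null)
                sortedOut.write(line + "\n");
        }
        feeder.join();
        int rc = proc.waitFor();
        if (rc != 0)
            throw new RuntimeException("sort exited with code " + rc);
    }
}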
Use of org.apache.jena.irix.IRIProvider in project jena by apache.
The class ProcIngestDataX, method exec.
// Node Table.
public static void exec(String location, XLoaderFiles loaderFiles, List<String> datafiles, boolean collectStats) {
    FmtLog.info(BulkLoaderX.LOG_Data, "Ingest data");
    // Possible parser speed-up. This has no effect if parsing in parallel
    // because the parser isn't the slowest step when loading at scale.
    IRIProvider provider = SystemIRIx.getProvider();
    // SystemIRIx.setProvider(new IRIProviderAny());
    // Defaults.
    // DatasetGraph dsg = DatabaseMgr.connectDatasetGraph(location);
    DatasetGraph dsg = getDatasetGraph(location);
    ProgressMonitor monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Data, "Data", BulkLoaderX.DataTick, BulkLoaderX.DataSuperTick);
    // WriteRows does its own buffering and writes directly to its buffer.
    // Do not buffer here.
    // Adds gzip processing if required.
    // But we'll need the disk space eventually, so we aren't space-constrained into using gzip here.
    OutputStream outputTriples = IO.openOutputFile(loaderFiles.triplesFile);
    OutputStream outputQuads = IO.openOutputFile(loaderFiles.quadsFile);
    OutputStream outT = outputTriples;
    OutputStream outQ = outputQuads;
    dsg.executeWrite(() -> {
        Pair<Long, Long> p = build(dsg, monitor, outT, outQ, datafiles);
        String str = DateTimeUtils.nowAsXSDDateTimeString();
        long cTriple = p.getLeft();
        long cQuad = p.getRight();
        FmtLog.info(BulkLoaderX.LOG_Data, "Triples = %,d ; Quads = %,d", cTriple, cQuad);
        JsonObject obj = JSON.buildObject(b -> {
            b.pair("ingested", str);
            b.key("data").startArray();
            datafiles.forEach(fn -> b.value(fn));
            b.finishArray();
            b.pair("triples", cTriple);
            b.pair("quads", cQuad);
        });
        try (OutputStream out = IO.openOutputFile(loaderFiles.loadInfo)) {
            JSON.write(out, obj);
        } catch (IOException ex) {
            IO.exception(ex);
        }
    });
    TDBInternal.expel(dsg);
    SystemIRIx.setProvider(provider);
}
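The JSON summary written at the end of exec can be read back to inspect a completed ingest. A small sketch, assuming JSON.read(String) from org.apache.jena.atlas.json is available and using "load.json" as a stand-in for loaderFiles.loadInfo:

import org.apache.jena.atlas.json.JSON;
import org.apache.jena.atlas.json.JsonObject;

public class LoadInfoCheck {
    public static void main(String[] args) {
        // "load.json" stands in for loaderFiles.loadInfo.
        JsonObject obj = JSON.read("load.json");
        // Keys written by exec: "ingested", "data", "triples", "quads".
        JSON.write(System.out, obj);
    }
}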