Use of org.apache.jena.dboe.base.file.BinaryDataFile in project jena by apache.
Class ProcBuildNodeTableX, method exec2.
/**
 * Build the node table for the database at {@code DB} from the given data files.
 * <p>
 * Three stages run concurrently, connected by the stdin/stdout pipes of an external
 * {@code sort} process:
 * <ol>
 * <li>parse the data files and write their terms to the sort process;</li>
 * <li>sort, keeping one line per key ({@code sort --unique --key=1,1});</li>
 * <li>read the sorted output and pack it into the node table B+Tree.</li>
 * </ol>
 * If the sort process ends with a non-zero exit code, its stderr is logged and the JVM
 * is terminated with that code via {@link System#exit(int)}.
 *
 * @param DB location of the database to connect to
 * @param loaderFiles loader settings; only {@code TMPDIR} is read here (sort temporary directory)
 * @param sortThreads value for {@code sort --parallel}; values {@code <= 0} are replaced by 2
 * @param sortNodeTableArgs extra sort arguments; currently not used by this method
 * @param datafiles the RDF files to parse, processed in list order
 * @return Pair&lt;triples, indexed nodes&gt;: the tick count of the parse stage and the tick
 *         count of the index stage (each is -1 if its stage never recorded a count)
 * @throws RuntimeException if the sort process cannot be started, or if waiting for it is interrupted
 */
// [BULK] Output, not return.
private static Pair<Long, Long> exec2(String DB, XLoaderFiles loaderFiles, int sortThreads, String sortNodeTableArgs, List<String> datafiles) {
    // Threads - 1 parser, 1 builder, 2 sort.
    // Steps:
    // 1 - parser to and pipe terms to sort
    // 2 - sort
    // 3 - build node table from unique sort

    // NOTE(review): 'provider' is not used below; presumably kept for the
    // commented-out provider switch on the next line - confirm before removing.
    IRIProvider provider = SystemIRIx.getProvider();
    // SystemIRIx.setProvider(new IRIProviderAny());
    DatasetGraph dsg = DatabaseMgr.connectDatasetGraph(DB);
    DatasetGraphTDB dsgtdb = TDBInternal.getDatasetGraphTDB(dsg);
    NodeTable nt = dsgtdb.getTripleTable().getNodeTupleTable().getNodeTable();
    NodeTableTRDF nodeTable = (NodeTableTRDF) nt.baseNodeTable();
    OutputStream toSortOutputStream;
    InputStream fromSortInputStream;
    if (sortThreads <= 0) {
        sortThreads = 2;
    }

    // ** Step 2: The sort
    // The sort process is started first so that both of its pipes exist
    // before the producer (step 1) and the consumer (step 3) are created.
    Process procSort;
    try {
        // LOG.info("Step : external sort");
        // Mutable list.
        List<String> sortCmd = new ArrayList<>(Arrays.asList("sort", "--temporary-directory=" + loaderFiles.TMPDIR, "--buffer-size=50%", "--parallel=" + sortThreads, "--unique", "--key=1,1"));
        if (BulkLoaderX.CompressSortNodeTableFiles) {
            sortCmd.add("--compress-program=" + BulkLoaderX.gzipProgram());
        }
        // if ( sortNodeTableArgs != null ) {}
        ProcessBuilder pb2 = new ProcessBuilder(sortCmd);
        // Run sort in the "C" locale.
        pb2.environment().put("LC_ALL", "C");
        procSort = pb2.start();
        // To process.
        // Let the writer close it.
        toSortOutputStream = procSort.getOutputStream();
        // From process to the tree builder.
        // Let the reader side close it.
        fromSortInputStream = procSort.getInputStream();
        // // Debug sort process.
        // InputStream fromSortErrortStream = proc2.getErrorStream();
        // IOUtils.copy(fromSortErrortStream, System.err);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }

    // ** Step 1 : write intermediate file (hash, thrift bytes).
    // Both counters stay at -1 until the owning task has finished its work.
    AtomicLong countParseTicks = new AtomicLong(-1);
    AtomicLong countIndexedNodes = new AtomicLong(-1);
    long tickPoint = BulkLoaderX.DataTick;
    int superTick = BulkLoaderX.DataSuperTick;
    Runnable task1 = () -> {
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Nodes, "Nodes", tickPoint, superTick);
        OutputStream output = IO.ensureBuffered(toSortOutputStream);
        // Counting.
        StreamRDF worker = new NodeHashTmpStream(output);
        ProgressStreamRDF stream = new ProgressStreamRDF(worker, monitor);
        monitor.start();
        // Remember the original label; it is replaced per file and restored afterwards.
        String label = monitor.getLabel();
        datafiles.forEach(datafile -> {
            String basename = FileOps.basename(datafile);
            monitor.setLabel(basename);
            stream.start();
            RDFParser.source(datafile).parse(stream);
            stream.finish();
        });
        monitor.finish();
        monitor.setLabel(label);
        // Closing the pipe is what tells sort that its input is complete.
        IO.flush(output);
        IO.close(output);
        long x = monitor.getTime();
        // long x = timer.endTimer();
        long count = monitor.getTicks();
        countParseTicks.set(count);
        double xSec = x / 1000.0;
        double rate = count / xSec;
        FmtLog.info(BulkLoaderX.LOG_Nodes, "%s Parse (nodes): %s seconds : %,d triples/quads %,.0f TPS", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rate);
    };
    // [BULK] XXX AsyncParser.asyncParse(files, output)
    Thread thread1 = async(task1, "AsyncParser");

    // Step3: build node table.
    Runnable task3 = () -> {
        Timer timer = new Timer();
        // Don't start timer until sort send something
        // Process stream are already buffered.
        InputStream input = IO.ensureBuffered(fromSortInputStream);
        FileSet fileSet = new FileSet(dsgtdb.getLocation(), Names.nodeTableBaseName);
        BufferChannel blkState = FileFactory.createBufferChannel(fileSet, Names.extBptState);
        long idxTickPoint = BulkLoaderX.DataTick;
        int idxSuperTick = BulkLoaderX.DataSuperTick;
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Terms, "Index", idxTickPoint, idxSuperTick);
        // Library of tools!
        dsg.executeWrite(() -> {
            BinaryDataFile objectFile = nodeTable.getData();
            Iterator<Record> rIter = records(BulkLoaderX.LOG_Terms, input, objectFile);
            rIter = new ProgressIterator<>(rIter, monitor);
            // Record of (hash, nodeId)
            BPlusTree bpt1 = (BPlusTree) (nodeTable.getIndex());
            BPlusTreeParams bptParams = bpt1.getParams();
            RecordFactory factory = new RecordFactory(SystemTDB.LenNodeHash, NodeId.SIZE);
            // Wait until something has been received from the sort step
            // (the result of hasNext() is deliberately ignored).
            rIter.hasNext();
            monitor.start();
            // .. then start the timer. It is closed after the transaction finishes.
            timer.startTimer();
            BPlusTree bpt2 = BPlusTreeRewriter.packIntoBPlusTree(rIter, bptParams, factory, blkState, bpt1.getNodeManager().getBlockMgr(), bpt1.getRecordsMgr().getBlockMgr());
            bpt2.sync();
            bpt1.sync();
            objectFile.sync();
            monitor.finish();
        });
        blkState.sync();
        IO.close(input);
        long x = timer.endTimer();
        long count = monitor.getTicks();
        countIndexedNodes.set(count);
        String rateStr = BulkLoaderX.rateStr(count, x);
        FmtLog.info(BulkLoaderX.LOG_Terms, "%s Index terms: %s seconds : %,d indexed RDF terms : %s PerSecond", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rateStr);
    };
    Thread thread3 = async(task3, "AsyncBuild");

    try {
        int exitCode = procSort.waitFor();
        if (exitCode != 0) {
            String msg = IO.readWholeFileAsUTF8(procSort.getErrorStream());
            String logMsg = String.format("Sort RC = %d : Error: %s", exitCode, msg);
            Log.error(BulkLoaderX.LOG_Terms, logMsg);
            // ** Exit process
            System.exit(exitCode);
        } else {
            BulkLoaderX.LOG_Terms.info("Sort finished");
        }
        // I/O Stream toSortOutputStream and fromSortInputStream closed by
        // their users - step 1 and step 3.
    } catch (InterruptedException e) {
        BulkLoaderX.LOG_Nodes.error("Failed to cleanly wait-for the subprocess");
        // Restore the interrupt status so code further up the stack can still
        // observe that this thread was interrupted.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }
    BulkLoaderX.waitFor(thread1);
    BulkLoaderX.waitFor(thread3);
    return Pair.create(countParseTicks.get(), countIndexedNodes.get());
}
Use of org.apache.jena.dboe.base.file.BinaryDataFile in project jena by apache.
Class LoaderOps, method ntDataFile.
/**
 * Return the node storage file of a {@link NodeTable}.
 * <p>
 * The base node table of {@code nt} is cast to {@code NodeTableTRDF} and its data file
 * is cast to {@code TransBinaryDataFile}.
 *
 * @param nt the node table whose storage is wanted
 * @return the data file of the base node table
 * @throws ClassCastException if either cast does not apply
 */
public static TransBinaryDataFile ntDataFile(NodeTable nt) {
    NodeTableTRDF trdfTable = (NodeTableTRDF) nt.baseNodeTable();
    return (TransBinaryDataFile) trdfTable.getData();
}
Use of org.apache.jena.dboe.base.file.BinaryDataFile in project jena by apache.
Class tdbdumpnodes, method main.
/**
 * Dump the RDF terms stored in a node data file to standard output.
 * <p>
 * Terms are read one after another from offset 0 up to the file length; each is printed
 * with the offset it was read from. When a read fails, up to 100 following byte offsets
 * are probed to find a position from which a term can be decoded again.
 *
 * @param args exactly one argument: the path of the node data file
 * @throws TException declared for the Thrift calls made while setting up the dump
 */
public static void main(String... args) throws TException {
    if (args.length != 1) {
        System.err.println("Usage: tdbdumpnodes NodeFile -- e.g \"Database2/Data-0001/nodes-data.dat\"");
        System.exit(1);
    }
    String filename = args[0];
    BinaryDataFile f = new BinaryDataFileRandomAccess(filename);
    f.open();
    TReadAppendFileTransport transport = new TReadAppendFileTransport(f);
    TProtocol protocol = TRDF.protocol(transport);
    transport.readPosition(0);
    // [0x 1BFEA0FD]
    // <http://data.europa.eu/esco/occupation/99492920-e5a5-4dba-9e5a-93193147198c>
    // [0x 1BFEA14C] ** Bad read ** don't know what type: 14
    // transport.readPosition(0x1BFEA0FD);
    long limit = f.length();
    // limit = 0x1C2092FF;
    System.out.printf("File length: %,d [0x%16X]\n", limit, limit);
    while (true) {
        long locn = transport.readPosition();
        if (locn >= limit) {
            break;
        }
        try {
            Node n = readOne(protocol);
            System.out.printf("[0x%16X] %s\n", locn, FmtUtils.stringForNode(n));
        } catch (Exception ex) {
            System.out.printf("[0x%16X] ** Bad read ** %s\n", locn, ex.getMessage());
            // Probe successive byte offsets until a term decodes again.
            long jump = 100;
            long i = locn;
            boolean resynced = false;
            for (; i < locn + jump; i++) {
                transport.readPosition(i);
                try {
                    Node n = readOne(protocol);
                    System.out.printf("Resync: %,d [0x%16X] ==> [0x%16X]\n", i - locn, locn, i);
                    // The recovered term was read at offset i, not at locn.
                    System.out.printf("[0x%16X] ** %s\n", i, FmtUtils.stringForNode(n));
                    // Stop probing: the outer loop carries on from the
                    // position just after the recovered term.
                    resynced = true;
                    break;
                } catch (Exception ignored) {
                    // Expected while probing: this offset is not the start of a term.
                }
            }
            // Only reached without a resync when every probed offset failed.
            if (!resynced) {
                System.out.printf("No resync: %,d [0x%16X] ==> [0x%16X]\n", i - locn, locn, i);
            }
            // // Problems - back up and dump.
            // byte bytes[] = new byte[256];
            // int len = f.read(locn, bytes);
            // StringBuilder sBuff = new StringBuilder() ;
            // for ( int i = 0 ; i < len ; i++ ) {
            // byte b = bytes[i] ;
            // int hi = (b & 0xF0) >> 4 ;
            // int lo = b & 0xF ;
            // if ( i != 0 ) {
            // if (i % 20 == 0 )
            // sBuff.append("\n");
            // else
            // sBuff.append(" ");
            // }
            // sBuff.append(Chars.hexDigitsUC[hi]) ;
            // sBuff.append(Chars.hexDigitsUC[lo]) ;
            // }
            // String str = sBuff.toString();
            // if ( !str.endsWith("\n") )
            // str = str+"\n";
            // System.out.print(str);
            // System.exit(1);
        }
    }
}
Aggregations