Search in sources :

Example 1 with BinaryDataFile

use of org.apache.jena.dboe.base.file.BinaryDataFile in project jena by apache.

The class ProcBuildNodeTableX, method exec2.

/**
 * Build the node table of a database by piping parsed RDF data through an external
 * {@code sort} process.
 * <p>
 * Three activities run concurrently:
 * <ol>
 * <li>Step 1 - a parser thread parses {@code datafiles} and writes, via
 * {@code NodeHashTmpStream}, to the standard input of the {@code sort} subprocess;</li>
 * <li>Step 2 - the {@code sort} subprocess sorts on the first key and drops duplicates
 * ({@code --unique --key=1,1});</li>
 * <li>Step 3 - a builder thread reads the sorted output and packs the resulting records
 * into the node table B+Tree inside a write transaction.</li>
 * </ol>
 * If the sort subprocess ends with a non-zero exit code, its error output is logged and
 * the JVM is terminated with that exit code.
 *
 * @param DB database location, passed to {@code DatabaseMgr.connectDatasetGraph}
 * @param loaderFiles supplies {@code TMPDIR}, used as the temporary directory of {@code sort}
 * @param sortThreads value for {@code sort --parallel}; values {@code <= 0} are replaced by 2
 * @param sortNodeTableArgs not used by this method at present
 * @param datafiles the RDF files to parse, processed in list order
 * @return pair of (tick count of the parse step, tick count of the index step); a component
 *         is -1 if the corresponding step did not record its count
 * @throws RuntimeException if the sort subprocess cannot be started, or if the calling
 *         thread is interrupted while waiting for it
 */
// [BULK] Output, not return.
private static Pair<Long, Long> exec2(String DB, XLoaderFiles loaderFiles, int sortThreads, String sortNodeTableArgs, List<String> datafiles) {
    // Threads - 1 parser, 1 builder, 2 sort.
    // Steps:
    // 1 - parser to and pipe terms to sort
    // 2 - sort
    // 3 - build node table from unique sort
    // NOTE(review): 'provider' is not read anywhere below - confirm whether it can be removed.
    IRIProvider provider = SystemIRIx.getProvider();
    // SystemIRIx.setProvider(new IRIProviderAny());
    DatasetGraph dsg = DatabaseMgr.connectDatasetGraph(DB);
    DatasetGraphTDB dsgtdb = TDBInternal.getDatasetGraphTDB(dsg);
    NodeTable nt = dsgtdb.getTripleTable().getNodeTupleTable().getNodeTable();
    NodeTableTRDF nodeTable = (NodeTableTRDF) nt.baseNodeTable();
    OutputStream toSortOutputStream;
    InputStream fromSortInputStream;
    if (sortThreads <= 0) {
        // Fall back to two sort threads when the caller gives no usable value.
        sortThreads = 2;
    }
    // ** Step 2: The sort
    // The subprocess is started first so that its stdin/stdout exist before steps 1 and 3 begin.
    Process procSort;
    try {
        // LOG.info("Step : external sort");
        // Mutable list.
        List<String> sortCmd = new ArrayList<>(Arrays.asList("sort", "--temporary-directory=" + loaderFiles.TMPDIR, "--buffer-size=50%", "--parallel=" + sortThreads, "--unique", "--key=1,1"));
        if (BulkLoaderX.CompressSortNodeTableFiles) {
            sortCmd.add("--compress-program=" + BulkLoaderX.gzipProgram());
        }
        // if ( sortNodeTableArgs != null ) {}
        ProcessBuilder pb2 = new ProcessBuilder(sortCmd);
        // Run sort in the "C" locale (presumably to get plain byte-order comparison - confirm).
        pb2.environment().put("LC_ALL", "C");
        procSort = pb2.start();
        // To process.
        // Let the writer close it.
        toSortOutputStream = procSort.getOutputStream();
        // From process to the tree builder.
        // Let the reader side close it.
        fromSortInputStream = procSort.getInputStream();
    // // Debug sort process.
    // InputStream fromSortErrortStream = proc2.getErrorStream();
    // IOUtils.copy(fromSortErrortStream, System.err);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    // ** Step 1 : write intermediate file (hash, thrift bytes).
    // Both counters start at -1 and are only set when their step runs to completion.
    AtomicLong countParseTicks = new AtomicLong(-1);
    AtomicLong countIndexedNodes = new AtomicLong(-1);
    long tickPoint = BulkLoaderX.DataTick;
    int superTick = BulkLoaderX.DataSuperTick;
    Runnable task1 = () -> {
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Nodes, "Nodes", tickPoint, superTick);
        OutputStream output = IO.ensureBuffered(toSortOutputStream);
        // Counting.
        StreamRDF worker = new NodeHashTmpStream(output);
        ProgressStreamRDF stream = new ProgressStreamRDF(worker, monitor);
        monitor.start();
        // Remember the monitor label; it is set to each file's basename while that file is parsed.
        String label = monitor.getLabel();
        datafiles.forEach(datafile -> {
            String basename = FileOps.basename(datafile);
            monitor.setLabel(basename);
            stream.start();
            RDFParser.source(datafile).parse(stream);
            stream.finish();
        });
        monitor.finish();
        monitor.setLabel(label);
        // Closing the stream is what tells the sort process that its input is complete.
        // NOTE(review): if parsing throws, this close is skipped - confirm how async() deals with
        // a failed task, since the sort process would then keep waiting for input.
        IO.flush(output);
        IO.close(output);
        long x = monitor.getTime();
        // long x = timer.endTimer();
        long count = monitor.getTicks();
        countParseTicks.set(count);
        double xSec = x / 1000.0;
        double rate = count / xSec;
        FmtLog.info(BulkLoaderX.LOG_Nodes, "%s Parse (nodes): %s seconds : %,d triples/quads %,.0f TPS", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rate);
    };
    // [BULK] XXX AsyncParser.asyncParse(files, output)
    Thread thread1 = async(task1, "AsyncParser");
    // Step3: build node table.
    Runnable task3 = () -> {
        Timer timer = new Timer();
        // Don't start timer until sort send something
        // Process stream are already buffered.
        InputStream input = IO.ensureBuffered(fromSortInputStream);
        FileSet fileSet = new FileSet(dsgtdb.getLocation(), Names.nodeTableBaseName);
        BufferChannel blkState = FileFactory.createBufferChannel(fileSet, Names.extBptState);
        long idxTickPoint = BulkLoaderX.DataTick;
        int idxSuperTick = BulkLoaderX.DataSuperTick;
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Terms, "Index", idxTickPoint, idxSuperTick);
        // Library of tools!
        dsg.executeWrite(() -> {
            BinaryDataFile objectFile = nodeTable.getData();
            Iterator<Record> rIter = records(BulkLoaderX.LOG_Terms, input, objectFile);
            // Wrap the record iterator so that every record pulled by the packer ticks the monitor.
            rIter = new ProgressIterator<>(rIter, monitor);
            // Record of (hash, nodeId)
            BPlusTree bpt1 = (BPlusTree) (nodeTable.getIndex());
            BPlusTreeParams bptParams = bpt1.getParams();
            RecordFactory factory = new RecordFactory(SystemTDB.LenNodeHash, NodeId.SIZE);
            // Wait until something has been received from the sort step
            rIter.hasNext();
            monitor.start();
            // .. then start the timer. It is closed after the transaction finishes.
            timer.startTimer();
            // The new tree is packed using the block managers of the existing index.
            BPlusTree bpt2 = BPlusTreeRewriter.packIntoBPlusTree(rIter, bptParams, factory, blkState, bpt1.getNodeManager().getBlockMgr(), bpt1.getRecordsMgr().getBlockMgr());
            bpt2.sync();
            bpt1.sync();
            objectFile.sync();
            monitor.finish();
        });
        blkState.sync();
        IO.close(input);
        long x = timer.endTimer();
        long count = monitor.getTicks();
        countIndexedNodes.set(count);
        String rateStr = BulkLoaderX.rateStr(count, x);
        FmtLog.info(BulkLoaderX.LOG_Terms, "%s Index terms: %s seconds : %,d indexed RDF terms : %s PerSecond", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rateStr);
    };
    Thread thread3 = async(task3, "AsyncBuild");
    try {
        int exitCode = procSort.waitFor();
        if (exitCode != 0) {
            String msg = IO.readWholeFileAsUTF8(procSort.getErrorStream());
            String logMsg = String.format("Sort RC = %d : Error: %s", exitCode, msg);
            Log.error(BulkLoaderX.LOG_Terms, logMsg);
            // ** Exit process
            System.exit(exitCode);
        } else {
            BulkLoaderX.LOG_Terms.info("Sort finished");
        }
    // I/O Stream toSortOutputStream and fromSortInputStream closed by
    // their users - step 1 and step 3.
    } catch (InterruptedException e) {
        // Restore the interrupt status: wrapping the exception alone would hide the
        // interruption from code further up the call stack.
        Thread.currentThread().interrupt();
        BulkLoaderX.LOG_Nodes.error("Failed to cleanly wait-for the subprocess");
        throw new RuntimeException(e);
    }
    // The counters are only read after both worker threads have been waited for.
    BulkLoaderX.waitFor(thread1);
    BulkLoaderX.waitFor(thread3);
    return Pair.create(countParseTicks.get(), countIndexedNodes.get());
}
Also used : Arrays(java.util.Arrays) RiotThriftException(org.apache.jena.riot.thrift.RiotThriftException) IO(org.apache.jena.atlas.io.IO) FileFactory(org.apache.jena.dboe.base.file.FileFactory) DatasetGraph(org.apache.jena.sparql.core.DatasetGraph) NodeId(org.apache.jena.tdb2.store.NodeId) FileSet(org.apache.jena.dboe.base.file.FileSet) RecordFactory(org.apache.jena.dboe.base.record.RecordFactory) TSerializer(org.apache.thrift.TSerializer) RDF_Term(org.apache.jena.riot.thrift.wire.RDF_Term) NodeTable(org.apache.jena.tdb2.store.nodetable.NodeTable) BinaryDataFile(org.apache.jena.dboe.base.file.BinaryDataFile) ProgressMonitorOutput(org.apache.jena.system.progress.ProgressMonitorOutput) Log(org.apache.jena.atlas.logging.Log) BulkLoaderX.async(org.apache.jena.tdb2.xloader.BulkLoaderX.async) TCompactProtocol(org.apache.thrift.protocol.TCompactProtocol) BPlusTreeParams(org.apache.jena.dboe.trans.bplustree.BPlusTreeParams) Names(org.apache.jena.dboe.sys.Names) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) SystemTDB(org.apache.jena.tdb2.sys.SystemTDB) Triple(org.apache.jena.graph.Triple) DatabaseMgr(org.apache.jena.tdb2.DatabaseMgr) NodeIdFactory(org.apache.jena.tdb2.store.NodeIdFactory) List(java.util.List) DatasetGraphTDB(org.apache.jena.tdb2.store.DatasetGraphTDB) ThriftConvert(org.apache.jena.riot.thrift.ThriftConvert) BPlusTreeRewriter(org.apache.jena.dboe.trans.bplustree.rewriter.BPlusTreeRewriter) Record(org.apache.jena.dboe.base.record.Record) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TDBInternal(org.apache.jena.tdb2.sys.TDBInternal) ArrayList(java.util.ArrayList) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) Hash(org.apache.jena.tdb2.store.Hash) NodeTableTRDF(org.apache.jena.tdb2.store.nodetable.NodeTableTRDF) Quad(org.apache.jena.sparql.core.Quad) NodeLib(org.apache.jena.tdb2.lib.NodeLib) OutputStream(java.io.OutputStream) BPlusTree(org.apache.jena.dboe.trans.bplustree.BPlusTree) Logger(org.slf4j.Logger) 
Iterator(java.util.Iterator) RDFParser(org.apache.jena.riot.RDFParser) TException(org.apache.thrift.TException) IOException(java.io.IOException) StreamRDF(org.apache.jena.riot.system.StreamRDF) SystemIRIx(org.apache.jena.irix.SystemIRIx) FmtLog(org.apache.jena.atlas.logging.FmtLog) org.apache.jena.atlas.lib(org.apache.jena.atlas.lib) AtomicLong(java.util.concurrent.atomic.AtomicLong) BufferChannel(org.apache.jena.dboe.base.file.BufferChannel) IteratorSlotted(org.apache.jena.atlas.iterator.IteratorSlotted) Node(org.apache.jena.graph.Node) IRIProvider(org.apache.jena.irix.IRIProvider) InputStream(java.io.InputStream) BPlusTreeParams(org.apache.jena.dboe.trans.bplustree.BPlusTreeParams) ByteArrayOutputStream(java.io.ByteArrayOutputStream) OutputStream(java.io.OutputStream) ArrayList(java.util.ArrayList) DatasetGraph(org.apache.jena.sparql.core.DatasetGraph) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) StreamRDF(org.apache.jena.riot.system.StreamRDF) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) Iterator(java.util.Iterator) NodeTable(org.apache.jena.tdb2.store.nodetable.NodeTable) NodeTableTRDF(org.apache.jena.tdb2.store.nodetable.NodeTableTRDF) FileSet(org.apache.jena.dboe.base.file.FileSet) InputStream(java.io.InputStream) ProgressMonitorOutput(org.apache.jena.system.progress.ProgressMonitorOutput) BufferChannel(org.apache.jena.dboe.base.file.BufferChannel) BinaryDataFile(org.apache.jena.dboe.base.file.BinaryDataFile) IRIProvider(org.apache.jena.irix.IRIProvider) RiotThriftException(org.apache.jena.riot.thrift.RiotThriftException) TException(org.apache.thrift.TException) IOException(java.io.IOException) DatasetGraphTDB(org.apache.jena.tdb2.store.DatasetGraphTDB) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) RecordFactory(org.apache.jena.dboe.base.record.RecordFactory) 
BPlusTree(org.apache.jena.dboe.trans.bplustree.BPlusTree)

Example 2 with BinaryDataFile

use of org.apache.jena.dboe.base.file.BinaryDataFile in project jena by apache.

The class LoaderOps, method ntDataFile.

/**
 * Get the node storage for a {@link NodeTable}.
 * <p>
 * The base node table of {@code nt} is cast to {@link NodeTableTRDF} and its data file to
 * {@link TransBinaryDataFile}; a {@link ClassCastException} results if either is of a
 * different type.
 *
 * @param nt the node table whose underlying data file is wanted
 * @return the data file of the base node table
 */
public static TransBinaryDataFile ntDataFile(NodeTable nt) {
    NodeTableTRDF baseTable = (NodeTableTRDF) nt.baseNodeTable();
    return (TransBinaryDataFile) baseTable.getData();
}
Also used : NodeTableTRDF(org.apache.jena.tdb2.store.nodetable.NodeTableTRDF) TransBinaryDataFile(org.apache.jena.dboe.trans.data.TransBinaryDataFile) TransBinaryDataFile(org.apache.jena.dboe.trans.data.TransBinaryDataFile) BinaryDataFile(org.apache.jena.dboe.base.file.BinaryDataFile)

Example 3 with BinaryDataFile

use of org.apache.jena.dboe.base.file.BinaryDataFile in project jena by apache.

The class tdbdumpnodes, method main.

/**
 * Dump the contents of a node data file: each node that can be read is printed together
 * with the file offset it was read from.
 * <p>
 * When a read fails, the following {@code jump} byte offsets are probed one by one until a
 * node can be read again ("resync"). Reading then carries on from the position reached by
 * that successful read. If no probe succeeds, a "No resync" line is printed.
 *
 * @param args exactly one argument: the name of the node data file
 * @throws TException declared for the Thrift calls made while setting up and reading
 */
public static void main(String... args) throws TException {
    if (args.length != 1) {
        System.err.println("Usage: tdbdumpnodes NodeFile -- e.g \"Database2/Data-0001/nodes-data.dat\"");
        System.exit(1);
    }
    String FN = args[0];
    BinaryDataFile f = new BinaryDataFileRandomAccess(FN);
    f.open();
    TReadAppendFileTransport transport = new TReadAppendFileTransport(f);
    TProtocol protocol = TRDF.protocol(transport);
    // Start at the beginning of the file. To investigate one particular location,
    // set a different start position here, e.g. transport.readPosition(0x1BFEA0FD);
    transport.readPosition(0);
    long limit = f.length();
    System.out.printf("File length: %,d [0x%16X]\n", limit, limit);
    while (true) {
        long locn = transport.readPosition();
        if (locn >= limit)
            break;
        try {
            Node n = readOne(protocol);
            System.out.printf("[0x%16X] %s\n", locn, FmtUtils.stringForNode(n));
        } catch (Exception ex) {
            System.out.printf("[0x%16X] ** Bad read ** %s\n", locn, ex.getMessage());
            // Probe successive byte offsets for the next position a node can be read from.
            long jump = 100;
            boolean resynced = false;
            long i = locn;
            for (; i < locn + jump; i++) {
                transport.readPosition(i);
                try {
                    Node n = readOne(protocol);
                    System.out.printf("Resync: %,d  [0x%16X] ==> [0x%16X]\n", i - locn, locn, i);
                    System.out.printf("[0x%16X] ** %s\n", locn, FmtUtils.stringForNode(n));
                    // Stop probing: the outer loop continues from the position after this node.
                    resynced = true;
                    break;
                } catch (Exception ignored) {
                    // Deliberately ignored: nothing readable at this offset, try the next one.
                }
            }
            if (!resynced)
                System.out.printf("No resync: %,d  [0x%16X] ==> [0x%16X]\n", i - locn, locn, i);
        }
    }
}
Also used : BinaryDataFileRandomAccess(org.apache.jena.dboe.base.file.BinaryDataFileRandomAccess) TReadAppendFileTransport(org.apache.jena.tdb2.store.nodetable.TReadAppendFileTransport) TProtocol(org.apache.thrift.protocol.TProtocol) Node(org.apache.jena.graph.Node) BinaryDataFile(org.apache.jena.dboe.base.file.BinaryDataFile) URLConnection(java.net.URLConnection) TException(org.apache.thrift.TException)

Aggregations

BinaryDataFile (org.apache.jena.dboe.base.file.BinaryDataFile)3 Node (org.apache.jena.graph.Node)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 OutputStream (java.io.OutputStream)1 URLConnection (java.net.URLConnection)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Iterator (java.util.Iterator)1 List (java.util.List)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 IO (org.apache.jena.atlas.io.IO)1 IteratorSlotted (org.apache.jena.atlas.iterator.IteratorSlotted)1 org.apache.jena.atlas.lib (org.apache.jena.atlas.lib)1 FmtLog (org.apache.jena.atlas.logging.FmtLog)1 Log (org.apache.jena.atlas.logging.Log)1 BinaryDataFileRandomAccess (org.apache.jena.dboe.base.file.BinaryDataFileRandomAccess)1 BufferChannel (org.apache.jena.dboe.base.file.BufferChannel)1 FileFactory (org.apache.jena.dboe.base.file.FileFactory)1