Search in sources :

Example 1 with IRIProvider

Use of org.apache.jena.irix.IRIProvider in project jena by apache.

The class ProcNodeTableDataBuilder, method exec.

/**
 * Runs the data-building step against the database at {@code location}.
 * <p>
 * Creates the dataset, closes its triple and quad tuple tables, opens the two
 * output files and hands everything to {@code build}. The dataset is expelled
 * through {@code TDBInternal.expel} once the build has completed.
 * <p>
 * The system IRI provider captured on entry is re-installed on exit, including
 * when the build fails with an exception.
 *
 * @param location        database location used to create the dataset
 * @param dataFileTriples name of the file opened for the triples output
 * @param dataFileQuads   name of the file opened for the quads output
 * @param datafiles       input data files passed on to {@code build}
 * @param collectStats    not used by this method
 */
public static void exec(Location location, String dataFileTriples, String dataFileQuads, List<String> datafiles, boolean collectStats) {
    // Possible parser speed up. This has no effect if parsing in parallel
    // because the parser isn't the slowest step in loading at scale.
    IRIProvider provider = SystemIRIx.getProvider();
    try {
        // SystemIRIx.setProvider(new IRIProviderAny());
        // This formats the location correctly.
        // But we're not really interested in it all.
        DatasetGraphTDB dsg = DatasetBuilderStd.create(location);
        // so close indexes and the prefix table.
        dsg.getTripleTable().getNodeTupleTable().getTupleTable().close();
        dsg.getQuadTable().getNodeTupleTable().getTupleTable().close();
        ProgressMonitor monitor = ProgressMonitorOutput.create(cmdLog, "Data", BulkLoader.DataTickPoint, BulkLoader.superTick);
        // WriteRows does its own buffering and has direct write-to-buffer.
        // Do not buffer here.
        OutputStream outputTriples = IO.openOutputFile(dataFileTriples);
        OutputStream outputQuads = IO.openOutputFile(dataFileQuads);
        // NOTE(review): the streams are not closed here; presumably build(...)
        // takes ownership of them - confirm against build.
        build(dsg, monitor, outputTriples, outputQuads, datafiles);
        TDBInternal.expel(dsg);
    } finally {
        // Always put the global IRI provider back, even if the build threw,
        // so a failed run cannot leave a swapped provider installed.
        SystemIRIx.setProvider(provider);
    }
}
Also used : ProgressMonitor(org.apache.jena.system.progress.ProgressMonitor) OutputStream(java.io.OutputStream) IRIProvider(org.apache.jena.irix.IRIProvider) DatasetGraphTDB(org.apache.jena.tdb.store.DatasetGraphTDB)

Example 2 with IRIProvider

Use of org.apache.jena.irix.IRIProvider in project jena by apache.

The class ProcBuildNodeTableX, method exec2.

/**
 * Builds the node table by parsing the data files, piping the terms through an
 * external {@code sort} process, and packing the unique sorted output into the
 * node table's B+Tree.
 * <p>
 * Three activities run concurrently: a parser thread writing to the sort
 * process, the sort process itself, and a builder thread reading the sort
 * output. This method waits for the sort process and then for both threads.
 * If the sort process exits with a non-zero code, the error is logged and the
 * JVM is terminated with that exit code.
 *
 * @param DB                database location passed to {@code DatabaseMgr.connectDatasetGraph}
 * @param loaderFiles       supplies the temporary directory handed to {@code sort}
 * @param sortThreads       value for sort's {@code --parallel} option;
 *                          when {@code sortThreads <= 0}, 2 is used
 * @param sortNodeTableArgs not used by this method
 * @param datafiles         the data files to parse
 * @return pair of (parser tick count, indexed node count); a component is -1
 *         if the corresponding step did not record a count
 * @throws RuntimeException if the sort process cannot be started, or if the
 *         wait for it is interrupted (the interrupt flag is re-asserted first)
 */
// [BULK] Output, not return.
private static Pair<Long, Long> exec2(String DB, XLoaderFiles loaderFiles, int sortThreads, String sortNodeTableArgs, List<String> datafiles) {
    // Threads - 1 parser, 1 builder, 2 sort.
    // Steps:
    // 1 - parser to and pipe terms to sort
    // 2 - sort
    // 3 - build node table from unique sort
    // Captured for the (currently disabled) provider swap below; not otherwise used.
    IRIProvider provider = SystemIRIx.getProvider();
    // SystemIRIx.setProvider(new IRIProviderAny());
    DatasetGraph dsg = DatabaseMgr.connectDatasetGraph(DB);
    DatasetGraphTDB dsgtdb = TDBInternal.getDatasetGraphTDB(dsg);
    NodeTable nt = dsgtdb.getTripleTable().getNodeTupleTable().getNodeTable();
    NodeTableTRDF nodeTable = (NodeTableTRDF) nt.baseNodeTable();
    OutputStream toSortOutputStream;
    InputStream fromSortInputStream;
    if (sortThreads <= 0)
        sortThreads = 2;
    // ** Step 2: The sort
    Process procSort;
    try {
        // LOG.info("Step : external sort");
        // Mutable list.
        List<String> sortCmd = new ArrayList<>(Arrays.asList("sort", "--temporary-directory=" + loaderFiles.TMPDIR, "--buffer-size=50%", "--parallel=" + sortThreads, "--unique", "--key=1,1"));
        if (BulkLoaderX.CompressSortNodeTableFiles)
            sortCmd.add("--compress-program=" + BulkLoaderX.gzipProgram());
        // if ( sortNodeTableArgs != null ) {}
        ProcessBuilder pb2 = new ProcessBuilder(sortCmd);
        // Run sort under the "C" locale setting.
        pb2.environment().put("LC_ALL", "C");
        procSort = pb2.start();
        // To process.
        // Let the writer close it.
        toSortOutputStream = procSort.getOutputStream();
        // From process to the tree builder.
        // Let the reader side close it.
        fromSortInputStream = procSort.getInputStream();
    // // Debug sort process.
    // InputStream fromSortErrortStream = proc2.getErrorStream();
    // IOUtils.copy(fromSortErrortStream, System.err);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    // ** Step 1 : write intermediate file (hash, thrift bytes).
    // -1 marks "not set": each counter is only updated when its task reaches the end.
    AtomicLong countParseTicks = new AtomicLong(-1);
    AtomicLong countIndexedNodes = new AtomicLong(-1);
    long tickPoint = BulkLoaderX.DataTick;
    int superTick = BulkLoaderX.DataSuperTick;
    Runnable task1 = () -> {
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Nodes, "Nodes", tickPoint, superTick);
        OutputStream output = IO.ensureBuffered(toSortOutputStream);
        // Counting.
        StreamRDF worker = new NodeHashTmpStream(output);
        ProgressStreamRDF stream = new ProgressStreamRDF(worker, monitor);
        monitor.start();
        // Remember the original label; it is changed per data file and restored afterwards.
        String label = monitor.getLabel();
        datafiles.forEach(datafile -> {
            String basename = FileOps.basename(datafile);
            monitor.setLabel(basename);
            stream.start();
            RDFParser.source(datafile).parse(stream);
            stream.finish();
        });
        monitor.finish();
        monitor.setLabel(label);
        // Closing the stream to the sort process is this task's responsibility.
        IO.flush(output);
        IO.close(output);
        long x = monitor.getTime();
        // long x = timer.endTimer();
        long count = monitor.getTicks();
        countParseTicks.set(count);
        double xSec = x / 1000.0;
        double rate = count / xSec;
        FmtLog.info(BulkLoaderX.LOG_Nodes, "%s Parse (nodes): %s seconds : %,d triples/quads %,.0f TPS", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rate);
    };
    // [BULK] XXX AsyncParser.asyncParse(files, output)
    Thread thread1 = async(task1, "AsyncParser");
    // Step3: build node table.
    Runnable task3 = () -> {
        Timer timer = new Timer();
        // Don't start the timer until sort sends something.
        // Process streams are already buffered.
        InputStream input = IO.ensureBuffered(fromSortInputStream);
        FileSet fileSet = new FileSet(dsgtdb.getLocation(), Names.nodeTableBaseName);
        BufferChannel blkState = FileFactory.createBufferChannel(fileSet, Names.extBptState);
        long idxTickPoint = BulkLoaderX.DataTick;
        int idxSuperTick = BulkLoaderX.DataSuperTick;
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Terms, "Index", idxTickPoint, idxSuperTick);
        // Library of tools!
        dsg.executeWrite(() -> {
            BinaryDataFile objectFile = nodeTable.getData();
            Iterator<Record> rIter = records(BulkLoaderX.LOG_Terms, input, objectFile);
            rIter = new ProgressIterator<>(rIter, monitor);
            // Record of (hash, nodeId)
            BPlusTree bpt1 = (BPlusTree) (nodeTable.getIndex());
            BPlusTreeParams bptParams = bpt1.getParams();
            RecordFactory factory = new RecordFactory(SystemTDB.LenNodeHash, NodeId.SIZE);
            // Wait until something has been received from the sort step;
            // the result of hasNext() is deliberately ignored.
            rIter.hasNext();
            monitor.start();
            // .. then start the timer. It is closed after the transaction finishes.
            timer.startTimer();
            BPlusTree bpt2 = BPlusTreeRewriter.packIntoBPlusTree(rIter, bptParams, factory, blkState, bpt1.getNodeManager().getBlockMgr(), bpt1.getRecordsMgr().getBlockMgr());
            bpt2.sync();
            bpt1.sync();
            objectFile.sync();
            monitor.finish();
        });
        blkState.sync();
        // Closing the stream from the sort process is this task's responsibility.
        IO.close(input);
        long x = timer.endTimer();
        long count = monitor.getTicks();
        countIndexedNodes.set(count);
        String rateStr = BulkLoaderX.rateStr(count, x);
        FmtLog.info(BulkLoaderX.LOG_Terms, "%s Index terms: %s seconds : %,d indexed RDF terms : %s PerSecond", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rateStr);
    };
    Thread thread3 = async(task3, "AsyncBuild");
    try {
        int exitCode = procSort.waitFor();
        if (exitCode != 0) {
            String msg = IO.readWholeFileAsUTF8(procSort.getErrorStream());
            String logMsg = String.format("Sort RC = %d : Error: %s", exitCode, msg);
            Log.error(BulkLoaderX.LOG_Terms, logMsg);
            // ** Exit process
            System.exit(exitCode);
        } else
            BulkLoaderX.LOG_Terms.info("Sort finished");
    // I/O Stream toSortOutputStream and fromSortInputStream closed by
    // their users - step 1 and step 3.
    } catch (InterruptedException e) {
        BulkLoaderX.LOG_Nodes.error("Failed to cleanly wait-for the subprocess");
        // Re-assert the interrupt flag so code further up the stack can still
        // observe that this thread was interrupted.
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }
    BulkLoaderX.waitFor(thread1);
    BulkLoaderX.waitFor(thread3);
    return Pair.create(countParseTicks.get(), countIndexedNodes.get());
}
Also used : Arrays(java.util.Arrays) RiotThriftException(org.apache.jena.riot.thrift.RiotThriftException) IO(org.apache.jena.atlas.io.IO) FileFactory(org.apache.jena.dboe.base.file.FileFactory) DatasetGraph(org.apache.jena.sparql.core.DatasetGraph) NodeId(org.apache.jena.tdb2.store.NodeId) FileSet(org.apache.jena.dboe.base.file.FileSet) RecordFactory(org.apache.jena.dboe.base.record.RecordFactory) TSerializer(org.apache.thrift.TSerializer) RDF_Term(org.apache.jena.riot.thrift.wire.RDF_Term) NodeTable(org.apache.jena.tdb2.store.nodetable.NodeTable) BinaryDataFile(org.apache.jena.dboe.base.file.BinaryDataFile) ProgressMonitorOutput(org.apache.jena.system.progress.ProgressMonitorOutput) Log(org.apache.jena.atlas.logging.Log) BulkLoaderX.async(org.apache.jena.tdb2.xloader.BulkLoaderX.async) TCompactProtocol(org.apache.thrift.protocol.TCompactProtocol) BPlusTreeParams(org.apache.jena.dboe.trans.bplustree.BPlusTreeParams) Names(org.apache.jena.dboe.sys.Names) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) SystemTDB(org.apache.jena.tdb2.sys.SystemTDB) Triple(org.apache.jena.graph.Triple) DatabaseMgr(org.apache.jena.tdb2.DatabaseMgr) NodeIdFactory(org.apache.jena.tdb2.store.NodeIdFactory) List(java.util.List) DatasetGraphTDB(org.apache.jena.tdb2.store.DatasetGraphTDB) ThriftConvert(org.apache.jena.riot.thrift.ThriftConvert) BPlusTreeRewriter(org.apache.jena.dboe.trans.bplustree.rewriter.BPlusTreeRewriter) Record(org.apache.jena.dboe.base.record.Record) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TDBInternal(org.apache.jena.tdb2.sys.TDBInternal) ArrayList(java.util.ArrayList) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) Hash(org.apache.jena.tdb2.store.Hash) NodeTableTRDF(org.apache.jena.tdb2.store.nodetable.NodeTableTRDF) Quad(org.apache.jena.sparql.core.Quad) NodeLib(org.apache.jena.tdb2.lib.NodeLib) OutputStream(java.io.OutputStream) BPlusTree(org.apache.jena.dboe.trans.bplustree.BPlusTree) Logger(org.slf4j.Logger) 
Iterator(java.util.Iterator) RDFParser(org.apache.jena.riot.RDFParser) TException(org.apache.thrift.TException) IOException(java.io.IOException) StreamRDF(org.apache.jena.riot.system.StreamRDF) SystemIRIx(org.apache.jena.irix.SystemIRIx) FmtLog(org.apache.jena.atlas.logging.FmtLog) org.apache.jena.atlas.lib(org.apache.jena.atlas.lib) AtomicLong(java.util.concurrent.atomic.AtomicLong) BufferChannel(org.apache.jena.dboe.base.file.BufferChannel) IteratorSlotted(org.apache.jena.atlas.iterator.IteratorSlotted) Node(org.apache.jena.graph.Node) IRIProvider(org.apache.jena.irix.IRIProvider) InputStream(java.io.InputStream) BPlusTreeParams(org.apache.jena.dboe.trans.bplustree.BPlusTreeParams) ByteArrayOutputStream(java.io.ByteArrayOutputStream) OutputStream(java.io.OutputStream) ArrayList(java.util.ArrayList) DatasetGraph(org.apache.jena.sparql.core.DatasetGraph) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) StreamRDF(org.apache.jena.riot.system.StreamRDF) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) Iterator(java.util.Iterator) NodeTable(org.apache.jena.tdb2.store.nodetable.NodeTable) NodeTableTRDF(org.apache.jena.tdb2.store.nodetable.NodeTableTRDF) FileSet(org.apache.jena.dboe.base.file.FileSet) InputStream(java.io.InputStream) ProgressMonitorOutput(org.apache.jena.system.progress.ProgressMonitorOutput) BufferChannel(org.apache.jena.dboe.base.file.BufferChannel) BinaryDataFile(org.apache.jena.dboe.base.file.BinaryDataFile) IRIProvider(org.apache.jena.irix.IRIProvider) RiotThriftException(org.apache.jena.riot.thrift.RiotThriftException) TException(org.apache.thrift.TException) IOException(java.io.IOException) DatasetGraphTDB(org.apache.jena.tdb2.store.DatasetGraphTDB) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) RecordFactory(org.apache.jena.dboe.base.record.RecordFactory) 
BPlusTree(org.apache.jena.dboe.trans.bplustree.BPlusTree)

Example 3 with IRIProvider

Use of org.apache.jena.irix.IRIProvider in project jena by apache.

The class ProcIngestDataX, method exec.

// Node Table.
/**
 * Ingests the data files into the database at {@code location}.
 * <p>
 * Inside a single write transaction this calls {@code build} with the triples
 * and quads output streams, logs the resulting triple and quad counts, and
 * writes a JSON summary (timestamp, data file names, counts) to
 * {@code loaderFiles.loadInfo}. The dataset is expelled through
 * {@code TDBInternal.expel} once the transaction has completed.
 * <p>
 * The system IRI provider captured on entry is re-installed on exit, including
 * when the ingest fails with an exception.
 *
 * @param location     database location used to obtain the dataset
 * @param loaderFiles  supplies the triples, quads and load-info file names
 * @param datafiles    input data files; also recorded in the JSON summary
 * @param collectStats not used by this method
 */
public static void exec(String location, XLoaderFiles loaderFiles, List<String> datafiles, boolean collectStats) {
    FmtLog.info(BulkLoaderX.LOG_Data, "Ingest data");
    // Possible parser speed up. This has no effect if parsing in parallel
    // because the parser isn't the slowest step when loading at scale.
    IRIProvider provider = SystemIRIx.getProvider();
    try {
        // SystemIRIx.setProvider(new IRIProviderAny());
        // Defaults.
        // DatasetGraph dsg = DatabaseMgr.connectDatasetGraph(location);
        DatasetGraph dsg = getDatasetGraph(location);
        ProgressMonitor monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Data, "Data", BulkLoaderX.DataTick, BulkLoaderX.DataSuperTick);
        // WriteRows does its own buffering and has direct write-to-buffer.
        // Do not buffer here.
        // Adds gzip processing if required.
        // But we'll need the disk space eventually so we aren't space constrained to use gzip here.
        // NOTE(review): these streams are not closed here; presumably build(...)
        // takes ownership of them - confirm against build.
        OutputStream outputTriples = IO.openOutputFile(loaderFiles.triplesFile);
        OutputStream outputQuads = IO.openOutputFile(loaderFiles.quadsFile);
        dsg.executeWrite(() -> {
            Pair<Long, Long> p = build(dsg, monitor, outputTriples, outputQuads, datafiles);
            String str = DateTimeUtils.nowAsXSDDateTimeString();
            long cTriple = p.getLeft();
            long cQuad = p.getRight();
            FmtLog.info(BulkLoaderX.LOG_Data, "Triples = %,d ; Quads = %,d", cTriple, cQuad);
            // Summary of this ingest run, written next to the loader's working files.
            JsonObject obj = JSON.buildObject(b -> {
                b.pair("ingested", str);
                b.key("data").startArray();
                datafiles.forEach(fn -> b.value(fn));
                b.finishArray();
                b.pair("triples", cTriple);
                b.pair("quads", cQuad);
            });
            try (OutputStream out = IO.openOutputFile(loaderFiles.loadInfo)) {
                JSON.write(out, obj);
            } catch (IOException ex) {
                IO.exception(ex);
            }
        });
        TDBInternal.expel(dsg);
    } finally {
        // Always put the global IRI provider back, even if the ingest threw,
        // so a failed run cannot leave a swapped provider installed.
        SystemIRIx.setProvider(provider);
    }
}
Also used : ProgressMonitor(org.apache.jena.system.progress.ProgressMonitor) OutputStream(java.io.OutputStream) BitsLong(org.apache.jena.atlas.lib.BitsLong) JsonObject(org.apache.jena.atlas.json.JsonObject) IOException(java.io.IOException) IRIProvider(org.apache.jena.irix.IRIProvider) DatasetGraph(org.apache.jena.sparql.core.DatasetGraph)

Aggregations

OutputStream (java.io.OutputStream)3 IRIProvider (org.apache.jena.irix.IRIProvider)3 IOException (java.io.IOException)2 DatasetGraph (org.apache.jena.sparql.core.DatasetGraph)2 ProgressMonitor (org.apache.jena.system.progress.ProgressMonitor)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 InputStream (java.io.InputStream)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Iterator (java.util.Iterator)1 List (java.util.List)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 IO (org.apache.jena.atlas.io.IO)1 IteratorSlotted (org.apache.jena.atlas.iterator.IteratorSlotted)1 JsonObject (org.apache.jena.atlas.json.JsonObject)1 org.apache.jena.atlas.lib (org.apache.jena.atlas.lib)1 BitsLong (org.apache.jena.atlas.lib.BitsLong)1 FmtLog (org.apache.jena.atlas.logging.FmtLog)1 Log (org.apache.jena.atlas.logging.Log)1 BinaryDataFile (org.apache.jena.dboe.base.file.BinaryDataFile)1