Search in sources :

Example 1 with FileSet

use of org.apache.jena.dboe.base.file.FileSet in project jena by apache.

the class TestBPlusTreeRewriterNonTxn method runOneTest.

public static void runOneTest(int order, int N, RecordFactory recordFactory, boolean debug) {
    BPlusTreeParams bptParams = new BPlusTreeParams(order, recordFactory);
    BPlusTreeRewriter.debug = debug;
    // ---- Test data
    List<Record> originaldata = TestBPlusTreeRewriterNonTxn.createData(N, recordFactory);
    if (debug)
        System.out.println("Test data: " + originaldata);
    FileSet destination = FileSet.mem();
    // ---- Rewrite
    BufferChannel rootState = FileFactory.createBufferChannel(destination, Names.extBptState);
    // Write leaves to ...
    BlockMgr blkMgr1 = BlockMgrFactory.create(destination, Names.extBptTree, bptParams.getCalcBlockSize(), 10, 10);
    // Write nodes to ...
    BlockMgr blkMgr2 = BlockMgrFactory.create(destination, Names.extBptTree, bptParams.getCalcBlockSize(), 10, 10);
    BPlusTree bpt2 = BPlusTreeRewriter.packIntoBPlusTree(originaldata.iterator(), bptParams, recordFactory, rootState, blkMgr1, blkMgr2);
    if (debug) {
        BPlusTreeRewriterUtils.divider();
        bpt2.dump();
    }
    // ---- Checking
    bpt2.check();
    scanComparision(originaldata, bpt2);
    findComparison(originaldata, bpt2);
    sizeComparison(originaldata, bpt2);
}
Also used : BPlusTreeParams(org.apache.jena.dboe.trans.bplustree.BPlusTreeParams) FileSet(org.apache.jena.dboe.base.file.FileSet) BlockMgr(org.apache.jena.dboe.base.block.BlockMgr) BufferChannel(org.apache.jena.dboe.base.file.BufferChannel) Record(org.apache.jena.dboe.base.record.Record) BPlusTree(org.apache.jena.dboe.trans.bplustree.BPlusTree)

Example 2 with FileSet

use of org.apache.jena.dboe.base.file.FileSet in project jena by apache.

the class ProcBuildNodeTableX method exec2.

/**
 * Pair<triples, indexed nodes>
 * @param sortThreads
 */
// [BULK] Output, not return.
private static Pair<Long, Long> exec2(String DB, XLoaderFiles loaderFiles, int sortThreads, String sortNodeTableArgs, List<String> datafiles) {
    // Threads - 1 parser, 1 builder, 2 sort.
    // Steps:
    // 1 - parser to and pipe terms to sort
    // 2 - sort
    // 3 - build node table from unique sort
    IRIProvider provider = SystemIRIx.getProvider();
    // SystemIRIx.setProvider(new IRIProviderAny());
    DatasetGraph dsg = DatabaseMgr.connectDatasetGraph(DB);
    DatasetGraphTDB dsgtdb = TDBInternal.getDatasetGraphTDB(dsg);
    NodeTable nt = dsgtdb.getTripleTable().getNodeTupleTable().getNodeTable();
    NodeTableTRDF nodeTable = (NodeTableTRDF) nt.baseNodeTable();
    OutputStream toSortOutputStream;
    InputStream fromSortInputStream;
    if (sortThreads <= 0)
        sortThreads = 2;
    // ** Step 2: The sort
    Process procSort;
    try {
        // LOG.info("Step : external sort");
        // Mutable list.
        List<String> sortCmd = new ArrayList<>(Arrays.asList("sort", "--temporary-directory=" + loaderFiles.TMPDIR, "--buffer-size=50%", "--parallel=" + sortThreads, "--unique", "--key=1,1"));
        if (BulkLoaderX.CompressSortNodeTableFiles)
            sortCmd.add("--compress-program=" + BulkLoaderX.gzipProgram());
        // if ( sortNodeTableArgs != null ) {}
        ProcessBuilder pb2 = new ProcessBuilder(sortCmd);
        pb2.environment().put("LC_ALL", "C");
        procSort = pb2.start();
        // To process.
        // Let the writer close it.
        toSortOutputStream = procSort.getOutputStream();
        // From process to the tree builder.
        // Let the reader side close it.
        fromSortInputStream = procSort.getInputStream();
    // // Debug sort process.
    // InputStream fromSortErrortStream = proc2.getErrorStream();
    // IOUtils.copy(fromSortErrortStream, System.err);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    // ** Step 1 : write intermediate file (hash, thrift bytes).
    AtomicLong countParseTicks = new AtomicLong(-1);
    AtomicLong countIndexedNodes = new AtomicLong(-1);
    long tickPoint = BulkLoaderX.DataTick;
    int superTick = BulkLoaderX.DataSuperTick;
    Runnable task1 = () -> {
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Nodes, "Nodes", tickPoint, superTick);
        OutputStream output = IO.ensureBuffered(toSortOutputStream);
        // Counting.
        StreamRDF worker = new NodeHashTmpStream(output);
        ProgressStreamRDF stream = new ProgressStreamRDF(worker, monitor);
        monitor.start();
        String label = monitor.getLabel();
        datafiles.forEach(datafile -> {
            String basename = FileOps.basename(datafile);
            monitor.setLabel(basename);
            stream.start();
            RDFParser.source(datafile).parse(stream);
            stream.finish();
        });
        monitor.finish();
        monitor.setLabel(label);
        IO.flush(output);
        IO.close(output);
        long x = monitor.getTime();
        // long x = timer.endTimer();
        long count = monitor.getTicks();
        countParseTicks.set(count);
        double xSec = x / 1000.0;
        double rate = count / xSec;
        FmtLog.info(BulkLoaderX.LOG_Nodes, "%s Parse (nodes): %s seconds : %,d triples/quads %,.0f TPS", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rate);
    };
    // [BULK] XXX AsyncParser.asyncParse(files, output)
    Thread thread1 = async(task1, "AsyncParser");
    // Step3: build node table.
    Runnable task3 = () -> {
        Timer timer = new Timer();
        // Don't start timer until sort send something
        // Process stream are already buffered.
        InputStream input = IO.ensureBuffered(fromSortInputStream);
        FileSet fileSet = new FileSet(dsgtdb.getLocation(), Names.nodeTableBaseName);
        BufferChannel blkState = FileFactory.createBufferChannel(fileSet, Names.extBptState);
        long idxTickPoint = BulkLoaderX.DataTick;
        int idxSuperTick = BulkLoaderX.DataSuperTick;
        ProgressMonitorOutput monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Terms, "Index", idxTickPoint, idxSuperTick);
        // Library of tools!
        dsg.executeWrite(() -> {
            BinaryDataFile objectFile = nodeTable.getData();
            Iterator<Record> rIter = records(BulkLoaderX.LOG_Terms, input, objectFile);
            rIter = new ProgressIterator<>(rIter, monitor);
            // Record of (hash, nodeId)
            BPlusTree bpt1 = (BPlusTree) (nodeTable.getIndex());
            BPlusTreeParams bptParams = bpt1.getParams();
            RecordFactory factory = new RecordFactory(SystemTDB.LenNodeHash, NodeId.SIZE);
            // Wait until something has been received from the sort step
            rIter.hasNext();
            monitor.start();
            // .. then start the timer. It is closed after the transaction finishes.
            timer.startTimer();
            BPlusTree bpt2 = BPlusTreeRewriter.packIntoBPlusTree(rIter, bptParams, factory, blkState, bpt1.getNodeManager().getBlockMgr(), bpt1.getRecordsMgr().getBlockMgr());
            bpt2.sync();
            bpt1.sync();
            objectFile.sync();
            monitor.finish();
        });
        blkState.sync();
        IO.close(input);
        long x = timer.endTimer();
        long count = monitor.getTicks();
        countIndexedNodes.set(count);
        String rateStr = BulkLoaderX.rateStr(count, x);
        FmtLog.info(BulkLoaderX.LOG_Terms, "%s Index terms: %s seconds : %,d indexed RDF terms : %s PerSecond", BulkLoaderX.StageMarker, Timer.timeStr(x), count, rateStr);
    };
    Thread thread3 = async(task3, "AsyncBuild");
    try {
        int exitCode = procSort.waitFor();
        if (exitCode != 0) {
            String msg = IO.readWholeFileAsUTF8(procSort.getErrorStream());
            String logMsg = String.format("Sort RC = %d : Error: %s", exitCode, msg);
            Log.error(BulkLoaderX.LOG_Terms, logMsg);
            // ** Exit process
            System.exit(exitCode);
        } else
            BulkLoaderX.LOG_Terms.info("Sort finished");
    // I/O Stream toSortOutputStream and fromSortInputStream closed by
    // their users - step 1 and step 3.
    } catch (InterruptedException e) {
        BulkLoaderX.LOG_Nodes.error("Failed to cleanly wait-for the subprocess");
        throw new RuntimeException(e);
    }
    BulkLoaderX.waitFor(thread1);
    BulkLoaderX.waitFor(thread3);
    return Pair.create(countParseTicks.get(), countIndexedNodes.get());
}
Also used : Arrays(java.util.Arrays) RiotThriftException(org.apache.jena.riot.thrift.RiotThriftException) IO(org.apache.jena.atlas.io.IO) FileFactory(org.apache.jena.dboe.base.file.FileFactory) DatasetGraph(org.apache.jena.sparql.core.DatasetGraph) NodeId(org.apache.jena.tdb2.store.NodeId) FileSet(org.apache.jena.dboe.base.file.FileSet) RecordFactory(org.apache.jena.dboe.base.record.RecordFactory) TSerializer(org.apache.thrift.TSerializer) RDF_Term(org.apache.jena.riot.thrift.wire.RDF_Term) NodeTable(org.apache.jena.tdb2.store.nodetable.NodeTable) BinaryDataFile(org.apache.jena.dboe.base.file.BinaryDataFile) ProgressMonitorOutput(org.apache.jena.system.progress.ProgressMonitorOutput) Log(org.apache.jena.atlas.logging.Log) BulkLoaderX.async(org.apache.jena.tdb2.xloader.BulkLoaderX.async) TCompactProtocol(org.apache.thrift.protocol.TCompactProtocol) BPlusTreeParams(org.apache.jena.dboe.trans.bplustree.BPlusTreeParams) Names(org.apache.jena.dboe.sys.Names) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) SystemTDB(org.apache.jena.tdb2.sys.SystemTDB) Triple(org.apache.jena.graph.Triple) DatabaseMgr(org.apache.jena.tdb2.DatabaseMgr) NodeIdFactory(org.apache.jena.tdb2.store.NodeIdFactory) List(java.util.List) DatasetGraphTDB(org.apache.jena.tdb2.store.DatasetGraphTDB) ThriftConvert(org.apache.jena.riot.thrift.ThriftConvert) BPlusTreeRewriter(org.apache.jena.dboe.trans.bplustree.rewriter.BPlusTreeRewriter) Record(org.apache.jena.dboe.base.record.Record) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TDBInternal(org.apache.jena.tdb2.sys.TDBInternal) ArrayList(java.util.ArrayList) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) Hash(org.apache.jena.tdb2.store.Hash) NodeTableTRDF(org.apache.jena.tdb2.store.nodetable.NodeTableTRDF) Quad(org.apache.jena.sparql.core.Quad) NodeLib(org.apache.jena.tdb2.lib.NodeLib) OutputStream(java.io.OutputStream) BPlusTree(org.apache.jena.dboe.trans.bplustree.BPlusTree) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) RDFParser(org.apache.jena.riot.RDFParser) TException(org.apache.thrift.TException) IOException(java.io.IOException) StreamRDF(org.apache.jena.riot.system.StreamRDF) SystemIRIx(org.apache.jena.irix.SystemIRIx) FmtLog(org.apache.jena.atlas.logging.FmtLog) org.apache.jena.atlas.lib(org.apache.jena.atlas.lib) AtomicLong(java.util.concurrent.atomic.AtomicLong) BufferChannel(org.apache.jena.dboe.base.file.BufferChannel) IteratorSlotted(org.apache.jena.atlas.iterator.IteratorSlotted) Node(org.apache.jena.graph.Node) IRIProvider(org.apache.jena.irix.IRIProvider) InputStream(java.io.InputStream) BPlusTreeParams(org.apache.jena.dboe.trans.bplustree.BPlusTreeParams) ByteArrayOutputStream(java.io.ByteArrayOutputStream) OutputStream(java.io.OutputStream) ArrayList(java.util.ArrayList) DatasetGraph(org.apache.jena.sparql.core.DatasetGraph) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) StreamRDF(org.apache.jena.riot.system.StreamRDF) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) Iterator(java.util.Iterator) NodeTable(org.apache.jena.tdb2.store.nodetable.NodeTable) NodeTableTRDF(org.apache.jena.tdb2.store.nodetable.NodeTableTRDF) FileSet(org.apache.jena.dboe.base.file.FileSet) InputStream(java.io.InputStream) ProgressMonitorOutput(org.apache.jena.system.progress.ProgressMonitorOutput) BufferChannel(org.apache.jena.dboe.base.file.BufferChannel) BinaryDataFile(org.apache.jena.dboe.base.file.BinaryDataFile) IRIProvider(org.apache.jena.irix.IRIProvider) RiotThriftException(org.apache.jena.riot.thrift.RiotThriftException) TException(org.apache.thrift.TException) IOException(java.io.IOException) DatasetGraphTDB(org.apache.jena.tdb2.store.DatasetGraphTDB) AtomicLong(java.util.concurrent.atomic.AtomicLong) ProgressStreamRDF(org.apache.jena.system.progress.ProgressStreamRDF) RecordFactory(org.apache.jena.dboe.base.record.RecordFactory) BPlusTree(org.apache.jena.dboe.trans.bplustree.BPlusTree)

Example 3 with FileSet

use of org.apache.jena.dboe.base.file.FileSet in project jena by apache.

the class ProcBuildIndexX method indexBuilder.

private static long indexBuilder(DatasetGraph dsg, InputStream input, String indexName) {
    long tickPoint = BulkLoaderX.DataTick;
    int superTick = BulkLoaderX.DataSuperTick;
    // Location of storage, not the DB.
    DatasetGraphTDB dsgtdb = TDBInternal.getDatasetGraphTDB(dsg);
    Location location = dsgtdb.getLocation();
    int keyLength = SystemTDB.SizeOfNodeId * indexName.length();
    int valueLength = 0;
    // The name is the order.
    String primary = indexName;
    String primaryOrder;
    int dftKeyLength;
    int dftValueLength;
    int tupleLength = indexName.length();
    TupleIndex index;
    if (tupleLength == 3) {
        primaryOrder = Names.primaryIndexTriples;
        dftKeyLength = SystemTDB.LenIndexTripleRecord;
        dftValueLength = 0;
        // Find index.
        index = findIndex(dsgtdb.getTripleTable().getNodeTupleTable().getTupleTable().getIndexes(), indexName);
    } else if (tupleLength == 4) {
        primaryOrder = Names.primaryIndexQuads;
        dftKeyLength = SystemTDB.LenIndexQuadRecord;
        dftValueLength = 0;
        index = findIndex(dsgtdb.getQuadTable().getNodeTupleTable().getTupleTable().getIndexes(), indexName);
    } else {
        throw new TDBException("Index name: " + indexName);
    }
    TupleMap colMap = TupleMap.create(primaryOrder, indexName);
    int readCacheSize = 10;
    int writeCacheSize = 100;
    int blockSize = SystemTDB.BlockSize;
    RecordFactory recordFactory = new RecordFactory(dftKeyLength, dftValueLength);
    int order = BPlusTreeParams.calcOrder(blockSize, recordFactory);
    BPlusTreeParams bptParams = new BPlusTreeParams(order, recordFactory);
    int blockSizeNodes = blockSize;
    int blockSizeRecords = blockSize;
    FileSet destination = new FileSet(location, indexName);
    BufferChannel blkState = FileFactory.createBufferChannel(destination, Names.extBptState);
    BlockMgr blkMgrNodes = BlockMgrFactory.create(destination, Names.extBptTree, blockSizeNodes, readCacheSize, writeCacheSize);
    BlockMgr blkMgrRecords = BlockMgrFactory.create(destination, Names.extBptRecords, blockSizeRecords, readCacheSize, writeCacheSize);
    int rowBlock = 1000;
    Iterator<Record> iter = new RecordsFromInput(input, tupleLength, colMap, rowBlock);
    // ProgressMonitor.
    ProgressMonitor monitor = ProgressMonitorOutput.create(BulkLoaderX.LOG_Index, indexName, tickPoint, superTick);
    ProgressIterator<Record> iter2 = new ProgressIterator<>(iter, monitor);
    monitor.start();
    BPlusTree bpt2 = BPlusTreeRewriter.packIntoBPlusTree(iter2, bptParams, recordFactory, blkState, blkMgrNodes, blkMgrRecords);
    bpt2.close();
    monitor.finish();
    // [BULK] End stage.
    long count = monitor.getTicks();
    return count;
}
Also used : BPlusTreeParams(org.apache.jena.dboe.trans.bplustree.BPlusTreeParams) FileSet(org.apache.jena.dboe.base.file.FileSet) TDBException(org.apache.jena.tdb2.TDBException) BufferChannel(org.apache.jena.dboe.base.file.BufferChannel) DatasetGraphTDB(org.apache.jena.tdb2.store.DatasetGraphTDB) TupleMap(org.apache.jena.atlas.lib.tuple.TupleMap) ProgressIterator(org.apache.jena.system.progress.ProgressIterator) ProgressMonitor(org.apache.jena.system.progress.ProgressMonitor) RecordFactory(org.apache.jena.dboe.base.record.RecordFactory) BlockMgr(org.apache.jena.dboe.base.block.BlockMgr) Record(org.apache.jena.dboe.base.record.Record) TupleIndex(org.apache.jena.tdb2.store.tupletable.TupleIndex) BPlusTree(org.apache.jena.dboe.trans.bplustree.BPlusTree) Location(org.apache.jena.dboe.base.file.Location)

Aggregations

BufferChannel (org.apache.jena.dboe.base.file.BufferChannel)3 FileSet (org.apache.jena.dboe.base.file.FileSet)3 Record (org.apache.jena.dboe.base.record.Record)3 BPlusTree (org.apache.jena.dboe.trans.bplustree.BPlusTree)3 BPlusTreeParams (org.apache.jena.dboe.trans.bplustree.BPlusTreeParams)3 BlockMgr (org.apache.jena.dboe.base.block.BlockMgr)2 RecordFactory (org.apache.jena.dboe.base.record.RecordFactory)2 ProgressIterator (org.apache.jena.system.progress.ProgressIterator)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 OutputStream (java.io.OutputStream)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Iterator (java.util.Iterator)1 List (java.util.List)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 IO (org.apache.jena.atlas.io.IO)1 IteratorSlotted (org.apache.jena.atlas.iterator.IteratorSlotted)1 org.apache.jena.atlas.lib (org.apache.jena.atlas.lib)1