Search in sources :

Example 1 with Munger

use of org.wikidata.query.rdf.tool.rdf.Munger in project wikidata-query-rdf by wikimedia.

the class Update method createUpdater.

/**
 * Build the {@link Updater} that will process changes from the given source.
 *
 * <p>The updater is handed a thread pool whose core and maximum size are both
 * {@code options.threadCount()}, made of daemon threads named with the
 * {@code "update %s"} format. When {@code options.testMode()} is set a
 * {@code TestUpdater} is created instead of a plain {@code Updater}; both are
 * constructed with exactly the same arguments.
 *
 * @param options source of the thread count, test mode flag, poll delay, verify flag and munger settings
 * @param wikibaseRepository handed to the updater unchanged
 * @param uris handed to the updater unchanged
 * @param rdfRepository handed to the updater unchanged
 * @param changeSource handed to the updater unchanged
 * @return a newly created updater
 */
private static Updater<? extends Change.Batch> createUpdater(UpdateOptions options, WikibaseRepository wikibaseRepository, WikibaseUris uris, RdfRepository rdfRepository, Change.Source<? extends Change.Batch> changeSource) {
    final int poolSize = options.threadCount();
    final ThreadFactoryBuilder workerThreads = new ThreadFactoryBuilder()
            .setDaemon(true)
            .setNameFormat("update %s");
    // Core and maximum pool size are the same value and the work queue is unbounded.
    final ExecutorService workers = new ThreadPoolExecutor(
            poolSize,
            poolSize,
            0,
            TimeUnit.SECONDS,
            new LinkedBlockingQueue<>(),
            workerThreads.build());
    final Munger entityMunger = mungerFromOptions(options);
    if (!options.testMode()) {
        return new Updater<>(changeSource, wikibaseRepository, rdfRepository, entityMunger, workers, options.pollDelay(), uris, options.verify());
    }
    return new TestUpdater<>(changeSource, wikibaseRepository, rdfRepository, entityMunger, workers, options.pollDelay(), uris, options.verify());
}
Also used : Munger(org.wikidata.query.rdf.tool.rdf.Munger) ExecutorService(java.util.concurrent.ExecutorService) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor)

Example 2 with Munger

use of org.wikidata.query.rdf.tool.rdf.Munger in project wikidata-query-rdf by wikimedia.

the class UpdaterUnitTest method testUpdateLeftOffTime.

/**
 * Runs an {@link Updater} over two batches and checks the recorded "left off" times.
 *
 * <p>The test expects {@code updateLeftOffTime} to be called on the RDF
 * repository once per batch, each time with that batch's left-off instant
 * minus one second, and expects both batches to be marked done on the source.
 */
@Test
public void testUpdateLeftOffTime() {
    Instant leftOffInstant1 = Instant.ofEpochMilli(25);
    Instant leftOffInstant2 = Instant.ofEpochSecond(40);
    ImmutableList<Change> changes = ImmutableList.of(new Change("Q2", 1, Instant.ofEpochSecond(10), 2), new Change("Q3", 2, Instant.ofEpochMilli(20), 3));
    TestChange batch1 = new TestChange(changes, 20, leftOffInstant1, false);
    changes = ImmutableList.of(new Change("Q2", 1, Instant.ofEpochSecond(30), 4), new Change("Q3", 2, Instant.ofEpochMilli(40), 5));
    TestChange batch2 = new TestChange(changes, 20, leftOffInstant2, true);
    TestChangeSource source = new TestChangeSource(Arrays.asList(batch1, batch2));
    WikibaseRepository wbRepo = mock(WikibaseRepository.class);
    RdfRepository rdfRepo = mock(RdfRepository.class);
    // The mocked repository reports a metrics object built with a mutation count of 0 for every sync.
    CollectedUpdateMetrics mutationCountOnlyMetrics = CollectedUpdateMetrics.getMutationCountOnlyMetrics(0);
    when(rdfRepo.syncFromChanges(anyCollectionOf(Change.class), anyBoolean())).thenReturn(mutationCountOnlyMetrics);
    Munger munger = Munger.builder(UrisSchemeFactory.WIKIDATA).build();
    ExecutorService executorService = Executors.newFixedThreadPool(2, (r) -> new Thread(r, "Thread-" + this.getClass().getSimpleName()));
    try {
        MetricRegistry metricRegistry = new MetricRegistry();
        Updater<TestChange> updater = new Updater<>(source, wbRepo, rdfRepo, munger, executorService, true, 100, UrisSchemeFactory.WIKIDATA, false, metricRegistry);
        updater.run();
        verify(rdfRepo, times(2)).updateLeftOffTime(lestOffDateCaptor.capture());
        assertThat(lestOffDateCaptor.getAllValues()).containsExactly(leftOffInstant1.minusSeconds(1), leftOffInstant2.minusSeconds(1));
        assertThat(source.isBatchMarkedDone(batch1)).isTrue();
        assertThat(source.isBatchMarkedDone(batch2)).isTrue();
    } finally {
        // The pool is created by this test, so release its worker threads here,
        // whether the test passes or fails, so they do not outlive it.
        executorService.shutdownNow();
    }
}
Also used : CollectedUpdateMetrics(org.wikidata.query.rdf.tool.rdf.CollectedUpdateMetrics) Instant(java.time.Instant) Munger(org.wikidata.query.rdf.tool.rdf.Munger) MetricRegistry(com.codahale.metrics.MetricRegistry) WikibaseRepository(org.wikidata.query.rdf.tool.wikibase.WikibaseRepository) RdfRepository(org.wikidata.query.rdf.tool.rdf.RdfRepository) Change(org.wikidata.query.rdf.tool.change.Change) ExecutorService(java.util.concurrent.ExecutorService) Test(org.junit.Test)

Example 3 with Munger

use of org.wikidata.query.rdf.tool.rdf.Munger in project wikidata-query-rdf by wikimedia.

the class MungeIntegrationTest method loadDumpIntoRepo.

/**
 * Munge a dump into a temporary file and load that file into the RDF repository.
 *
 * <p>The munger is built in single label mode for {@code "en"}. The munged
 * output is written to a freshly created temporary {@code .ttl} file, which is
 * then loaded by URL through the repository client. The value returned by the
 * load is asserted to equal {@code count}.
 *
 * @param dumpReader reader over the dump to munge
 * @param count value the load is expected to return
 * @throws IOException declared by creating the temporary file and by running the munge
 * @throws InterruptedException declared by running the munge
 * @throws RDFParseException declared by running the munge
 * @throws RDFHandlerException declared by running the munge
 */
private void loadDumpIntoRepo(Reader dumpReader, int count) throws IOException, InterruptedException, RDFParseException, RDFHandlerException {
    Munger munger = Munger.builder(uris).singleLabelMode("en").build();
    File file = File.createTempFile("munge-test", ".ttl");
    // Do not leave the temporary dump behind; deleting at JVM exit keeps the file
    // available for the whole duration of the load below.
    file.deleteOnExit();
    String fileURL = file.toURI().toURL().toString();
    // Integer.MAX_VALUE is passed as the chunk size.
    Munge munge = new Munge(uris, munger, dumpReader, Integer.MAX_VALUE, file.getAbsolutePath());
    munge.run();
    assertEquals(count, (long) rdfRepository.getClient().loadUrl(fileURL));
}
Also used : Munger(org.wikidata.query.rdf.tool.rdf.Munger) File(java.io.File)

Example 4 with Munger

use of org.wikidata.query.rdf.tool.rdf.Munger in project wikidata-query-rdf by wikimedia.

the class Munge method main.

/**
 * Command line entry point: run one bulk munge described by {@code args}.
 *
 * <p>A configured chunk size below 1 is replaced by {@link Integer#MAX_VALUE}.
 * Any exception raised while building or running the munge is logged and the
 * JVM exits with status 1.
 *
 * @param args command line arguments, parsed into {@code MungeOptions}
 */
@SuppressWarnings("IllegalCatch")
public static void main(String[] args) {
    final MungeOptions options = handleOptions(MungeOptions.class, args);
    final UrisScheme uris = OptionsUtils.WikibaseOptions.wikibaseUris(options);
    final Munger munger = mungerFromOptions(options);
    final int requestedChunkSize = options.chunkSize();
    final int effectiveChunkSize = requestedChunkSize >= 1 ? requestedChunkSize : Integer.MAX_VALUE;
    try {
        new Munge(uris, munger, CliUtils.reader(options.from()), effectiveChunkSize, options.to()).run();
    } catch (Exception e) {
        log.error("Fatal error munging RDF", e);
        System.exit(1);
    }
}
Also used : UrisScheme(org.wikidata.query.rdf.common.uri.UrisScheme) Munger(org.wikidata.query.rdf.tool.rdf.Munger) MungeOptions(org.wikidata.query.rdf.tool.options.MungeOptions) RDFHandlerException(org.openrdf.rio.RDFHandlerException) IOException(java.io.IOException) RDFParseException(org.openrdf.rio.RDFParseException)

Example 5 with Munger

use of org.wikidata.query.rdf.tool.rdf.Munger in project wikidata-query-rdf by wikimedia.

the class Munge method run.

/**
 * Parse the input, munge it and write the result through the chunk writer.
 *
 * <p>Stages, as wired below:
 * <ul>
 *   <li>thread:main: parser -> AsyncRDFHandler -> queue</li>
 *   <li>thread:replayer1: Normalizing/Munging -> AsyncRDFHandler -> queue</li>
 *   <li>thread:replayer2: RDFChunkWriter -> RDFWriter -> IO</li>
 * </ul>
 * The input reader is always closed on the way out; a failure to close it is
 * logged rather than thrown.
 */
public void run() throws RDFHandlerException, IOException, RDFParseException, InterruptedException {
    try {
        final AsyncRDFHandler output = AsyncRDFHandler.processAsync(new RDFChunkWriter(chunkFileFormat), false, BUFFER_SIZE);
        final AtomicLong lastChunk = new AtomicLong(0);
        final EntityMungingRdfHandler.EntityCountListener onEntityCount = (entities) -> {
            final long chunkIndex = entities / chunkSize;
            if (chunkIndex == lastChunk.get()) {
                // Same chunk as the last notification: nothing to do.
                return;
            }
            lastChunk.set(chunkIndex);
            // endRDF will cause RDFChunkWriter to start writing a new chunk
            output.endRDF();
        };
        final EntityMungingRdfHandler entityHandler = new EntityMungingRdfHandler(uris, this.munger, output, onEntityCount);
        final RDFParser rdfParser = RDFParserSuppliers.defaultRdfParser().get(
                AsyncRDFHandler.processAsync(new NormalizingRdfHandler(entityHandler), true, BUFFER_SIZE));
        rdfParser.parse(from, uris.root());
        // Parsing has returned; wait on the output handler before returning.
        output.waitForCompletion();
    } finally {
        try {
            from.close();
        } catch (IOException e) {
            log.error("Error closing input", e);
        }
    }
}
Also used : Statement(org.openrdf.model.Statement) Munger(org.wikidata.query.rdf.tool.rdf.Munger) LoggerFactory(org.slf4j.LoggerFactory) NormalizingRdfHandler(org.wikidata.query.rdf.tool.rdf.NormalizingRdfHandler) LinkedHashMap(java.util.LinkedHashMap) RDFFormat(org.openrdf.rio.RDFFormat) Locale(java.util.Locale) Map(java.util.Map) MungeOptions(org.wikidata.query.rdf.tool.options.MungeOptions) BasicWriterSettings(org.openrdf.rio.helpers.BasicWriterSettings) AsyncRDFHandler(org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler) OptionsUtils.mungerFromOptions(org.wikidata.query.rdf.tool.options.OptionsUtils.mungerFromOptions) FALSE(java.lang.Boolean.FALSE) Logger(org.slf4j.Logger) RDFHandlerException(org.openrdf.rio.RDFHandlerException) OptionsUtils(org.wikidata.query.rdf.tool.options.OptionsUtils) RDFParserSuppliers(org.wikidata.query.rdf.tool.rdf.RDFParserSuppliers) WriterConfig(org.openrdf.rio.WriterConfig) IOException(java.io.IOException) Rio(org.openrdf.rio.Rio) Reader(java.io.Reader) PrefixRecordingRdfHandler(org.wikidata.query.rdf.tool.rdf.PrefixRecordingRdfHandler) AtomicLong(java.util.concurrent.atomic.AtomicLong) RDFParser(org.openrdf.rio.RDFParser) OptionsUtils.handleOptions(org.wikidata.query.rdf.tool.options.OptionsUtils.handleOptions) RDFParseException(org.openrdf.rio.RDFParseException) UrisScheme(org.wikidata.query.rdf.common.uri.UrisScheme) Writer(java.io.Writer) EntityMungingRdfHandler(org.wikidata.query.rdf.tool.rdf.EntityMungingRdfHandler) RDFHandler(org.openrdf.rio.RDFHandler) RDFWriter(org.openrdf.rio.RDFWriter) AtomicLong(java.util.concurrent.atomic.AtomicLong) EntityMungingRdfHandler(org.wikidata.query.rdf.tool.rdf.EntityMungingRdfHandler) AsyncRDFHandler(org.wikidata.query.rdf.tool.rdf.AsyncRDFHandler) IOException(java.io.IOException) RDFParser(org.openrdf.rio.RDFParser) NormalizingRdfHandler(org.wikidata.query.rdf.tool.rdf.NormalizingRdfHandler)

Aggregations

Munger (org.wikidata.query.rdf.tool.rdf.Munger): 6; IOException (java.io.IOException): 3; ExecutorService (java.util.concurrent.ExecutorService): 3; UrisScheme (org.wikidata.query.rdf.common.uri.UrisScheme): 3; MetricRegistry (com.codahale.metrics.MetricRegistry): 2; Instant (java.time.Instant): 2; RDFHandlerException (org.openrdf.rio.RDFHandlerException): 2; RDFParseException (org.openrdf.rio.RDFParseException): 2; Change (org.wikidata.query.rdf.tool.change.Change): 2; MungeOptions (org.wikidata.query.rdf.tool.options.MungeOptions): 2; ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder): 1; File (java.io.File): 1; Reader (java.io.Reader): 1; Writer (java.io.Writer): 1; FALSE (java.lang.Boolean.FALSE): 1; URI (java.net.URI): 1; URISyntaxException (java.net.URISyntaxException): 1; Duration (java.time.Duration): 1; LinkedHashMap (java.util.LinkedHashMap): 1; Locale (java.util.Locale): 1