Use of org.wikidata.query.rdf.tool.rdf.Munger in the project wikidata-query-rdf by Wikimedia.
From the class Update, method createUpdater:
/**
 * Creates the {@link Updater} that applies changes from {@code changeSource}
 * to {@code rdfRepository}, or a {@code TestUpdater} when test mode is enabled.
 *
 * @param options parsed command line options (thread count, poll delay, flags)
 * @param wikibaseRepository repository used to fetch entity data
 * @param uris Wikibase URI scheme in use
 * @param rdfRepository RDF store receiving the updates
 * @param changeSource source of change batches to apply
 * @return a newly created updater
 */
private static Updater<? extends Change.Batch> createUpdater(UpdateOptions options, WikibaseRepository wikibaseRepository, WikibaseUris uris, RdfRepository rdfRepository, Change.Source<? extends Change.Batch> changeSource) {
    int poolSize = options.threadCount();
    // Daemon threads so a stuck update never blocks JVM shutdown.
    ThreadFactoryBuilder threadFactoryBuilder = new ThreadFactoryBuilder().setDaemon(true).setNameFormat("update %s");
    // Fixed-size pool over an unbounded queue; keep-alive of 0 is irrelevant since
    // core == max and core threads are not reaped.
    ExecutorService executor = new ThreadPoolExecutor(poolSize, poolSize, 0, TimeUnit.SECONDS, new LinkedBlockingQueue<>(), threadFactoryBuilder.build());
    Munger munger = mungerFromOptions(options);
    if (!options.testMode()) {
        return new Updater<>(changeSource, wikibaseRepository, rdfRepository, munger, executor, options.pollDelay(), uris, options.verify());
    }
    return new TestUpdater<>(changeSource, wikibaseRepository, rdfRepository, munger, executor, options.pollDelay(), uris, options.verify());
}
Use of org.wikidata.query.rdf.tool.rdf.Munger in the project wikidata-query-rdf by Wikimedia.
From the class UpdaterUnitTest, method testUpdateLeftOffTime:
// Verifies that Updater.run() records the "left off" timestamp after each batch and
// marks every batch as done at its source.
@Test
public void testUpdateLeftOffTime() {
// Two batches with distinct leftOff instants; the second batch is flagged as the last one.
Instant leftOffInstant1 = Instant.ofEpochMilli(25);
Instant leftOffInstant2 = Instant.ofEpochSecond(40);
ImmutableList<Change> changes = ImmutableList.of(new Change("Q2", 1, Instant.ofEpochSecond(10), 2), new Change("Q3", 2, Instant.ofEpochMilli(20), 3));
TestChange batch1 = new TestChange(changes, 20, leftOffInstant1, false);
changes = ImmutableList.of(new Change("Q2", 1, Instant.ofEpochSecond(30), 4), new Change("Q3", 2, Instant.ofEpochMilli(40), 5));
TestChange batch2 = new TestChange(changes, 20, leftOffInstant2, true);
TestChangeSource source = new TestChangeSource(Arrays.asList(batch1, batch2));
// Mock both repositories; syncFromChanges reports mutation-count-only metrics with 0 mutations.
WikibaseRepository wbRepo = mock(WikibaseRepository.class);
RdfRepository rdfRepo = mock(RdfRepository.class);
CollectedUpdateMetrics mutationCountOnlyMetrics = CollectedUpdateMetrics.getMutationCountOnlyMetrics(0);
when(rdfRepo.syncFromChanges(anyCollectionOf(Change.class), anyBoolean())).thenReturn(mutationCountOnlyMetrics);
Munger munger = Munger.builder(UrisSchemeFactory.WIKIDATA).build();
ExecutorService executorService = Executors.newFixedThreadPool(2, (r) -> new Thread(r, "Thread-" + this.getClass().getSimpleName()));
MetricRegistry metricRegistry = new MetricRegistry();
Updater<TestChange> updater = new Updater<>(source, wbRepo, rdfRepo, munger, executorService, true, 100, UrisSchemeFactory.WIKIDATA, false, metricRegistry);
updater.run();
// One updateLeftOffTime call per batch is expected; the assertions below show the test
// expects each recorded instant to be one second BEFORE the batch's leftOff instant.
// NOTE(review): "lestOffDateCaptor" (a captor field declared elsewhere in this class)
// looks like a typo for "leftOffDateCaptor" — worth renaming at the declaration site.
verify(rdfRepo, times(2)).updateLeftOffTime(lestOffDateCaptor.capture());
assertThat(lestOffDateCaptor.getAllValues()).containsExactly(leftOffInstant1.minusSeconds(1), leftOffInstant2.minusSeconds(1));
// Both batches must have been acknowledged back to the change source.
assertThat(source.isBatchMarkedDone(batch1)).isTrue();
assertThat(source.isBatchMarkedDone(batch2)).isTrue();
}
Use of org.wikidata.query.rdf.tool.rdf.Munger in the project wikidata-query-rdf by Wikimedia.
From the class MungeIntegrationTest, method loadDumpIntoRepo:
/**
 * Munges the dump read from {@code dumpReader} into a temporary Turtle file and
 * loads that file into the test RDF repository, asserting the statement count.
 *
 * @param dumpReader reader over the raw RDF dump
 * @param count expected number of statements reported by the repository load
 */
private void loadDumpIntoRepo(Reader dumpReader, int count) throws IOException, InterruptedException, RDFParseException, RDFHandlerException {
// Single-label mode keeps only the English label, mirroring production munge config.
Munger munger = Munger.builder(uris).singleLabelMode("en").build();
File file = File.createTempFile("munge-test", ".ttl");
// Fix: the original never deleted this temp file; remove it when the JVM exits.
file.deleteOnExit();
String fileURL = file.toURI().toURL().toString();
// Integer.MAX_VALUE chunk size => everything lands in one output chunk file.
Munge munge = new Munge(uris, munger, dumpReader, Integer.MAX_VALUE, file.getAbsolutePath());
munge.run();
assertEquals(count, (long) rdfRepository.getClient().loadUrl(fileURL));
}
Use of org.wikidata.query.rdf.tool.rdf.Munger in the project wikidata-query-rdf by Wikimedia.
From the class Munge, method main:
/**
 * Entry point: runs a bulk munge configured entirely from the command line.
 * Any failure is logged and turned into a non-zero exit status.
 */
@SuppressWarnings("IllegalCatch")
public static void main(String[] args) {
    MungeOptions options = handleOptions(MungeOptions.class, args);
    UrisScheme urisScheme = OptionsUtils.WikibaseOptions.wikibaseUris(options);
    Munger munger = mungerFromOptions(options);
    // A chunk size below 1 means "do not chunk": write everything as a single chunk.
    int requestedChunkSize = options.chunkSize();
    int effectiveChunkSize = requestedChunkSize < 1 ? Integer.MAX_VALUE : requestedChunkSize;
    try {
        new Munge(urisScheme, munger, CliUtils.reader(options.from()), effectiveChunkSize, options.to()).run();
    } catch (Exception e) {
        log.error("Fatal error munging RDF", e);
        System.exit(1);
    }
}
Use of org.wikidata.query.rdf.tool.rdf.Munger in the project wikidata-query-rdf by Wikimedia.
From the class Munge, method run:
/**
 * Parses the RDF dump from {@code from}, munges each entity, and writes the
 * result through an async pipeline in chunks of {@code chunkSize} entities.
 * Blocks until the chunk writer has drained; always closes the input reader.
 *
 * @throws RDFHandlerException on a handler failure in the pipeline
 * @throws IOException on an I/O failure while reading or writing
 * @throws RDFParseException if the input dump is not valid RDF
 * @throws InterruptedException if interrupted while waiting for completion
 */
public void run() throws RDFHandlerException, IOException, RDFParseException, InterruptedException {
try {
// Chunk writer runs on its own replay thread (processAsync), buffered to BUFFER_SIZE.
AsyncRDFHandler chunkWriter = AsyncRDFHandler.processAsync(new RDFChunkWriter(chunkFileFormat), false, BUFFER_SIZE);
AtomicLong actualChunk = new AtomicLong(0);
// Listener fires per entity count; when we cross a chunkSize boundary, roll the chunk.
EntityMungingRdfHandler.EntityCountListener chunker = (entities) -> {
long currentChunk = entities / chunkSize;
if (currentChunk != actualChunk.get()) {
actualChunk.set(currentChunk);
// endRDF will cause RDFChunkWriter to start writing a new chunk
chunkWriter.endRDF();
}
};
EntityMungingRdfHandler munger = new EntityMungingRdfHandler(uris, this.munger, chunkWriter, chunker);
// Normalizing + munging also run async, decoupling the parser thread from the writer.
RDFParser parser = RDFParserSuppliers.defaultRdfParser().get(AsyncRDFHandler.processAsync(new NormalizingRdfHandler(munger), true, BUFFER_SIZE));
parser.parse(from, uris.root());
// thread:main: parser -> AsyncRDFHandler -> queue
// thread:replayer1: Normalizing/Munging -> AsyncRDFHandler -> queue
// thread:replayer2: RDFChunkWriter -> RDFWriter -> IO
chunkWriter.waitForCompletion();
} finally {
// Best-effort close of the input; a close failure is logged, not rethrown,
// so it never masks an exception from the pipeline above.
try {
from.close();
} catch (IOException e) {
log.error("Error closing input", e);
}
}
}
Aggregations