Use of org.wikidata.query.rdf.tool.rdf.NormalizingRdfHandler in the project wikidata-query-rdf by wikimedia.
From the class WikibaseRepository, method fetchRdfForEntity:
/**
 * Fetch the RDF for some entity.
 *
 * @throws RetryableException thrown if there is an error communicating with
 *             wikibase
 */
public Collection<Statement> fetchRdfForEntity(String entityId) throws RetryableException {
    // TODO handle ?flavor=dump or whatever parameters we need
    URI uri = uris.rdf(entityId);
    long start = System.currentTimeMillis();
    log.debug("Fetching rdf from {}", uri);
    RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
    StatementCollector collector = new StatementCollector();
    // Normalize statements as they stream out of the parser, before the collector sees them.
    parser.setRDFHandler(new NormalizingRdfHandler(collector));
    HttpGet request = new HttpGet(uri);
    request.setConfig(configWithTimeout);
    try {
        try (CloseableHttpResponse response = client.execute(request)) {
            if (response.getStatusLine().getStatusCode() == 404) {
                // A deleted/nonexistent page
                return Collections.emptyList();
            }
            if (response.getStatusLine().getStatusCode() >= 300) {
                throw new ContainedException("Unexpected status code fetching RDF for " + uri + ": "
                        + response.getStatusLine().getStatusCode());
            }
            parser.parse(new InputStreamReader(response.getEntity().getContent(), Charsets.UTF_8), uri.toString());
        }
    } catch (UnknownHostException | SocketException | SSLHandshakeException e) {
        // We want to bail on this, since it happens to be sticky for some reason
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RetryableException("Error fetching RDF for " + uri, e);
    } catch (RDFParseException | RDFHandlerException e) {
        throw new ContainedException("RDF parsing error for " + uri, e);
    }
    log.debug("Done in {} ms", System.currentTimeMillis() - start);
    return collector.getStatements();
}
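
For context, here is a minimal standalone sketch of the same wrapping pattern: a NormalizingRdfHandler delegating to a StatementCollector while a Turtle parser runs. The Turtle snippet, class name, and the org.openrdf import paths are illustrative assumptions (the project has built against both the Sesame org.openrdf and the RDF4J org.eclipse.rdf4j packages, which expose the same class names); only the handler-wrapping pattern is taken from the method above.

import java.io.StringReader;
import org.openrdf.model.Statement;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.Rio;
import org.openrdf.rio.helpers.StatementCollector;
import org.wikidata.query.rdf.tool.rdf.NormalizingRdfHandler;

public class NormalizingHandlerSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative Turtle data; any entity triples would do here.
        String turtle = "<http://www.wikidata.org/entity/Q42> "
                + "<http://schema.org/name> \"Douglas Adams\" .";
        RDFParser parser = Rio.createParser(RDFFormat.TURTLE);
        StatementCollector collector = new StatementCollector();
        // As in fetchRdfForEntity: statements pass through the normalizer
        // before the collector ever sees them.
        parser.setRDFHandler(new NormalizingRdfHandler(collector));
        parser.parse(new StringReader(turtle), "http://www.wikidata.org/entity/");
        for (Statement statement : collector.getStatements()) {
            System.out.println(statement);
        }
    }
}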
Use of org.wikidata.query.rdf.tool.rdf.NormalizingRdfHandler in the project wikidata-query-rdf by wikimedia.
From the class Munge, method run:
public void run() throws RDFHandlerException, IOException, RDFParseException, InterruptedException {
    try {
        AsyncRDFHandler chunkWriter = AsyncRDFHandler.processAsync(new RDFChunkWriter(chunkFileFormat), false, BUFFER_SIZE);
        AtomicLong actualChunk = new AtomicLong(0);
        EntityMungingRdfHandler.EntityCountListener chunker = (entities) -> {
            long currentChunk = entities / chunkSize;
            if (currentChunk != actualChunk.get()) {
                actualChunk.set(currentChunk);
                // endRDF will cause RDFChunkWriter to start writing a new chunk
                chunkWriter.endRDF();
            }
        };
        EntityMungingRdfHandler munger = new EntityMungingRdfHandler(uris, this.munger, chunkWriter, chunker);
        RDFParser parser = RDFParserSuppliers.defaultRdfParser().get(AsyncRDFHandler.processAsync(new NormalizingRdfHandler(munger), true, BUFFER_SIZE));
        parser.parse(from, uris.root());
        // thread:main: parser -> AsyncRDFHandler -> queue
        // thread:replayer1: Normalizing/Munging -> AsyncRDFHandler -> queue
        // thread:replayer2: RDFChunkWriter -> RDFWriter -> IO
        chunkWriter.waitForCompletion();
    } finally {
        try {
            from.close();
        } catch (IOException e) {
            log.error("Error closing input", e);
        }
    }
}
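
The chunking lambda above rolls over to a new output chunk whenever the integer division entities / chunkSize crosses a boundary. Below is a standalone sketch of just that arithmetic, with an assumed chunkSize of 3 and a simulated entity counter; everything here is illustrative, and only the boundary test mirrors the listener above.

import java.util.concurrent.atomic.AtomicLong;

public class ChunkBoundaryDemo {
    public static void main(String[] args) {
        long chunkSize = 3; // illustrative; Munge receives this as configuration
        AtomicLong actualChunk = new AtomicLong(0);
        // Simulate the EntityCountListener being invoked after each munged entity.
        for (long entities = 1; entities <= 10; entities++) {
            long currentChunk = entities / chunkSize;
            if (currentChunk != actualChunk.get()) {
                actualChunk.set(currentChunk);
                // In Munge.run this is the point where chunkWriter.endRDF()
                // makes RDFChunkWriter begin a new chunk file.
                System.out.println("entity " + entities + " -> new chunk " + currentChunk);
            }
        }
    }
}

With a chunkSize of 3, the boundary fires after entities 3, 6, and 9, which is exactly when Munge starts a new chunk file.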