use of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor in project stanbol by apache.
the class TestHtmlExtractor method testMicrodataExtraction.
/**
 * Tests the extraction of microdata from an HTML-5 document.
 *
 * @throws Exception if loading the test resource or the extraction fails
 */
@Test
public void testMicrodataExtraction() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String testFile = "test-microdata.html";
    // extract the microdata from the annotated html; try-with-resources makes
    // sure the stream is closed even if the assertion or the extraction fails
    try (InputStream in = getResourceAsStream(testFile)) {
        assertNotNull("failed to load resource " + testFile, in);
        extractor.extract("file://" + testFile, in, null, "text/html", model);
    }
    // show triples
    int tripleCounter = model.size();
    LOG.debug("Microdata triples: {}", tripleCounter);
    printTriples(model);
    assertEquals(91, tripleCounter);
    // exercised only for the absence of exceptions; the result is not asserted here
    ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
}
use of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor in project stanbol by apache.
the class TestHtmlExtractor method testMFExtraction.
/**
 * Tests the extraction of Microformats from an HTML document.
 *
 * @throws ExtractorException if there is an error during extraction
 * @throws IOException if there is an error when reading the document
 */
@Test
public void testMFExtraction() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String testFile = "test-MF.html";
    // extract the Microformat data from the annotated html; try-with-resources
    // makes sure the stream is closed even if the assertion or the extraction fails
    try (InputStream in = getResourceAsStream(testFile)) {
        assertNotNull("failed to load resource " + testFile, in);
        extractor.extract("file://" + testFile, in, null, "text/html", model);
    }
    // show triples
    int tripleCounter = model.size();
    LOG.debug("Microformat triples: {}", tripleCounter);
    printTriples(model);
    assertEquals(127, tripleCounter);
    // exercised only for the absence of exceptions; the result is not asserted here
    ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
}
use of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor in project stanbol by apache.
the class TestHtmlExtractor method testRootExtraction.
/**
 * Tests the merging of disconnected graphs under a single root: the extracted
 * model must have more than one root before {@code makeConnected} is applied
 * and exactly one afterwards.
 *
 * @throws Exception if loading the test resource or the extraction fails
 */
@Test
public void testRootExtraction() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String testFile = "test-MultiRoot.html";
    // extract the annotations from the multi-root html; try-with-resources makes
    // sure the stream is closed even if the assertion or the extraction fails
    try (InputStream in = getResourceAsStream(testFile)) {
        assertNotNull("failed to load resource " + testFile, in);
        extractor.extract("file://" + testFile, in, null, "text/html", model);
    }
    // show triples
    int tripleCounter = model.size();
    LOG.debug("Triples: {}", tripleCounter);
    printTriples(model);
    // precondition: the raw extraction result is not single rooted
    Set<BlankNodeOrIRI> roots = ClerezzaRDFUtils.findRoots(model);
    assertTrue(roots.size() > 1);
    ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
    // after connecting, only a single root must remain
    roots = ClerezzaRDFUtils.findRoots(model);
    assertEquals(1, roots.size());
}
use of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor in project stanbol by apache.
the class TestHtmlExtractor method testRdfaExtraction.
/**
 * Tests the extraction of RDFa annotations from an HTML document.
 *
 * @throws ExtractorException if there is an error during extraction
 * @throws IOException if there is an error when reading the document
 */
@Test
public void testRdfaExtraction() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String testFile = "test-rdfa.html";
    // extract the RDFa annotations from the html; try-with-resources makes sure
    // the stream is closed even if the assertion or the extraction fails
    try (InputStream in = getResourceAsStream(testFile)) {
        assertNotNull("failed to load resource " + testFile, in);
        extractor.extract("file://" + testFile, in, null, "text/html", model);
    }
    // show triples
    int tripleCounter = model.size();
    LOG.debug("RDFa triples: {}", tripleCounter);
    printTriples(model);
    assertEquals(8, tripleCounter);
    // exercised only for the absence of exceptions; the result is not asserted here
    ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
}
use of org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor in project stanbol by apache.
the class HtmlExtractorEngine method computeEnhancements.
/**
 * Runs the {@link HtmlExtractor} over the content of the given item and adds
 * the extracted triples to the metadata of that item.
 * <p>
 * The content is read while holding the item's read lock; the metadata is
 * modified while holding the item's write lock.
 *
 * @param ci the content item to process
 * @throws EngineException if the extractor fails with an {@code ExtractorException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser);
    Graph model = new SimpleGraph();
    ci.getLock().readLock().lock();
    try {
        extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(), null, ci.getMimeType(), model);
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with HtmlExtractor", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    ClerezzaRDFUtils.urifyBlankNodes(model);
    // make the model single rooted
    if (singleRootRdf) {
        ClerezzaRDFUtils.makeConnected(model, ci.getUri(), new IRI(NIE_NS + "contains"));
    }
    // log before taking the write lock: the model is still local to this call,
    // so rendering it does not need to happen while the lock is held
    LOG.info("Model: {}", model);
    // add the extracted triples to the metadata of the ContentItem
    ci.getLock().writeLock().lock();
    try {
        ci.getMetadata().addAll(model);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Aggregations