Example use of org.sweble.wikitext.parser.WikitextPreprocessor in the project wikivoyage-listings (by baturin),
taken from the class ArticleParser, method parsePage.
/**
 * Parse a single Wikivoyage page, look for listings, and collect them into a list of POIs.
 *
 * @param article name of the Wikivoyage article (used for logging and passed to processNode)
 * @param text    Wikivoyage page source as a wikitext string
 * @return the listings found in the page; empty if parsing failed
 */
public List<Listing> parsePage(String article, String text) {
    log.debug("Start: parse article '" + article + "'");
    LinkedList<Listing> pois = new LinkedList<>();
    try {
        ParserConfig config = new SimpleParserConfig();
        WikitextPreprocessor p = new WikitextPreprocessor(config);
        WtNode node = p.parseArticle(text, "");
        processNode(article, node, pois);
    } catch (Exception e) {
        // Report through the logger (with the article name for context) instead of
        // printing "Failure" to stderr; passing the exception preserves the stack trace.
        log.error("Failed to parse article '" + article + "'", e);
    }
    log.debug("End: parse article '" + article + "'");
    return pois;
}
Example use of org.sweble.wikitext.parser.WikitextPreprocessor in the project OpenRefine (by OpenRefine),
taken from the class WikitextImporter, method parseOneFile.
/**
 * Parse a single wiki file: validate the encoding, preprocess and parse the wikitext,
 * extract the table via {@code WikitextTableVisitor}, optionally reconcile cell values
 * to Wikidata QIDs, and load the resulting rows into the project.
 * Parse and I/O failures are reported through the {@code exceptions} list.
 */
@Override
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource, Reader reader, int limit, ObjectNode options, List<Exception> exceptions) {
    // Set up a simple wiki configuration
    ParserConfig parserConfig = new SimpleParserConfig();
    try {
        // Encoding validation
        WikitextEncodingValidator v = new WikitextEncodingValidator();
        String wikitext = CharStreams.toString(reader);
        String title = "Page title";
        ValidatedWikitext validated = v.validate(parserConfig, wikitext, title);
        // Pre-processing
        WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig);
        WtPreproWikitextPage prepArticle = (WtPreproWikitextPage) prep.parseArticle(validated, title, false);
        // Parsing
        PreprocessedWikitext ppw = PreprocessorToParserTransformer.transform(prepArticle);
        WikitextParser parser = new WikitextParser(parserConfig);
        WtParsedWikitextPage parsedArticle = (WtParsedWikitextPage) parser.parseArticle(ppw, title);
        // Compile the retrieved page
        boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true);
        boolean includeRawTemplates = JSONUtilities.getBoolean(options, "includeRawTemplates", false);
        boolean parseReferences = JSONUtilities.getBoolean(options, "parseReferences", true);
        final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells, includeRawTemplates);
        vs.go(parsedArticle);
        WikiTableDataReader dataReader = new WikiTableDataReader(vs, parseReferences);
        // Reconcile if needed
        String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null);
        // Wikidata reconciliation endpoint, hardcoded because the user might not have it in their services
        String reconUrl = JSONUtilities.getString(options, "reconService", "https://wikidata.reconci.link/en/api");
        StandardReconConfig cfg = getReconConfig(reconUrl);
        if (wikiUrl != null) {
            dataReader.reconcileToQids(wikiUrl, cfg);
        }
        // Set metadata
        if (vs.caption != null && vs.caption.length() > 0) {
            metadata.setName(vs.caption);
            // TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way?
        }
        TabularImportingParserBase.readTable(project, job, dataReader, limit, options, exceptions);
        // Add reconciliation statistics
        if (dataReader.columnReconciled != null) {
            for (int i = 0; i < dataReader.columnReconciled.size(); i++) {
                if (dataReader.columnReconciled.get(i)) {
                    Column col = project.columnModel.columns.get(i);
                    col.setReconStats(ReconStats.create(project, i));
                    col.setReconConfig(cfg);
                }
            }
        }
    } catch (IOException | ParseException e1) {
        // Surface the failure to the importing framework instead of only printing the
        // stack trace: previously IOExceptions were swallowed (printed to stderr but
        // never added to the exceptions list, unlike ParseExceptions).
        exceptions.add(e1);
    }
}
Aggregations