Use of org.sweble.wikitext.parser.WikitextEncodingValidator in the OpenRefine project (by OpenRefine).
Class WikitextImporter, method parseOneFile.
/**
 * Reads the whole wikitext from {@code reader}, runs it through the Sweble
 * validation / pre-processing / parsing pipeline, collects table data with a
 * {@code WikitextTableVisitor} and hands the resulting reader to
 * {@code TabularImportingParserBase.readTable}.
 *
 * <p>Options read from {@code options}:
 * <ul>
 *   <li>{@code blankSpanningCells} (default {@code true})</li>
 *   <li>{@code includeRawTemplates} (default {@code false})</li>
 *   <li>{@code parseReferences} (default {@code true})</li>
 *   <li>{@code wikiUrl} (default {@code null}); reconciliation is only attempted when it is set</li>
 *   <li>{@code reconService} (default {@code https://wikidata.reconci.link/en/api})</li>
 * </ul>
 *
 * <p>Neither {@code IOException} nor {@code ParseException} propagates to the caller:
 * both are appended to {@code exceptions} and their stack traces are printed.
 *
 * @param project    project that receives the imported table
 * @param metadata   project metadata; its name is set to the table caption when one is found
 * @param job        importing job forwarded to the tabular reader
 * @param fileSource not used by this method
 * @param reader     source of the wikitext; read in full
 * @param limit      forwarded to the tabular reader
 * @param options    importer options (see above)
 * @param exceptions sink for failures encountered while importing
 */
@Override
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource, Reader reader, int limit, ObjectNode options, List<Exception> exceptions) {
    // Set-up a simple wiki configuration
    ParserConfig parserConfig = new SimpleParserConfig();
    try {
        // Encoding validation
        WikitextEncodingValidator v = new WikitextEncodingValidator();
        String wikitext = CharStreams.toString(reader);
        String title = "Page title";
        ValidatedWikitext validated = v.validate(parserConfig, wikitext, title);

        // Pre-processing
        WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig);
        WtPreproWikitextPage prepArticle = (WtPreproWikitextPage) prep.parseArticle(validated, title, false);

        // Parsing
        PreprocessedWikitext ppw = PreprocessorToParserTransformer.transform(prepArticle);
        WikitextParser parser = new WikitextParser(parserConfig);
        WtParsedWikitextPage parsedArticle = (WtParsedWikitextPage) parser.parseArticle(ppw, title);

        // Compile the retrieved page
        boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true);
        boolean includeRawTemplates = JSONUtilities.getBoolean(options, "includeRawTemplates", false);
        boolean parseReferences = JSONUtilities.getBoolean(options, "parseReferences", true);
        final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells, includeRawTemplates);
        vs.go(parsedArticle);
        WikiTableDataReader dataReader = new WikiTableDataReader(vs, parseReferences);

        // Reconcile if needed
        String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null);
        // Wikidata reconciliation endpoint, hardcoded because the user might not have it in their services
        String reconUrl = JSONUtilities.getString(options, "reconService", "https://wikidata.reconci.link/en/api");
        // The config is built unconditionally because it is also attached to reconciled columns below.
        StandardReconConfig cfg = getReconConfig(reconUrl);
        if (wikiUrl != null) {
            dataReader.reconcileToQids(wikiUrl, cfg);
        }

        // Set metadata
        if (vs.caption != null && vs.caption.length() > 0) {
            metadata.setName(vs.caption);
            // TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way?
        }

        TabularImportingParserBase.readTable(project, job, dataReader, limit, options, exceptions);

        // Add reconciliation statistics
        if (dataReader.columnReconciled != null) {
            for (int i = 0; i < dataReader.columnReconciled.size(); i++) {
                if (dataReader.columnReconciled.get(i)) {
                    Column col = project.columnModel.columns.get(i);
                    col.setReconStats(ReconStats.create(project, i));
                    col.setReconConfig(cfg);
                }
            }
        }
    } catch (IOException e1) {
        // Report read failures to the caller as well, instead of only printing them:
        // otherwise a failed read is indistinguishable from a successful import of nothing.
        exceptions.add(e1);
        e1.printStackTrace();
    } catch (ParseException e1) {
        exceptions.add(e1);
        e1.printStackTrace();
    }
}
Aggregations