Search in sources :

Example 1 with ValidatedWikitext

use of org.sweble.wikitext.parser.encval.ValidatedWikitext in project OpenRefine by OpenRefine.

The method parseOneFile of the class WikitextImporter.

/**
 * Reads the whole of {@code reader} as wikitext, runs it through the Sweble
 * encoding validator, preprocessor and parser, extracts table data with a
 * {@code WikitextTableVisitor} and hands the result to
 * {@code TabularImportingParserBase.readTable}.
 *
 * <p>Options read from {@code options}:
 * <ul>
 *   <li>{@code blankSpanningCells} (boolean, default {@code true})</li>
 *   <li>{@code includeRawTemplates} (boolean, default {@code false})</li>
 *   <li>{@code parseReferences} (boolean, default {@code true})</li>
 *   <li>{@code wikiUrl} (string, default {@code null}); when present, cells are
 *       reconciled via {@code reconcileToQids}</li>
 *   <li>{@code reconService} (string, defaults to the Wikidata reconciliation endpoint)</li>
 * </ul>
 *
 * <p>This method does not throw on read or parse failures: both
 * {@link IOException} and {@code ParseException} are appended to
 * {@code exceptions} so that the caller can report them.
 *
 * @param project    project receiving the imported rows and columns
 * @param metadata   project metadata; its name is set to the table caption when one is found
 * @param job        importing job, forwarded to {@code readTable}
 * @param fileSource not used by this method
 * @param reader     source of the wikitext; read to the end but not closed here
 * @param limit      forwarded to {@code readTable}
 * @param options    importer options, see above
 * @param exceptions receives any exception raised while reading or parsing
 */
@Override
public void parseOneFile(Project project, ProjectMetadata metadata, ImportingJob job, String fileSource, Reader reader, int limit, ObjectNode options, List<Exception> exceptions) {
    // Set-up a simple wiki configuration
    ParserConfig parserConfig = new SimpleParserConfig();
    try {
        // Encoding validation
        WikitextEncodingValidator v = new WikitextEncodingValidator();
        String wikitext = CharStreams.toString(reader);
        // Placeholder title: the Sweble API requires one, but only the text comes from the reader.
        String title = "Page title";
        ValidatedWikitext validated = v.validate(parserConfig, wikitext, title);
        // Pre-processing
        WikitextPreprocessor prep = new WikitextPreprocessor(parserConfig);
        WtPreproWikitextPage prepArticle = (WtPreproWikitextPage) prep.parseArticle(validated, title, false);
        // Parsing
        PreprocessedWikitext ppw = PreprocessorToParserTransformer.transform(prepArticle);
        WikitextParser parser = new WikitextParser(parserConfig);
        WtParsedWikitextPage parsedArticle = (WtParsedWikitextPage) parser.parseArticle(ppw, title);
        // Compile the retrieved page
        boolean blankSpanningCells = JSONUtilities.getBoolean(options, "blankSpanningCells", true);
        boolean includeRawTemplates = JSONUtilities.getBoolean(options, "includeRawTemplates", false);
        boolean parseReferences = JSONUtilities.getBoolean(options, "parseReferences", true);
        final WikitextTableVisitor vs = new WikitextTableVisitor(blankSpanningCells, includeRawTemplates);
        vs.go(parsedArticle);
        WikiTableDataReader dataReader = new WikiTableDataReader(vs, parseReferences);
        // Reconcile if needed
        String wikiUrl = JSONUtilities.getString(options, "wikiUrl", null);
        // Wikidata reconciliation endpoint, hardcoded because the user might not have it in their services
        String reconUrl = JSONUtilities.getString(options, "reconService", "https://wikidata.reconci.link/en/api");
        StandardReconConfig cfg = getReconConfig(reconUrl);
        if (wikiUrl != null) {
            dataReader.reconcileToQids(wikiUrl, cfg);
        }
        // Set metadata
        if (vs.caption != null && vs.caption.length() > 0) {
            // TODO this does not seem to do anything - maybe we need to pass it to OpenRefine in some other way?
            metadata.setName(vs.caption);
        }
        TabularImportingParserBase.readTable(project, job, dataReader, limit, options, exceptions);
        // Add reconciliation statistics
        if (dataReader.columnReconciled != null) {
            for (int i = 0; i != dataReader.columnReconciled.size(); i++) {
                if (dataReader.columnReconciled.get(i)) {
                    Column col = project.columnModel.columns.get(i);
                    col.setReconStats(ReconStats.create(project, i));
                    col.setReconConfig(cfg);
                }
            }
        }
    } catch (IOException e1) {
        // Report read failures to the caller instead of silently swallowing them,
        // consistent with how parse failures are handled below.
        exceptions.add(e1);
        e1.printStackTrace();
    } catch (ParseException e1) {
        exceptions.add(e1);
        e1.printStackTrace();
    }
}
Also used : WtParsedWikitextPage(org.sweble.wikitext.parser.nodes.WtParsedWikitextPage) WikitextEncodingValidator(org.sweble.wikitext.parser.WikitextEncodingValidator) SimpleParserConfig(org.sweble.wikitext.parser.utils.SimpleParserConfig) IOException(java.io.IOException) PreprocessedWikitext(org.sweble.wikitext.parser.preprocessor.PreprocessedWikitext) ValidatedWikitext(org.sweble.wikitext.parser.encval.ValidatedWikitext) StandardReconConfig(com.google.refine.model.recon.StandardReconConfig) Column(com.google.refine.model.Column) WikitextPreprocessor(org.sweble.wikitext.parser.WikitextPreprocessor) WtPreproWikitextPage(org.sweble.wikitext.parser.nodes.WtPreproWikitextPage) ParseException(xtc.parser.ParseException) WikitextParser(org.sweble.wikitext.parser.WikitextParser) ParserConfig(org.sweble.wikitext.parser.ParserConfig) SimpleParserConfig(org.sweble.wikitext.parser.utils.SimpleParserConfig)

Aggregations

Column (com.google.refine.model.Column)1 StandardReconConfig (com.google.refine.model.recon.StandardReconConfig)1 IOException (java.io.IOException)1 ParserConfig (org.sweble.wikitext.parser.ParserConfig)1 WikitextEncodingValidator (org.sweble.wikitext.parser.WikitextEncodingValidator)1 WikitextParser (org.sweble.wikitext.parser.WikitextParser)1 WikitextPreprocessor (org.sweble.wikitext.parser.WikitextPreprocessor)1 ValidatedWikitext (org.sweble.wikitext.parser.encval.ValidatedWikitext)1 WtParsedWikitextPage (org.sweble.wikitext.parser.nodes.WtParsedWikitextPage)1 WtPreproWikitextPage (org.sweble.wikitext.parser.nodes.WtPreproWikitextPage)1 PreprocessedWikitext (org.sweble.wikitext.parser.preprocessor.PreprocessedWikitext)1 SimpleParserConfig (org.sweble.wikitext.parser.utils.SimpleParserConfig)1 ParseException (xtc.parser.ParseException)1