Use of org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource in project lucene-solr by apache.
The main method of the class ExtractWikipedia.
/**
 * Command-line entry point: reads a Wikipedia dump through an {@link EnwikiContentSource}
 * and hands the resulting {@link DocMaker} to an {@code ExtractWikipedia} instance that
 * writes into the output directory.
 *
 * <p>Recognized arguments:
 * <ul>
 *   <li>{@code --input} / {@code -i} <i>path</i>: the dump file to read (required)</li>
 *   <li>{@code --output} / {@code -o} <i>path</i>: output directory, defaults to {@code enwiki}</li>
 *   <li>{@code --discardImageOnlyDocs} / {@code -d}: sets {@code keep.image.only.docs} to false</li>
 * </ul>
 * Unrecognized arguments are ignored. If the input is missing, does not exist, or an option
 * is given without its value, the usage message is printed and nothing is extracted.
 *
 * @param args command-line arguments as described above
 * @throws Exception if configuring the content source or running the extraction fails
 */
public static void main(String[] args) throws Exception {
  Path wikipedia = null;
  Path outputDir = Paths.get("enwiki");
  boolean keepImageOnlyDocs = true;

  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    if (arg.equals("--input") || arg.equals("-i")) {
      // The option needs a value; bail out with usage instead of indexing past the array.
      if (i + 1 >= args.length) {
        printUsage();
        return;
      }
      wikipedia = Paths.get(args[++i]);
    } else if (arg.equals("--output") || arg.equals("-o")) {
      if (i + 1 >= args.length) {
        printUsage();
        return;
      }
      outputDir = Paths.get(args[++i]);
    } else if (arg.equals("--discardImageOnlyDocs") || arg.equals("-d")) {
      keepImageOnlyDocs = false;
    }
  }

  // Validate the input before touching it: without this check a missing --input
  // dereferences null below, and a non-existent file is handed to the content source.
  if (wikipedia == null || !Files.exists(wikipedia)) {
    printUsage();
    return;
  }

  Properties properties = new Properties();
  properties.setProperty("docs.file", wikipedia.toAbsolutePath().toString());
  // Read the dump once rather than looping over it.
  properties.setProperty("content.source.forever", "false");
  properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs));
  Config config = new Config(properties);

  ContentSource source = new EnwikiContentSource();
  source.setConfig(config);

  DocMaker docMaker = new DocMaker();
  docMaker.setConfig(config, source);
  docMaker.resetInputs();

  System.out.println("Extracting Wikipedia to: " + outputDir + " using EnwikiContentSource");
  Files.createDirectories(outputDir);
  ExtractWikipedia extractor = new ExtractWikipedia(docMaker, outputDir);
  extractor.extract();
}
Aggregations