use of org.opensextant.xtext.XText in project Xponents by OpenSextant.
the class TestTikaPST method main.
/**
 * Compare Tika's PST conversion to XText non-Tika PST conversion.
 *
 * <p>Builds an {@code XText} converter with saving and Tika PST handling enabled,
 * registers only the "pst" file type, directs output to {@code ./xtext-testing}
 * and then runs extraction on the given input.
 *
 * @param args command line arguments; {@code args[0]} is the path to a PST.
 *             NOTE, java-libpst provides some good test data.
 */
public static void main(String[] args) {
    // Report a usage error instead of failing with ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1) {
        System.err.println("Usage: TestTikaPST <path-to-pst>");
        System.exit(1);
        return;
    }
    String input = args[0];
    try {
        XText xt = new XText();
        xt.enableSaving(true);
        xt.enableTikaPST(true);
        // Conversions are not saved next to the input; the conversion cache set below is used.
        xt.getPathManager().enableSaveWithInput(false);
        // NOTE(review): clearSettings() presumably resets the list of file types
        // to convert, so that only "pst" is handled -- confirm against XText.
        xt.clearSettings();
        xt.convertFileType("pst");
        xt.getPathManager().setConversionCache("./xtext-testing");
        xt.setup();
        xt.extractText(input);
    } catch (IOException ioerr) {
        ioerr.printStackTrace();
        System.err.println("IO issue" + ioerr.getMessage());
    } catch (ConfigException cfgerr) {
        cfgerr.printStackTrace();
        System.err.println("Config issue" + cfgerr.getMessage());
    }
}
use of org.opensextant.xtext.XText in project Xponents by OpenSextant.
the class VideoTests method run.
/**
 * Runs XText extraction over {@code input}, restricted to a fixed set of video
 * file types, with this object registered as the conversion listener.
 *
 * <p>Conversions are saved to the "video-output" conversion cache rather than
 * alongside the input.
 *
 * @param input the input handed to {@code XText.extractText}
 * @throws IOException propagated from converter setup or extraction
 */
public void run(String input) throws IOException {
    final String[] videoTypes = {"mp4", "mpeg", "mpg", "avi", "wmv"};
    // 0x8000000 == 134,217,728 (128 * 1024 * 1024); presumably a size in bytes -- confirm.
    final int sizeLimit = 0x8000000;

    XText converter = new XText();
    converter.getPathManager().enableSaveWithInput(false);
    converter.getPathManager().setConversionCache("video-output");
    converter.enableSaving(true);
    for (String type : videoTypes) {
        converter.convertFileType(type);
    }
    converter.setMaxFileSize(sizeLimit);
    converter.setup();
    // The listener is attached after setup(), as in the original sequence.
    converter.setConversionListener(this);
    converter.extractText(input);
}
use of org.opensextant.xtext.XText in project Xponents by OpenSextant.
the class BasicGeoTemporalProcessing method setup.
/**
 * Prepares this processor for a job: validates the job parameters, configures the
 * extractors, the document converter and the output formatters.
 *
 * <p>Ideally one-time initialization (configuring extractors) is kept apart from the
 * repetitive steps of setting up jobs and inputs. Outputs may be set up once per JVM
 * session or periodically. In summary, configure separately:
 * <ul>
 * <li>a) extractors, converters</li>
 * <li>b) job inputs and parameters</li>
 * <li>c) output formatters</li>
 * <li>d) other resources, e.g., filters</li>
 * </ul>
 *
 * @param inFile     job input path; stored trimmed in the job parameters
 * @param outFormats names of output formats to create formatters for; may be null
 * @param outFile    job output path; stored trimmed in the job parameters
 * @param tempDir    conversion cache folder; when null, saving of conversions is disabled
 * @throws ConfigException     if an extractor cannot be configured or the converter fails to start
 * @throws ProcessingException if parameter validation fails
 * @throws IOException         declared by the signature; may come from formatter creation or start
 */
public void setup(String inFile, List<String> outFormats, String outFile, String tempDir) throws ConfigException, ProcessingException, IOException {
    params.isdefault = false;
    boolean valid = validateParameters(inFile, outFormats, outFile, tempDir, params);
    if (!valid) {
        throw new ProcessingException("VALIDATION ERRORS: " + runnerMessage.toString());
    }

    // If only coordinates are wanted from text, an XCoord extractor alone would do
    // (new XCoord(); configure(); addExtractor(...)). The place geocoder used here
    // covers both coordinates and names.
    //
    // Testing only
    params.tag_places = true;
    params.tag_coordinates = true;
    params.output_countries = false;

    PlaceGeocoder placeTagger = new PlaceGeocoder();
    placeTagger.enablePersonNameMatching(true);
    placeTagger.setParameters(params);
    placeTagger.configure();
    this.addExtractor(placeTagger);

    XTemporal dateTagger = new XTemporal();
    dateTagger.configure();
    this.addExtractor(dateTagger);

    // Document converter: saving on, no overwrite, HTML scrubber off.
    converter = new XText();
    converter.enableHTMLScrubber(false);
    converter.enableSaving(true);
    converter.enableOverwrite(false);
    converter.setConversionListener(this);

    // Without a cache folder there is nowhere to save, so saving is switched back off.
    if (tempDir == null) {
        converter.enableSaving(false);
    } else {
        converter.getPathManager().setConversionCache(tempDir);
    }

    try {
        converter.setup();
    } catch (IOException cause) {
        throw new ConfigException("Document converter could not start", cause);
    }

    params.inputFile = inFile.trim();
    params.outputFile = outFile.trim();

    if (outFormats == null) {
        return;
    }
    // One formatter per requested format, each started under the job name.
    for (String formatName : outFormats) {
        params.addOutputFormat(formatName);
        AbstractFormatter output = createFormatter(formatName, params);
        output.overwrite = overwriteOutput;
        this.addFormatter(output);
        output.start(params.getJobName());
    }
}
use of org.opensextant.xtext.XText in project Xponents by OpenSextant.
the class WebCrawl method main.
/**
 * Command line entry point for the web crawl demo.
 *
 * <p>Options (getopt string {@code "do:l:f:"}):
 * <ul>
 * <li>{@code -l link} the web site to crawl</li>
 * <li>{@code -f file} a local file whose links are collected as if it had been pulled from the site</li>
 * <li>{@code -o dir} output folder; required, used as both crawl target and conversion input root</li>
 * <li>{@code -d} restrict the crawl to the current directory only</li>
 * </ul>
 *
 * <p>If {@code http.proxyHost} is set, the crawler proxy is configured from
 * {@code http.proxyHost} and {@code http.proxyPort} (port defaults to 80).
 *
 * @param args command line arguments
 */
public static void main(String[] args) {
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("WebCrawl", args, "do:l:f:");
    String o = null;
    String webSite = null;
    boolean currentDirOnly = false;
    String inputFile = null;
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch (c) {
                case 'l':
                    webSite = opts.getOptarg();
                    break;
                case 'f':
                    inputFile = opts.getOptarg();
                    break;
                case 'o':
                    o = opts.getOptarg();
                    break;
                case 'd':
                    currentDirOnly = true;
                    break;
                default:
                    usage();
                    System.exit(-1);
            }
        }
    } catch (Exception execErr) {
        usage();
        execErr.printStackTrace();
        System.exit(-1);
    }
    // The output folder is used unconditionally below (new File(o) rejects null),
    // so report its absence as a usage error instead of a stack trace.
    if (o == null) {
        System.err.println("Output folder is required, use -o");
        usage();
        System.exit(-1);
        return;
    }
    WebCrawl me = new WebCrawl();
    try {
        FileUtility.makeDirectory(o);
        XText conv = new XText();
        /*
         * Following setup for conversion and crawl says this:
         *
         * - Save converted data at cacheDir
         * - The inputDir == cacheDir, so conversions are "saved with input"
         *
         * This is because the crawler below is crawling and downloading HTML pages to the "inputDir".
         * For simplicity's sake, the pages are being converted and cached in that same hierarchical folder.
         *
         * Alternatively, the crawler can save content to A, then convert and cache in B,
         * where A != B and B is not inside A, for example:
         *    /path/to/A
         *    /path/to/B
         */
        File cacheDir = new File(o);
        conv.enableSaving(true);
        // Order of setting is important. Since cacheDir == input and conversions are
        // saved with input, there is no need to set a separate conversion cache.
        conv.getPathManager().setInputRoot(cacheDir);
        conv.getPathManager().enableSaveWithInput(true);
        conv.setup();
        DefaultWebCrawl crawl = new DefaultWebCrawl(webSite, o);
        crawl.setAllowCurrentDirOnly(currentDirOnly);
        String proxyHost = System.getProperty("http.proxyHost");
        if (proxyHost != null) {
            // Honor the standard http.proxyPort property; "80" is its documented
            // default and also the value that was previously hard-coded here.
            String proxyPort = System.getProperty("http.proxyPort", "80");
            crawl.setProxy(proxyHost + ":" + proxyPort);
        }
        crawl.configure();
        // Set these items
        crawl.setConverter(conv);
        crawl.setListener(me);
        // Go do it.
        if (inputFile != null) {
            File f = new File(inputFile);
            if (f.exists()) {
                // parse links from file as if the file was pulled from website -l link.
                crawl.collect(f);
            } else {
                System.err.println("File does not exist F=" + inputFile);
            }
        } else {
            crawl.collect();
        }
    } catch (Exception err) {
        err.printStackTrace();
    }
}
use of org.opensextant.xtext.XText in project Xponents by OpenSextant.
the class ImageGroper method main.
/**
 * Command line entry point that runs XText over JPEG images.
 *
 * <p>Options (getopt string {@code "hei:o:"}):
 * <ul>
 * <li>{@code -i input} the input to extract from; required</li>
 * <li>{@code -o output} conversion cache folder; required unless {@code -e} is given</li>
 * <li>{@code -e} save conversions with the input; the output folder is then ignored</li>
 * <li>any other option, including {@code -h}, prints usage and exits with status 1</li>
 * </ul>
 *
 * @param args command line arguments
 */
public static void main(String[] args) {
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("ImageGroper", args, "hei:o:");
    String input = null;
    String output = null;
    boolean embed = false;
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch (c) {
                case 'i':
                    input = opts.getOptarg();
                    break;
                case 'o':
                    output = opts.getOptarg();
                    break;
                case 'e':
                    embed = true;
                    System.out.println("Saving conversions to Input folder. Output folder will be ignored.");
                    break;
                default:
                    ImageGroper.usage();
                    System.exit(1);
            }
        }
    } catch (Exception err) {
        // Show why option parsing failed instead of discarding the exception.
        err.printStackTrace();
        ImageGroper.usage();
        System.exit(1);
    }
    // Reject missing arguments up front rather than passing null into the converter.
    if (input == null) {
        System.err.println("Input is required, use -i");
        ImageGroper.usage();
        System.exit(1);
        return;
    }
    if (!embed && output == null) {
        System.err.println("Output folder is required unless -e is given, use -o");
        ImageGroper.usage();
        System.exit(1);
        return;
    }
    // Setting LANG=en_US in your shell.
    //
    // System.setProperty("LANG", "en_US");
    XText xt = new XText();
    xt.enableSaving(true);
    // With -e, conversions are saved with the input; otherwise in the conversion cache.
    xt.getPathManager().enableSaveWithInput(embed);
    // NOTE(review): clearSettings() presumably resets the list of file types
    // to convert, so that only the JPEG types are handled -- confirm against XText.
    xt.clearSettings();
    xt.convertFileType("jpg");
    xt.convertFileType("jpeg");
    try {
        // output can only be null here in embed mode, where it is ignored anyway.
        if (output != null) {
            xt.getPathManager().setConversionCache(output);
        }
        xt.setup();
        xt.extractText(input);
    } catch (IOException ioerr) {
        ioerr.printStackTrace();
    } catch (ConfigException cfgerr) {
        cfgerr.printStackTrace();
    }
}
Aggregations