Search in sources :

Example 1 with XText

use of org.opensextant.xtext.XText in project Xponents by OpenSextant.

the class TestTikaPST method main.

/**
 * Compare Tika's PST conversion to XText non-Tika PST conversion.
 *
 * <p>Configures an {@code XText} instance with saving and Tika PST handling enabled,
 * restricts conversion to the {@code "pst"} file type, sets the conversion cache to
 * {@code ./xtext-testing}, and then extracts text from the given input.
 *
 * @param args command line arguments; {@code args[0]} is the path to a PST input
 */
public static void main(String[] args) {
    // Fail fast with a readable message rather than an ArrayIndexOutOfBoundsException
    // when the required input path is missing.
    if (args == null || args.length < 1) {
        System.err.println("Usage: TestTikaPST <path-to-pst>");
        System.exit(1);
        return;
    }
    // Path to a PST.  NOTE, java-libpst provides some good test data.
    String input = args[0];
    try {
        XText xt = new XText();
        xt.enableSaving(true);
        xt.enableTikaPST(true);
        // Do not save conversions alongside the input; the conversion cache set below is used.
        xt.getPathManager().enableSaveWithInput(false);
        // Reset settings, then register "pst" as the only file type to convert.
        xt.clearSettings();
        xt.convertFileType("pst");
        xt.getPathManager().setConversionCache("./xtext-testing");
        xt.setup();
        xt.extractText(input);
    } catch (IOException ioerr) {
        ioerr.printStackTrace();
        System.err.println("IO issue" + ioerr.getMessage());
    } catch (ConfigException cfgerr) {
        cfgerr.printStackTrace();
        System.err.println("Config issue" + cfgerr.getMessage());
    }
}
Also used : XText(org.opensextant.xtext.XText) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Example 2 with XText

use of org.opensextant.xtext.XText in project Xponents by OpenSextant.

the class VideoTests method run.

/**
 * Runs an XText conversion over {@code input}, limited to a fixed set of video file types.
 * Conversions are saved to the {@code "video-output"} conversion cache (not next to the
 * input), and this object is registered as the conversion listener before extraction starts.
 *
 * @param input the input handed to {@code XText.extractText}
 * @throws IOException propagated from XText setup or extraction
 */
public void run(String input) throws IOException {
    // File extensions registered for conversion, in registration order.
    final String[] videoTypes = { "mp4", "mpeg", "mpg", "avi", "wmv" };
    // 0x8000000 == 134,217,728 (128 MiB if the limit is expressed in bytes -- TODO confirm unit).
    final int maxFileSize = 0x8000000;

    XText converter = new XText();
    converter.getPathManager().enableSaveWithInput(false);
    converter.getPathManager().setConversionCache("video-output");
    converter.enableSaving(true);
    for (String type : videoTypes) {
        converter.convertFileType(type);
    }
    converter.setMaxFileSize(maxFileSize);
    converter.setup();
    // Listener is attached after setup(), as in the established call sequence.
    converter.setConversionListener(this);
    converter.extractText(input);
}
Also used : XText(org.opensextant.xtext.XText)

Example 3 with XText

use of org.opensextant.xtext.XText in project Xponents by OpenSextant.

the class BasicGeoTemporalProcessing method setup.

/**
 * Wires up this job in one pass: validates the parameters, registers the extractors,
 * configures the document converter, records the input/output paths and starts one
 * formatter per requested output format.
 *
 * <p>Ideally the one-time initialization (configuring extractors) is kept apart from the
 * repetitive steps of setting up jobs and inputs. Outputs may be set up once per JVM
 * session or periodically. In summary, configure separately:
 * <ul>
 *   <li>extractors and converters</li>
 *   <li>job inputs and parameters</li>
 *   <li>output formatters</li>
 *   <li>other resources, e.g., filters</li>
 * </ul>
 *
 * @param inFile     input file path; stored trimmed in the job parameters
 * @param outFormats output format names; may be {@code null}, in which case no formatter is created
 * @param outFile    output file path; stored trimmed in the job parameters
 * @param tempDir    conversion cache folder; when {@code null}, converter saving is turned off
 * @throws ConfigException     if the document converter fails to start, or an extractor fails to configure
 * @throws ProcessingException if parameter validation fails
 * @throws IOException         declared by the signature for the underlying setup calls
 */
public void setup(String inFile, List<String> outFormats, String outFile, String tempDir) throws ConfigException, ProcessingException, IOException {
    params.isdefault = false;
    final boolean valid = validateParameters(inFile, outFormats, outFile, tempDir, params);
    if (!valid) {
        throw new ProcessingException("VALIDATION ERRORS: " + runnerMessage.toString());
    }

    // Extractors.
    // When only coordinates are wanted from text, XCoord alone would do; the geocoder
    // used here handles both coordinates and names.
    // Testing only: fixed tagging flags.
    params.tag_places = true;
    params.tag_coordinates = true;
    params.output_countries = false;

    PlaceGeocoder placeTagger = new PlaceGeocoder();
    placeTagger.enablePersonNameMatching(true);
    placeTagger.setParameters(params);
    placeTagger.configure();
    this.addExtractor(placeTagger);

    XTemporal dateTagger = new XTemporal();
    dateTagger.configure();
    this.addExtractor(dateTagger);

    // Document converter.
    converter = new XText();
    converter.enableHTMLScrubber(false);
    converter.enableSaving(true);
    converter.enableOverwrite(false);
    converter.setConversionListener(this);
    if (tempDir == null) {
        // No cache folder given: switch saving back off.
        converter.enableSaving(false);
    } else {
        converter.getPathManager().setConversionCache(tempDir);
    }
    try {
        converter.setup();
    } catch (IOException ioerr) {
        throw new ConfigException("Document converter could not start", ioerr);
    }

    // Job inputs and outputs.
    this.params.inputFile = inFile.trim();
    this.params.outputFile = outFile.trim();

    // Output formatters: nothing to do when no formats were requested.
    if (outFormats == null) {
        return;
    }
    for (String formatName : outFormats) {
        params.addOutputFormat(formatName);
        AbstractFormatter outputter = createFormatter(formatName, params);
        outputter.overwrite = overwriteOutput;
        this.addFormatter(outputter);
        outputter.start(params.getJobName());
    }
}
Also used : PlaceGeocoder(org.opensextant.extractors.geo.PlaceGeocoder) XTemporal(org.opensextant.extractors.xtemporal.XTemporal) XText(org.opensextant.xtext.XText) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) AbstractFormatter(org.opensextant.output.AbstractFormatter) ProcessingException(org.opensextant.processing.ProcessingException)

Example 4 with XText

use of org.opensextant.xtext.XText in project Xponents by OpenSextant.

the class WebCrawl method main.

/**
 * Command-line entry point: crawls a web site (or the links found in a local file),
 * saving pages under the output folder and converting them in place with XText.
 *
 * <p>Options (getopt string {@code "do:l:f:"}):
 * <ul>
 *   <li>{@code -l <link>}: web site passed to the crawler</li>
 *   <li>{@code -f <file>}: local file whose links are collected instead of crawling from the site</li>
 *   <li>{@code -o <dir>}: output folder; required, used as both crawl target and conversion root</li>
 *   <li>{@code -d}: restrict the crawl to the current directory only</li>
 * </ul>
 *
 * @param args command line arguments
 */
public static void main(String[] args) {
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("WebCrawl", args, "do:l:f:");
    String o = null;
    String webSite = null;
    boolean currentDirOnly = false;
    String inputFile = null;
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch(c) {
                case 'l':
                    webSite = opts.getOptarg();
                    break;
                case 'f':
                    inputFile = opts.getOptarg();
                    break;
                case 'o':
                    o = opts.getOptarg();
                    break;
                case 'd':
                    currentDirOnly = true;
                    break;
                default:
                    usage();
                    System.exit(-1);
            }
        }
    } catch (Exception execErr) {
        usage();
        execErr.printStackTrace();
        System.exit(-1);
    }
    // The output folder is dereferenced unconditionally below (new File(o)); report a
    // missing -o as a usage error instead of a NullPointerException stack trace.
    if (o == null) {
        usage();
        System.exit(-1);
    }
    // NOTE(review): webSite may still be null here when -l is omitted; whether
    // DefaultWebCrawl tolerates that is not visible from this method -- confirm.
    WebCrawl me = new WebCrawl();
    try {
        FileUtility.makeDirectory(o);
        XText conv = new XText();
        /*
             * Following setup for conversion and crawl says this:
             * 
             *  - Save converted data at cacheDir
             *  - The inputDir == cacheDir, so  conversions are "saved with input"
             *  
             *  This is because the crawler below is crawling and downloading HTML pages to the "inputDir"
             *  For simplicity sake, the pages are being converted and cached in that same hierarchical folder.
             *  
             *  Alternatively, crawler can save content to A, convert and cache in B, where A != B and B is not inside A, etc.
             *  /path/to/A
             *  /path/to/B
             *  for example.
             */
        File cacheDir = new File(o);
        conv.enableSaving(true);
        // Order of setting is important.  Since cacheDir == input & saving with input, 
        // Then no need to set a separate cacheDir.
        //conv.getPathManager().setConversionCache(cacheDir.getAbsolutePath());
        conv.getPathManager().setInputRoot(cacheDir);
        conv.getPathManager().enableSaveWithInput(true);
        conv.setup();
        DefaultWebCrawl crawl = new DefaultWebCrawl(webSite, o);
        crawl.setAllowCurrentDirOnly(currentDirOnly);
        String proxyHost = System.getProperty("http.proxyHost");
        if (proxyHost != null) {
            // Honor the standard http.proxyPort system property; "80" remains the fallback,
            // matching the previously hard-coded port.
            String proxyPort = System.getProperty("http.proxyPort", "80");
            crawl.setProxy(proxyHost + ":" + proxyPort);
        }
        crawl.configure();
        // Set these items
        crawl.setConverter(conv);
        crawl.setListener(me);
        // Go do it.
        if (inputFile != null) {
            File f = new File(inputFile);
            if (f.exists()) {
                // parse links from file as if the file was pulled from website -l link. 
                crawl.collect(f);
            } else {
                System.err.println("File does not exist F=" + inputFile);
            }
        } else {
            crawl.collect();
        }
    } catch (Exception err) {
        err.printStackTrace();
    }
}
Also used : DefaultWebCrawl(org.opensextant.xtext.collectors.web.DefaultWebCrawl) XText(org.opensextant.xtext.XText) File(java.io.File) IOException(java.io.IOException)

Example 5 with XText

use of org.opensextant.xtext.XText in project Xponents by OpenSextant.

the class ImageGroper method main.

/**
 * Command-line entry point: runs XText over an input, converting only {@code jpg} and
 * {@code jpeg} files.
 *
 * <p>Options (getopt string {@code "hei:o:"}):
 * <ul>
 *   <li>{@code -i <input>}: input handed to {@code XText.extractText}; required</li>
 *   <li>{@code -o <output>}: folder used as the conversion cache</li>
 *   <li>{@code -e}: save conversions with the input; the output folder is then ignored</li>
 *   <li>{@code -h} or any other option: print usage and exit with status 1</li>
 * </ul>
 *
 * @param args command line arguments
 */
public static void main(String[] args) {
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("ImageGroper", args, "hei:o:");
    String input = null;
    String output = null;
    boolean embed = false;
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch(c) {
                case 'i':
                    input = opts.getOptarg();
                    break;
                case 'o':
                    output = opts.getOptarg();
                    break;
                case 'e':
                    embed = true;
                    System.out.println("Saving conversions to Input folder.  Output folder will be ignored.");
                    break;
                default:
                    ImageGroper.usage();
                    System.exit(1);
            }
        }
    } catch (Exception err) {
        ImageGroper.usage();
        // Surface the cause instead of discarding it before exiting.
        err.printStackTrace();
        System.exit(1);
    }
    // Without an input there is nothing to extract; report it as a usage error rather
    // than passing null to extractText.
    if (input == null) {
        ImageGroper.usage();
        System.exit(1);
    }
    // Setting LANG=en_US in your shell.
    //
    // System.setProperty("LANG", "en_US");
    XText xt = new XText();
    xt.enableSaving(true);
    // Save alongside the input only when -e was given.
    xt.getPathManager().enableSaveWithInput(embed);
    // Reset settings, then register only the JPEG extensions for conversion.
    xt.clearSettings();
    xt.convertFileType("jpg");
    xt.convertFileType("jpeg");
    try {
        // NOTE(review): output may be null here when -o is omitted (e.g. with -e);
        // whether setConversionCache accepts null is not visible from this method -- confirm.
        xt.getPathManager().setConversionCache(output);
        xt.setup();
        xt.extractText(input);
    } catch (IOException ioerr) {
        ioerr.printStackTrace();
    } catch (ConfigException cfgerr) {
        cfgerr.printStackTrace();
    }
}
Also used : XText(org.opensextant.xtext.XText) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Aggregations

XText (org.opensextant.xtext.XText): 6, IOException (java.io.IOException): 5, ConfigException (org.opensextant.ConfigException): 3, File (java.io.File): 2, PlaceGeocoder (org.opensextant.extractors.geo.PlaceGeocoder): 1, XTemporal (org.opensextant.extractors.xtemporal.XTemporal): 1, AbstractFormatter (org.opensextant.output.AbstractFormatter): 1, ProcessingException (org.opensextant.processing.ProcessingException): 1, DefaultSharepointCrawl (org.opensextant.xtext.collectors.sharepoint.DefaultSharepointCrawl): 1, DefaultWebCrawl (org.opensextant.xtext.collectors.web.DefaultWebCrawl): 1