Search in sources :

Example 1 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class XText method main.

/**
 * Command-line entry point: parses the long options, configures a new {@code XText}
 * instance from them and runs text extraction on the given input.
 *
 * <p>Declared options: {@code --input}, {@code --output}, {@code --export},
 * {@code --strip-prefix}, {@code --help}, {@code --clean-html}, {@code --embed-conversion},
 * {@code --embed-children} and {@code --tika-pst}.
 *
 * <p>Exit status: 1 for a usage/option error or a failed conversion, -1 when no input
 * was given. A successful run returns normally.
 *
 * @param args command-line arguments
 */
public static void main(String[] args) {
    LongOpt[] options = {
        new LongOpt("input", LongOpt.REQUIRED_ARGUMENT, null, 'i'),
        new LongOpt("output", LongOpt.REQUIRED_ARGUMENT, null, 'o'),
        new LongOpt("export", LongOpt.REQUIRED_ARGUMENT, null, 'x'),
        new LongOpt("strip-prefix", LongOpt.REQUIRED_ARGUMENT, null, 'p'),
        new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
        new LongOpt("clean-html", LongOpt.NO_ARGUMENT, null, 'H'),
        new LongOpt("embed-conversion", LongOpt.NO_ARGUMENT, null, 'e'),
        new LongOpt("embed-children", LongOpt.NO_ARGUMENT, null, 'c'),
        new LongOpt("tika-pst", LongOpt.NO_ARGUMENT, null, 'T')
    };
    // The short-option string is passed empty; the equivalent would be "hcex:i:o:p:".
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("XText", args, "", options);
    String input = null;
    String output = null;
    boolean embed = false;
    boolean filterHtml = false;
    boolean saveChildrenWithInput = false;
    String saveChildrenTo = null;
    String prefix = null;
    XText xt = new XText();
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch (c) {
                case 0:
                    break;
                case 'i':
                    input = opts.getOptarg();
                    break;
                case 'o':
                    output = opts.getOptarg();
                    break;
                case 'H':
                    filterHtml = true;
                    break;
                case 'c':
                    saveChildrenWithInput = true;
                    break;
                case 'x':
                    saveChildrenTo = opts.getOptarg();
                    break;
                case 'p':
                    prefix = opts.getOptarg();
                    break;
                case 'e':
                    embed = true;
                    System.out.println("Saving conversions to Input folder.  Output folder will be ignored.");
                    break;
                case 'T':
                    xt.enableTikaPST(true);
                    break;
                case 'h':
                default:
                    // Help request or unrecognized option: show usage and stop.
                    XText.usage();
                    System.exit(1);
            }
        }
    } catch (Exception err) {
        // Report why parsing failed instead of silently falling back to the usage text.
        System.err.println("Failed to parse command line arguments: " + err);
        XText.usage();
        System.exit(1);
    }
    if (input == null) {
        // The input is read from the --input option above, not from a JVM system property.
        System.out.println("An input argument is required, e.g., --input=/Folder/...");
        System.exit(-1);
    }
    // Setting LANG=en_US in your shell.
    //
    // System.setProperty("LANG", "en_US");

    // Given this is a test application, we will overwrite every time XText is called.
    xt.enableOverwrite(true);
    xt.enableSaving(embed || output != null);
    // creates a ./text/ Folder locally in directory.
    xt.getPathManager().enableSaveWithInput(embed);
    xt.enableHTMLScrubber(filterHtml);
    xt.getPathManager().enableSaveChildrenWithInput(saveChildrenWithInput);
    // If user wishes to strip input paths of some prefix
    // Output will be dumped in the resulting relative path.
    xt.getPathManager().setStripPrefixPath(prefix);
    // ... others?
    // A separate children cache only applies when children are not saved with the input.
    if (!saveChildrenWithInput && saveChildrenTo != null) {
        xt.getPathManager().setExtractedChildrenCache(saveChildrenTo);
    }
    try {
        if (!embed) {
            if (output == null) {
                // No output given: fall back to a folder named "output".
                output = "output";
                // Will save to output dir.
                xt.enableSaving(true);
                FileUtility.makeDirectory(output);
                xt.getPathManager().setConversionCache(output);
                System.out.println("Default output folder is $PWD/" + output);
            } else {
                xt.enableSaving(true);
                // Notice this main program requires an output path.
                xt.getPathManager().setConversionCache(output);
            }
        }
        // Set itself to listen, as this is the main program.
        xt.setConversionListener(new MainProgramListener());
        xt.setup();
        xt.extractText(input);
    } catch (IOException ioerr) {
        XText.usage();
        ioerr.printStackTrace();
        // Signal the failure to the calling shell/script.
        System.exit(1);
    } catch (ConfigException cfgerr) {
        XText.usage();
        cfgerr.printStackTrace();
        // Signal the failure to the calling shell/script.
        System.exit(1);
    }
}
Also used : ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) MimeTypeParseException(javax.activation.MimeTypeParseException) LongOpt(gnu.getopt.LongOpt)

Example 2 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class DefaultMailCrawl method collect.

/**
 * Collects messages from the mail source and saves each selected one to a file under a
 * folder created for the current date.
 *
 * <p>Flow: create the date folder, connect and fetch the messages, then for each message
 * skip expunged ones, skip those the listener reports as already existing, save the rest
 * via {@code saveMessageToFile}, and flag messages for deletion according to the
 * configuration. The loop stops once {@code config.doneReading(...)} reports completion
 * or after more than 10 errors. The connection is always closed before returning once
 * the messages have been fetched.
 *
 * TODO: reimplement so that XText iterates over each saved msg file for conversion.
 *
 * @throws IOException on failure to connect or collect.
 * @throws ConfigException declared by the signature; presumably raised by one of the
 *         helper calls (the raising call is not visible in this method - confirm).
 */
@Override
public void collect() throws IOException, ConfigException {
    File dateFolder = createDateFolder(new Date());
    if (dateFolder == null) {
        log.error("Unable to create directory: " + dateFolder);
        return;
    }
    Message[] messages = null;
    try {
        connect();
        messages = getMessages();
        if (messages == null) {
            log.info("No messages available - Exiting MailClient now");
            disconnect();
            return;
        }
    } catch (MessagingException javaMailErr) {
        throw new IOException("Unable to connect or get messages", javaMailErr);
    }
    int readCount = 0;
    int totalCount = 0;
    int available = messages.length;
    int errCount = 0;
    // try/finally guarantees the disconnect below even when an exception that is not
    // handled per message (e.g. ConfigException or a runtime exception) escapes the loop.
    try {
        for (Message message : messages) {
            ++totalCount;
            // Exit if too many errors.
            if (errCount > 10) {
                log.error("Too many errors ({}); stopping collection", errCount);
                break;
            }
            try {
                if (config.doneReading(messages.length, readCount)) {
                    // Done here.
                    break;
                }
                /*
                 * Silently ignore deleted messages; items deleted while we were
                 * in session
                 */
                if (message.isExpunged()) {
                    log.info("Message deleted during session; Unable to collect. Mail Subj: {}", message.getSubject());
                    continue;
                }
                boolean newMessage = !message.isSet(Flags.Flag.SEEN);
                log.debug("Message Subject: " + message.getSubject() + "  new?: " + newMessage);
                boolean setForDeleteNow = false;
                String subject = message.getSubject();
                if (message.getSubject() == null) {
                    log.info("Empty message title MSG number=" + message.getMessageNumber());
                    // Keep the message; use a placeholder subject for the file name.
                    subject = "No_Subject";
                }
                if ((!config.isReadNewMessagesOnly() || newMessage)) {
                    // 1. Identify the email message.
                    //    and determine if you need to capture it again.
                    //
                    String messageFilename = MessageConverter.createSafeFilename(subject);
                    // Cap the subject-derived file name at 60 characters.
                    if (messageFilename.length() > 60) {
                        messageFilename = messageFilename.substring(0, 60);
                    }
                    String msgId = MessageConverter.getMessageID(message);
                    if (msgId == null) {
                        log.error("How can a message ID be null? SUBJ={}", message.getSubject());
                        continue;
                    }
                    msgId = MessageConverter.getShorterMessageID(msgId);
                    try {
                        if (listener != null && listener.exists(msgId)) {
                            // Listener reports this message ID as existing; skip it.
                            continue;
                        }
                    } catch (Exception err1) {
                        log.error("Collection error with mail.", err1);
                        continue;
                    }
                    readCount++;
                    if (log.isDebugEnabled()) {
                        log.debug("Message: {}", message.getSubject());
                        String msg = String.format("Processing message: %s / %s of available: %s", readCount, totalCount, available);
                        log.debug(msg);
                    }
                    // Save file in archive, Convert it, etc.
                    int status = saveMessageToFile(dateFolder, message, msgId, messageFilename);
                    if (status < 0) {
                        ++errCount;
                    }
                    if (!config.isReadOnly() && config.isDeleteOnRead()) {
                        message.setFlag(Flags.Flag.DELETED, true);
                        String dbg = String.format("Processing message: %d / %d of available:%d", readCount, totalCount, available);
                        log.debug(dbg);
                        setForDeleteNow = true;
                    }
                }
                // Old (already seen) messages are deleted only when configured to do so and
                // when they were not already flagged for deletion above.
                if (!newMessage && config.isDeleteOld() && !setForDeleteNow && !config.isReadOnly()) {
                    message.setFlag(Flags.Flag.DELETED, true);
                    log.debug("Deleting message: #{}", totalCount);
                }
            } catch (javax.mail.FolderClosedException connErr) {
                // Previously counted silently; log it so a dropped folder is diagnosable.
                log.error("Mail folder closed while reading message #{}", totalCount, connErr);
                ++errCount;
            } catch (MessagingException me) {
                log.error("Failed to read messsage #{}", totalCount, me);
                ++errCount;
            } catch (IOException writeErr) {
                log.error("Failed to write msg.eml #{}", totalCount, writeErr);
                ++errCount;
            }
        }
    } finally {
        // Error on close is likely rare.
        try {
            disconnect();
        } catch (Exception javaMailErrOnClose) {
            log.error("Unkosher disconnect", javaMailErrOnClose);
        }
    }
}
Also used : Message(javax.mail.Message) MessagingException(javax.mail.MessagingException) IOException(java.io.IOException) File(java.io.File) Date(java.util.Date) ConfigException(org.opensextant.ConfigException) MessagingException(javax.mail.MessagingException) IOException(java.io.IOException)

Example 3 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class OutlookPSTCrawler method configure.

/**
 * Resolves the PST output folder and logs the effective settings.
 *
 * <p>When {@code outputPSTDir} is already set it is used as-is. Otherwise it is derived
 * as {@code defaultOutputName} under the existing {@code outputDir} and created when it
 * is not already present. Callers may therefore set either the PST folder itself or just
 * its parent; to use the default: setOutputDir(); configure();
 *
 * @throws ConfigException if {@code outputDir} is unset or does not exist, if the target
 *         already exists while not in incremental mode, or if the target cannot be created
 */
public void configure() throws ConfigException {
    if (outputPSTDir == null) {
        if (outputDir == null) {
            throw new ConfigException("Output Dir is not configured");
        }
        // The containing folder must already exist; it is never created here.
        if (!outputDir.exists()) {
            throw new ConfigException("Please create containing output directory. DIR does not exist:" + outputDir.getAbsolutePath());
        }

        outputPSTDir = new File(outputDir.getAbsolutePath() + "/" + defaultOutputName);
        final boolean targetPresent = outputPSTDir.exists();
        if (targetPresent && !incrementalMode) {
            throw new ConfigException("Output Dir contains target, but you are not in overwrite mode");
        }
        if (!targetPresent) {
            try {
                FileUtility.makeDirectory(outputPSTDir);
            } catch (IOException mkdirErr) {
                throw new ConfigException("Unable to create target", mkdirErr);
            }
        }
    }

    // Summarize the resolved configuration.
    log.info(" Input: PST =  " + pst.getAbsolutePath());
    log.info(" Modes: Incremental =" + incrementalMode);
    log.info(" Modes: Overwrite =" + overwriteMode);
    log.info(" Output: Target " + outputPSTDir);
}
Also used : ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException) File(java.io.File) PSTFile(com.pff.PSTFile)

Example 4 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class TestTikaPST method main.

/**
 * Compare Tika's PST conversion to XText non-Tika PST conversion.
 *
 * <p>Runs XText with Tika PST handling enabled on the given PST file and writes the
 * conversions to {@code ./xtext-testing}. Exits with status 1 when no path is supplied.
 *
 * @param args {@code args[0]} is the path to a PST file
 */
public static void main(String[] args) {
    // Fail with a usage hint instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1) {
        System.err.println("Usage: TestTikaPST <path-to-PST-file>");
        System.exit(1);
    }
    // Path to a PST.  NOTE, java-libpst provides some good test data.
    String input = args[0];
    try {
        XText xt = new XText();
        xt.enableSaving(true);
        xt.enableTikaPST(true);
        // creates a ./text/ Folder locally in directory.
        xt.getPathManager().enableSaveWithInput(false);
        // Restrict this run to the "pst" file type only.
        xt.clearSettings();
        xt.convertFileType("pst");
        xt.getPathManager().setConversionCache("./xtext-testing");
        xt.setup();
        xt.extractText(input);
    } catch (IOException ioerr) {
        ioerr.printStackTrace();
        System.err.println("IO issue" + ioerr.getMessage());
    } catch (ConfigException cfgerr) {
        cfgerr.printStackTrace();
        System.err.println("Config issue" + cfgerr.getMessage());
    }
}
Also used : XText(org.opensextant.xtext.XText) ConfigException(org.opensextant.ConfigException) IOException(java.io.IOException)

Example 5 with ConfigException

use of org.opensextant.ConfigException in project Xponents by OpenSextant.

the class XlayerClientTest method main.

/**
 * Demonstrates the Xlayer client: sends the content of a text file to the service at the
 * given URL and prints each returned match to the console.
 *
 * <p>Exits with status 1 when fewer than two arguments are supplied.
 *
 * @param args {@code args[0]} is the service URL; {@code args[1]} is the path of the text
 *             file to process, which is also used as the document ID
 */
public static void main(String[] args) {
    // Fail with a usage hint instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: XlayerClientTest <service-url> <text-file>");
        System.exit(1);
    }
    URL url;
    try {
        url = new URL(args[0]);
        /*
         * Create client.
         */
        XlayerClient client = new XlayerClient(url);
        try {
            /*
             * Prepare request.  Text must be UTF-8 encoded.
             * Note -- readFile() here assumes the file is unicode content
             */
            String text = FileUtility.readFile(args[1]);
            String docid = args[1];
            /*
             * Process the text and print results to console.
             * Result is an array of TextMatch objects.  For each particular
             * TextMatch (Xponents Basic API), you have some common fields related to the
             * text found, and then class-specific fields and objects you need to evaluate yourself.
             *
             * The XlayerClient process() method makes use of Transforms helper class to
             * digest JSON annotations into Java API TextMatch objects of various flavors.
             */
            List<TextMatch> results = client.process(docid, text);
            for (TextMatch m : results) {
                System.out.println(String.format("Found %s %s @ (%d:%d)", m.getType(), m.getText(), m.start, m.end));
            }
        } catch (Exception parseErr) {
            // Reading the file or processing the text failed.
            parseErr.printStackTrace();
        }
    } catch (MalformedURLException e) {
        // args[0] is not a valid URL.
        e.printStackTrace();
    } catch (ConfigException e) {
        // The client could not be created for the given URL.
        e.printStackTrace();
    }
}
Also used : MalformedURLException(java.net.MalformedURLException) XlayerClient(org.opensextant.xlayer.XlayerClient) ConfigException(org.opensextant.ConfigException) TextMatch(org.opensextant.extraction.TextMatch) URL(java.net.URL) MalformedURLException(java.net.MalformedURLException) ConfigException(org.opensextant.ConfigException)

Aggregations

ConfigException (org.opensextant.ConfigException): 28, IOException (java.io.IOException): 20, File (java.io.File): 5, URL (java.net.URL): 4, MimeTypeParseException (javax.activation.MimeTypeParseException): 3, MessagingException (javax.mail.MessagingException): 3, SolrServerException (org.apache.solr.client.solrj.SolrServerException): 3, PlaceGeocoder (org.opensextant.extractors.geo.PlaceGeocoder): 3, XText (org.opensextant.xtext.XText): 3, PSTFile (com.pff.PSTFile): 2, MalformedURLException (java.net.MalformedURLException): 2, HashSet (java.util.HashSet): 2, TextMatch (org.opensextant.extraction.TextMatch): 2, PersonNameFilter (org.opensextant.extractors.geo.rules.PersonNameFilter): 2, TaxonMatcher (org.opensextant.extractors.xtax.TaxonMatcher): 2, XTemporal (org.opensextant.extractors.xtemporal.XTemporal): 2, PSTException (com.pff.PSTException): 1, LongOpt (gnu.getopt.LongOpt): 1, InputStream (java.io.InputStream): 1, InputStreamReader (java.io.InputStreamReader): 1