use of org.opensextant.ConfigException in project Xponents by OpenSextant.
the class XText method main.
/**
 * Command-line entry point for XText.
 *
 * <p>Recognized long options: {@code --input}, {@code --output}, {@code --export},
 * {@code --strip-prefix}, {@code --help}, {@code --clean-html}, {@code --embed-conversion},
 * {@code --embed-children} and {@code --tika-pst}. {@code --input} is required; when neither
 * {@code --output} nor {@code --embed-conversion} is given, a folder named {@code output}
 * is created and used.
 *
 * <p>Prints usage and exits with a non-zero status on {@code --help}, on an unrecognized
 * option, or when {@code --input} is missing.
 *
 * @param args command-line arguments
 */
public static void main(String[] args) {
    LongOpt[] options = {
        new LongOpt("input", LongOpt.REQUIRED_ARGUMENT, null, 'i'),
        new LongOpt("output", LongOpt.REQUIRED_ARGUMENT, null, 'o'),
        new LongOpt("export", LongOpt.REQUIRED_ARGUMENT, null, 'x'),
        new LongOpt("strip-prefix", LongOpt.REQUIRED_ARGUMENT, null, 'p'),
        new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
        new LongOpt("clean-html", LongOpt.NO_ARGUMENT, null, 'H'),
        new LongOpt("embed-conversion", LongOpt.NO_ARGUMENT, null, 'e'),
        new LongOpt("embed-children", LongOpt.NO_ARGUMENT, null, 'c'),
        new LongOpt("tika-pst", LongOpt.NO_ARGUMENT, null, 'T')
    };
    // A short-option string such as "hcex:i:o:p:" is NOT passed here; the option string
    // is empty, so the long options above are the ones this program is set up to parse.
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("XText", args, "", options);
    String input = null;
    String output = null;
    boolean embed = false;
    boolean filter_html = false;
    boolean saveChildrenWithInput = false;
    String saveChildrenTo = null;
    String prefix = null;
    XText xt = new XText();
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch (c) {
                case 0:
                    break;
                case 'i':
                    input = opts.getOptarg();
                    break;
                case 'o':
                    output = opts.getOptarg();
                    break;
                case 'H':
                    filter_html = true;
                    break;
                case 'c':
                    saveChildrenWithInput = true;
                    break;
                case 'x':
                    saveChildrenTo = opts.getOptarg();
                    break;
                case 'p':
                    prefix = opts.getOptarg();
                    break;
                case 'e':
                    embed = true;
                    System.out.println("Saving conversions to Input folder. Output folder will be ignored.");
                    break;
                case 'T':
                    xt.enableTikaPST(true);
                    break;
                case 'h':
                    // --help intentionally falls through to the usage/exit path.
                default:
                    XText.usage();
                    System.exit(1);
            }
        }
    } catch (Exception err) {
        XText.usage();
        System.exit(1);
    }
    if (input == null) {
        // The input is read from the --input option parsed above, not from a -D system property.
        System.err.println("An input argument is required, e.g., --input=/Folder/...");
        System.exit(-1);
    }
    // Setting LANG=en_US in your shell.
    //
    // System.setProperty("LANG", "en_US");

    // This main program overwrites previous output every time XText is called.
    xt.enableOverwrite(true);
    xt.enableSaving(embed || output != null);
    // With --embed-conversion, conversions are saved with the input instead of the output folder.
    xt.getPathManager().enableSaveWithInput(embed);
    xt.enableHTMLScrubber(filter_html);
    xt.getPathManager().enableSaveChildrenWithInput(saveChildrenWithInput);
    // If user wishes to strip input paths of some prefix
    // Output will be dumped in the resulting relative path.
    xt.getPathManager().setStripPrefixPath(prefix);
    // ... others?
    // --export only applies when children are not being saved with the input.
    if (!saveChildrenWithInput && saveChildrenTo != null) {
        xt.getPathManager().setExtractedChildrenCache(saveChildrenTo);
    }
    try {
        if (!embed) {
            if (output == null) {
                output = "output";
                // Will save to output dir.
                xt.enableSaving(true);
                FileUtility.makeDirectory(output);
                xt.getPathManager().setConversionCache(output);
                System.out.println("Default output folder is $PWD/" + output);
            } else {
                xt.enableSaving(true);
                // Notice this main program requires an output path.
                xt.getPathManager().setConversionCache(output);
            }
        }
        // Set itself to listen, as this is the main program.
        xt.setConversionListener(new MainProgramListener());
        xt.setup();
        xt.extractText(input);
    } catch (IOException ioerr) {
        XText.usage();
        ioerr.printStackTrace();
    } catch (ConfigException cfgerr) {
        XText.usage();
        cfgerr.printStackTrace();
    }
}
use of org.opensextant.ConfigException in project Xponents by OpenSextant.
the class DefaultMailCrawl method collect.
/**
 * Collects messages from the mailbox: connects, fetches the available messages, saves each
 * qualifying message to a file under a folder created for the current date, optionally flags
 * messages for deletion according to the configuration, and finally disconnects.
 *
 * <p>Collection stops early when the configuration reports that enough messages were read,
 * when more than 10 errors have accumulated, or when the mail folder is closed mid-session.
 *
 * TODO:
 *
 * pull all mail messages,
 * - create reasonable FILE.msg file name
 * - use XText to iterate over each msg file for conversion
 * - reimplement
 *
 * @throws IOException on failure to connect or collect.
 * @throws ConfigException declared by the signature; not raised directly by the code in this
 *         method (presumably originates from saving/conversion -- confirm against callees).
 */
@Override
public void collect() throws IOException, ConfigException {
    File dateFolder = createDateFolder(new Date());
    if (dateFolder == null) {
        // dateFolder is null here, so there is no path worth printing.
        log.error("Unable to create the date folder for this collection run");
        return;
    }
    Message[] messages = null;
    try {
        connect();
        messages = getMessages();
        if (messages == null) {
            log.info("No messages available - Exiting MailClient now");
            disconnect();
            return;
        }
    } catch (MessagingException javaMailErr) {
        throw new IOException("Unable to connect or get messages", javaMailErr);
    }
    int readCount = 0;
    int totalCount = 0;
    int available = messages.length;
    int errCount = 0;
    //
    for (Message message : messages) {
        ++totalCount;
        // Exit if too many errors.
        if (errCount > 10) {
            break;
        }
        try {
            if (config.doneReading(available, readCount)) {
                // Done here.
                break;
            }
            /**
             * Silently ignore deleted messages; items deleted while we were
             * in session
             */
            if (message.isExpunged()) {
                log.info("Message deleted during session; Unable to collect. Mail Subj: {}", message.getSubject());
                continue;
            }
            boolean newMessage = !message.isSet(Flags.Flag.SEEN);
            // Read the subject once; the raw value (possibly null) is what gets logged.
            String rawSubject = message.getSubject();
            log.debug("Message Subject: {} new?: {}", rawSubject, newMessage);
            boolean setForDeleteNow = false;
            String subject = rawSubject;
            if (rawSubject == null) {
                log.info("Empty message title MSG number={}", message.getMessageNumber());
                // A missing subject is not a reason to skip the message; use a placeholder.
                subject = "No_Subject";
            }
            if (!config.isReadNewMessagesOnly() || newMessage) {
                // 1. Identify the email message.
                // and determine if you need to capture it again.
                //
                String messageFilename = MessageConverter.createSafeFilename(subject);
                if (messageFilename.length() > 60) {
                    messageFilename = messageFilename.substring(0, 60);
                }
                String msgId = MessageConverter.getMessageID(message);
                if (msgId == null) {
                    log.error("How can a message ID be null? SUBJ={}", rawSubject);
                    continue;
                }
                msgId = MessageConverter.getShorterMessageID(msgId);
                try {
                    if (listener != null && listener.exists(msgId)) {
                        // The listener reports this message ID as already present; skip it.
                        continue;
                    }
                } catch (Exception err1) {
                    log.error("Collection error with mail.", err1);
                    continue;
                }
                readCount++;
                if (log.isDebugEnabled()) {
                    log.debug("Message: {}", rawSubject);
                    String msg = String.format("Processing message: %s / %s of available: %s", readCount, totalCount, available);
                    log.debug(msg);
                }
                // Save file in archive, Convert it, etc.
                int status = saveMessageToFile(dateFolder, message, msgId, messageFilename);
                if (status < 0) {
                    ++errCount;
                }
                if (!config.isReadOnly() && config.isDeleteOnRead()) {
                    message.setFlag(Flags.Flag.DELETED, true);
                    if (log.isDebugEnabled()) {
                        String dbg = String.format("Processing message: %d / %d of available:%d", readCount, totalCount, available);
                        log.debug(dbg);
                    }
                    setForDeleteNow = true;
                }
            }
            // Old (already seen) messages may be deleted too, unless already flagged above.
            if (!newMessage && config.isDeleteOld() && !setForDeleteNow && !config.isReadOnly()) {
                message.setFlag(Flags.Flag.DELETED, true);
                log.debug("Deleting message: #{}", totalCount);
            }
        } catch (javax.mail.FolderClosedException connErr) {
            // Once the folder is closed, every remaining message would fail the same way;
            // report it and stop instead of silently burning through the error budget.
            log.error("Mail folder closed while reading message #{}; stopping collection", totalCount, connErr);
            ++errCount;
            break;
        } catch (MessagingException me) {
            log.error("Failed to read messsage #{}", totalCount, me);
            ++errCount;
        } catch (IOException writeErr) {
            log.error("Failed to write msg.eml #{}", totalCount, writeErr);
            ++errCount;
        }
    }
    // Error on close is likely rare.
    try {
        disconnect();
    } catch (Exception javaMailErrOnClose) {
        log.error("Unkosher disconnect", javaMailErrOnClose);
    }
}
use of org.opensextant.ConfigException in project Xponents by OpenSextant.
the class OutlookPSTCrawler method configure.
/**
 * Resolves the PST output folder and logs the effective settings.
 *
 * <p>Beware -- callers may set the PST output path itself (outputPSTDir) or only its parent
 * (outputDir). Outside apps may want to control the path setup. To use the default:
 * setOutputDir(); configure();
 *
 * <p>When outputPSTDir is not set, it is derived as {@code outputDir/defaultOutputName}.
 * The parent must already exist; the target is created when absent, and an existing target
 * is rejected unless incremental mode is on.
 *
 * @throws ConfigException if output folder could not be set
 */
public void configure() throws ConfigException {
    final boolean targetPreset = (outputPSTDir != null);
    if (!targetPreset) {
        if (outputDir == null) {
            throw new ConfigException("Output Dir is not configured");
        }
        if (!outputDir.exists()) {
            throw new ConfigException("Please create containing output directory. DIR does not exist:" + outputDir.getAbsolutePath());
        }

        // Derive the default target beneath the (existing) parent folder.
        String targetPath = outputDir.getAbsolutePath() + "/" + defaultOutputName;
        outputPSTDir = new File(targetPath);

        if (outputPSTDir.exists()) {
            // A pre-existing target is only acceptable in incremental mode.
            if (!incrementalMode) {
                throw new ConfigException("Output Dir contains target, but you are not in overwrite mode");
            }
        } else {
            try {
                FileUtility.makeDirectory(outputPSTDir);
            } catch (IOException mkdirErr) {
                throw new ConfigException("Unable to create target", mkdirErr);
            }
        }
    }

    // Report the effective configuration.
    log.info(" Input: PST = " + pst.getAbsolutePath());
    log.info(" Modes: Incremental =" + incrementalMode);
    log.info(" Modes: Overwrite =" + overwriteMode);
    log.info(" Output: Target " + outputPSTDir);
}
use of org.opensextant.ConfigException in project Xponents by OpenSextant.
the class TestTikaPST method main.
/**
 * Compare Tika's PST conversion to XText non-Tika PST conversion.
 *
 * <p>Converts the PST file named by the first argument with Tika PST handling enabled,
 * saving output under {@code ./xtext-testing}.
 *
 * @param args {@code args[0]} is the path to a PST file; required
 */
public static void main(String[] args) {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1) {
        System.err.println("Usage: TestTikaPST <path-to-pst-file>");
        System.exit(1);
        return;
    }
    // Path to a PST. NOTE, java-libpst provides some good test data.
    String input = args[0];
    try {
        XText xt = new XText();
        xt.enableSaving(true);
        xt.enableTikaPST(true);
        // Do not save conversions alongside the input; use the conversion cache set below.
        xt.getPathManager().enableSaveWithInput(false);
        // Restrict this run to the "pst" file type.
        xt.clearSettings();
        xt.convertFileType("pst");
        xt.getPathManager().setConversionCache("./xtext-testing");
        xt.setup();
        xt.extractText(input);
    } catch (IOException ioerr) {
        ioerr.printStackTrace();
        System.err.println("IO issue: " + ioerr.getMessage());
    } catch (ConfigException cfgerr) {
        cfgerr.printStackTrace();
        System.err.println("Config issue: " + cfgerr.getMessage());
    }
}
use of org.opensextant.ConfigException in project Xponents by OpenSextant.
the class XlayerClientTest method main.
/**
 * Demonstrates the Xlayer client: sends the content of a text file to the service at the
 * given URL and prints each returned match to the console.
 *
 * @param args {@code args[0]} is the service URL; {@code args[1]} is the path of the text
 *             file to process (also used as the document ID). Both are required.
 */
public static void main(String[] args) {
    // Validate up front: a missing URL used to crash with ArrayIndexOutOfBoundsException and a
    // missing file argument was swallowed by the broad catch around processing.
    if (args == null || args.length < 2) {
        System.err.println("Usage: XlayerClientTest <service-url> <text-file>");
        System.exit(1);
        return;
    }
    URL url;
    try {
        url = new URL(args[0]);
        /*
         * Create client.
         */
        XlayerClient c = new XlayerClient(url);
        try {
            /*
             * Prepare request. Text must be UTF-8 encoded.
             * Note -- readFile() here assumes the file is unicode content
             *
             */
            String text = FileUtility.readFile(args[1]);
            String docid = args[1];
            /*
             * Process the text and print results to console.
             * Result is an array of TextMatch objects. For each particular
             * TextMatch (Xponents Basic API), you have some common fields related to the
             * text found, and then class-specific fields and objects you need to evaluate yourself.
             *
             * The XlayerClient process() method makes use of Transforms helper class to
             * digest JSON annotations into Java API TextMatch objects of various flavors.
             */
            List<TextMatch> results = c.process(docid, text);
            for (TextMatch m : results) {
                System.out.println(String.format("Found %s %s @ (%d:%d)", m.getType(), m.getText(), m.start, m.end));
            }
        } catch (Exception parseErr) {
            // Any failure while reading the file or processing the response is reported here.
            parseErr.printStackTrace();
        }
    } catch (MalformedURLException e) {
        // args[0] is not a well-formed URL.
        e.printStackTrace();
    } catch (ConfigException e) {
        // Reported when the client cannot be set up for the given URL.
        e.printStackTrace();
    }
}
Aggregations