Use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP: class PersistenceManager, method isXmlApplicationFile.
/**
 * Determine whether the URL contains a GATE application serialized
 * using XML.
 *
 * @param url The URL to check.
 * @return true if the URL refers to an xml serialized application,
 *         false otherwise.
 */
private static boolean isXmlApplicationFile(URL url) throws java.io.IOException {
  if (DEBUG) {
    System.out.println("Checking whether file is xml");
  }
  String firstLine;
  BufferedReader fileReader = null;
  try {
    fileReader = new BomStrippingInputStreamReader(url.openStream());
    firstLine = fileReader.readLine();
  } finally {
    if (fileReader != null)
      fileReader.close();
  }
  if (firstLine == null) {
    return false;
  }
  for (String startOfXml : STARTOFXMLAPPLICATIONFILES) {
    if (firstLine.length() >= startOfXml.length()
        && firstLine.substring(0, startOfXml.length()).equals(startOfXml)) {
      if (DEBUG) {
        System.out.println("isXMLApplicationFile = true");
      }
      return true;
    }
  }
  if (DEBUG) {
    System.out.println("isXMLApplicationFile = false");
  }
  return false;
}
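The same first-line probe can be reproduced outside PersistenceManager in a few standalone lines. The sketch below is illustrative only: it relies on the (InputStream) constructor and the BufferedReader-style readLine() seen above, and the file name it reads is hypothetical.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import gate.util.BomStrippingInputStreamReader;

public class FirstLineCheck {
  public static void main(String[] args) throws IOException {
    // Hypothetical path; any UTF-8 file with or without a BOM will do.
    String path = "application.xgapp";
    // The reader discards a leading byte-order mark, so the first-line
    // comparison is not thrown off by an invisible BOM character.
    try (BufferedReader reader =
        new BomStrippingInputStreamReader(new FileInputStream(path))) {
      String firstLine = reader.readLine();
      boolean looksLikeXml = firstLine != null && firstLine.startsWith("<?xml");
      System.out.println("XML application file? " + looksLikeXml);
    }
  }
}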
Use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP: class GazetteerLists, method load.
/**
 * Parse the definition and populate the array of list names.
 */
private void load() {
  log("Listing gazetteer lists", Project.MSG_VERBOSE);
  if (definition == null) {
    throw new BuildException("\"definition\" attribute is required for gazetteerlists");
  }
  log("definition file: " + definition, Project.MSG_VERBOSE);
  Set<String> lists = new HashSet<String>();
  BufferedReader in = null;
  try {
    if (encoding == null) {
      in = new BomStrippingInputStreamReader(new FileInputStream(definition));
    } else {
      in = new BomStrippingInputStreamReader(new FileInputStream(definition), encoding);
    }
    String line;
    while ((line = in.readLine()) != null) {
      int indexOfColon = line.indexOf(':');
      // Ignore lines that don't include a colon.
      if (indexOfColon > 0) {
        String listFile = line.substring(0, indexOfColon);
        lists.add(listFile);
        log("Found list file " + listFile, Project.MSG_VERBOSE);
      }
    }
  } catch (IOException ioe) {
    throw new BuildException("Error reading gazetteer definition file " + definition, ioe);
  } finally {
    IOUtils.closeQuietly(in);
  }
  listNames = lists.toArray(new String[lists.size()]);
}
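For context, each line of a GATE gazetteer definition file typically names a list file followed by colon-separated metadata such as major and minor types (for example city.lst:location:city), and load() keeps only the part before the first colon. The sketch below is a standalone illustration of that split with made-up entries, not code from gate-core.

import java.util.LinkedHashSet;
import java.util.Set;

public class DefinitionSplitDemo {
  public static void main(String[] args) {
    // Sample lines in the shape of a gazetteer definition file
    // (listFile:majorType[:minorType]); the concrete entries are invented.
    String[] sampleLines = {
        "city.lst:location:city",
        "person_first.lst:person_first",
        "this line has no colon and is ignored"
    };
    Set<String> lists = new LinkedHashSet<String>();
    for (String line : sampleLines) {
      int indexOfColon = line.indexOf(':');
      // Same rule as load(): keep only the text before the first colon.
      if (indexOfColon > 0) {
        lists.add(line.substring(0, indexOfColon));
      }
    }
    System.out.println(lists); // [city.lst, person_first.lst]
  }
}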
Use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP: class CorpusImpl, method populate.
/**
 * Fills the provided corpus with documents extracted from the
 * provided TREC file.
 *
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the TREC file.
 * @param documentRootElement text between the start and end of this
 *          element is used to create a new document.
 * @param encoding the encoding of the TREC file.
 * @param numberOfDocumentsToExtract the number of documents to
 *          extract from the TREC file; -1 to extract all of them.
 * @param documentNamePrefix the prefix used for the names of the created documents.
 * @param mimeType the MIME type which determines how the document is handled
 * @param includeRootElement whether the root element itself is kept in the document content
 * @return total length in bytes of the documents added to the corpus
 * @throws java.io.IOException
 */
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
  StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
  // obtain the root element that the user has provided; content between
  // the start and end of the root element is used to create documents
  documentRootElement = documentRootElement.toLowerCase();
  // the document name prefix may be an empty string
  documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";
  // we start a new document when we find <documentRootElement> and
  // close it when we find </documentRootElement>
  BufferedReader br = null;
  try {
    if (encoding != null && encoding.trim().length() != 0) {
      br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), encoding, 10485760);
    } else {
      br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), 10485760);
    }
    // reading line by line
    String line = br.readLine();
    // this is where we store the document content
    StringBuilder documentString = new StringBuilder();
    // toggle switch to indicate a search for the start element
    boolean searchingForStartElement = true;
    // keeping count of the number of documents extracted
    int count = 1;
    // length in bytes read so far (to return)
    long lengthInBytes = 0;
    // continue until we reach the end of the file
    while (line != null) {
      // lowercase the line in order to match documentRootElement in any case
      String lowerCasedLine = line.toLowerCase();
      if (searchingForStartElement) {
        // the start tag may carry attributes
        int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");
        // or it may have none
        if (index == -1) {
          index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
        }
        if (index != -1) {
          // if found, this is the first line of the new document
          line = line.substring(index);
          searchingForStartElement = false;
        } else {
          // otherwise skip the current line and read the next one
          line = br.readLine();
        }
      } else {
        // now searching for the end element
        int index = lowerCasedLine.indexOf("</" + documentRootElement + ">");
        // if not found, this line is part of the current document's content
        if (index == -1) {
          documentString.append(line + "\n");
          line = br.readLine();
        } else {
          // found, so end the document
          documentString.append(line.substring(0, index + documentRootElement.length() + 3));
          // getting ready for the next document
          searchingForStartElement = true;
          // create the new document
          if (sListener != null)
            sListener.statusChanged("Creating Document Number :" + count);
          String docName = documentNamePrefix + count + "_" + Gate.genSym();
          String docContent = documentString.toString();
          if (!includeRootElement)
            docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
          FeatureMap params = Factory.newFeatureMap();
          if (mimeType != null)
            params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
          params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
          if (encoding != null && encoding.trim().length() > 0)
            params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
          // calculate the length
          lengthInBytes += docContent.getBytes().length;
          try {
            Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
            count++;
            corpus.add(doc);
            if (corpus.getLRPersistenceId() != null) {
              // persistent corpus -> unload the document
              corpus.unloadDocument(doc);
              Factory.deleteResource(doc);
            }
            // already extracted the requested number of documents?
            if ((count - 1) == numberOfDocumentsToExtract)
              break;
          } catch (Throwable t) {
            String nl = Strings.getNl();
            Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + " Document name was: " + docName + nl + " Exception was: " + t + nl + nl);
            t.printStackTrace();
          }
          documentString = new StringBuilder();
          if (sListener != null)
            sListener.statusChanged(docName + " created!");
          line = line.substring(index + documentRootElement.length() + 3);
          if (line.trim().equals(""))
            line = br.readLine();
        }
      }
    }
    return lengthInBytes;
  } finally {
    if (br != null)
      br.close();
  }
}
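For orientation, a call to this overload of populate might look like the sketch below. It is a minimal, assumption-laden illustration rather than gate-core code: it assumes the usual Gate.init() and Factory.newCorpus() bootstrap, and the trec.xml file with documents wrapped in <DOC>...</DOC> elements is hypothetical.

import java.io.File;
import java.net.URL;
import gate.Corpus;
import gate.Factory;
import gate.Gate;
import gate.corpora.CorpusImpl;

public class PopulateDemo {
  public static void main(String[] args) throws Exception {
    Gate.init();
    Corpus corpus = Factory.newCorpus("trecCorpus");
    // Hypothetical concatenated file; each document sits between <DOC> tags.
    URL trecFile = new File("trec.xml").toURI().toURL();
    long bytes = CorpusImpl.populate(corpus,
        trecFile,
        "DOC",        // documentRootElement (matched case-insensitively)
        "UTF-8",      // encoding
        -1,           // numberOfDocumentsToExtract: -1 means all
        "trec",       // documentNamePrefix
        "text/xml",   // mimeType
        true);        // includeRootElement
    System.out.println("Populated " + corpus.size() + " documents, " + bytes + " bytes");
  }
}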
Use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP: class TestRepositioningInfo, method testRepositioningInfo.
/**
 * This method tests whether repositioning information works.
 * It creates a document from an XML file, with the preserveOriginalContent
 * and collectRepositioningInfo options set to true, which contains all
 * sorts of special entities such as & and ", as well as both
 * Unix and DOS style newline characters. It then saves the
 * document to a temporary location on disk using the
 * "save preserving document format" option and compares the contents of
 * the original and the saved document to check that they are equal.
 * @throws java.lang.Exception
 */
public void testRepositioningInfo() throws Exception {
  // here we need to save the document to a file
  String encoding = ((DocumentImpl) doc).getEncoding();
  File outputFile = File.createTempFile("test-inline1", "xml");
  OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outputFile), encoding);
  writer.write(doc.toXml(null, true));
  writer.flush();
  writer.close();
  Reader readerForSource = new BomStrippingInputStreamReader(new URL(testFile).openStream(), encoding);
  Reader readerForDesti = new BomStrippingInputStreamReader(new FileInputStream(outputFile), encoding);
  while (true) {
    int input1 = readerForSource.read();
    int input2 = readerForDesti.read();
    if (input1 < 0 || input2 < 0) {
      assertTrue(input1 < 0 && input2 < 0);
      readerForSource.close();
      readerForDesti.close();
      outputFile.delete();
      return;
    } else {
      assertEquals(input1, input2);
    }
  }
}
Use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP: class TestDocument, method testLotsOfThings.
/**
 * A comprehensive test
 */
public void testLotsOfThings() {
  // check that the test URL is available
  URL u = null;
  try {
    u = new URL(testServer + testDocument1);
  } catch (Exception e) {
    e.printStackTrace(Err.getPrintWriter());
  }
  // get some text out of the test URL
  BufferedReader uReader = null;
  try {
    uReader = new BomStrippingInputStreamReader(u.openStream());
    assertEquals(uReader.readLine(), "<HTML>");
  } catch (UnknownHostException e) {
    // no network connection
    return;
  } catch (IOException e) {
    fail(e.toString());
  }
  /*
  Document doc = new TextualDocument(testServer + testDocument1);
  AnnotationGraph ag = new AnnotationGraphImpl();
  Tokeniser t = ... doc.getContent()
  tokenise doc using java stream tokeniser
  add several thousand token annotation
  select a subset
  */
}
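Taken together, the snippets above use four constructor shapes: (InputStream), (InputStream, encoding), (InputStream, bufferSize) and (InputStream, encoding, bufferSize). The self-contained sketch below is an illustration rather than gate-core code; it writes a small UTF-8 file that starts with a byte-order mark and reads it back, showing that the reader returns the content with the BOM already removed.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import gate.util.BomStrippingInputStreamReader;

public class BomDemo {
  public static void main(String[] args) throws Exception {
    // Write "hello" preceded by the UTF-8 byte-order mark EF BB BF.
    File f = File.createTempFile("bom-demo", ".txt");
    try (FileOutputStream out = new FileOutputStream(f)) {
      out.write(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
      out.write("hello".getBytes("UTF-8"));
    }
    // Read it back through the BOM-stripping reader.
    try (BufferedReader in =
        new BomStrippingInputStreamReader(new FileInputStream(f), "UTF-8")) {
      String line = in.readLine();
      // Prints "hello" with length 5: the BOM has been stripped and does
      // not appear as an invisible first character.
      System.out.println(line + " (length " + line.length() + ")");
    }
    f.delete();
  }
}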