Search in sources :

Example 1 with BomStrippingInputStreamReader

use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP.

the class PersistenceManager method isXmlApplicationFile.

/**
 * Checks whether the resource behind a URL looks like a GATE application
 * that was serialized as XML.
 * <p>
 * Only the first line of the resource is read (through a
 * BomStrippingInputStreamReader); it is then compared against every prefix
 * listed in STARTOFXMLAPPLICATIONFILES.
 *
 * @param url The URL to check.
 * @return true if the first line starts with one of the known XML
 *         application prefixes; false otherwise, including when the
 *         resource yields no line at all.
 * @throws java.io.IOException if the URL cannot be opened or read.
 */
private static boolean isXmlApplicationFile(URL url) throws java.io.IOException {
    if (DEBUG) {
        System.out.println("Checking whether file is xml");
    }
    BufferedReader reader = null;
    String header;
    try {
        reader = new BomStrippingInputStreamReader(url.openStream());
        header = reader.readLine();
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    // An empty resource cannot be an XML application; no debug output here.
    if (header == null) {
        return false;
    }
    boolean isXml = false;
    for (String prefix : STARTOFXMLAPPLICATIONFILES) {
        if (header.startsWith(prefix)) {
            isXml = true;
            break;
        }
    }
    if (DEBUG) {
        System.out.println(isXml ? "isXMLApplicationFile = true" : "isXMLApplicationFile = false");
    }
    return isXml;
}
Also used : BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) BufferedReader(java.io.BufferedReader)

Example 2 with BomStrippingInputStreamReader

use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP.

the class GazetteerLists method load.

/**
 * Reads the gazetteer definition file and stores the distinct list file
 * names it mentions in {@code listNames}.
 * <p>
 * Each line is expected to have the form {@code listFile:...}; the text
 * before the first colon is taken as the list file name. Lines without a
 * colon, or with a colon in the very first position, are skipped.
 *
 * @throws BuildException if the "definition" attribute has not been set or
 *           the definition file cannot be read.
 */
private void load() {
    log("Listing gazetteer lists", Project.MSG_VERBOSE);
    if (definition == null) {
        throw new BuildException("\"definition\" attribute is required for gazetteerlists");
    }
    log("definition file: " + definition, Project.MSG_VERBOSE);
    Set<String> found = new HashSet<String>();
    BufferedReader reader = null;
    try {
        FileInputStream stream = new FileInputStream(definition);
        // Fall back to the reader's default encoding when none was configured.
        reader = (encoding == null)
                ? new BomStrippingInputStreamReader(stream)
                : new BomStrippingInputStreamReader(stream, encoding);
        for (String entry = reader.readLine(); entry != null; entry = reader.readLine()) {
            int colonAt = entry.indexOf(':');
            // Skip entries with no colon, or with nothing before it.
            if (colonAt <= 0) {
                continue;
            }
            String listFile = entry.substring(0, colonAt);
            found.add(listFile);
            log("Found list file " + listFile, Project.MSG_VERBOSE);
        }
    } catch (IOException ioe) {
        throw new BuildException("Error reading gazetteer definition file " + definition, ioe);
    } finally {
        IOUtils.closeQuietly(reader);
    }
    listNames = found.toArray(new String[found.size()]);
}
Also used : BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) BufferedReader(java.io.BufferedReader) BuildException(org.apache.tools.ant.BuildException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) HashSet(java.util.HashSet)

Example 3 with BomStrippingInputStreamReader

use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP.

the class CorpusImpl method populate.

/**
 * Fills the provided corpus with documents extracted from a single
 * concatenated file (e.g. a trec file).
 * <p>
 * The file is scanned line by line. A document starts at the first
 * occurrence of {@code <documentRootElement>} (with or without attributes)
 * and ends at the next {@code </documentRootElement>}; matching of the
 * element name is case-insensitive. Several documents may start or end on
 * the same line. A document that cannot be instantiated is reported on
 * {@code Err} and skipped; it does not count towards
 * {@code numberOfDocumentsToExtract}.
 *
 * @param corpus the corpus to be populated.
 * @param singleConcatenatedFile the trec file.
 * @param documentRootElement text between this element (start and
 *          end) is considered for creating a new document.
 * @param encoding the encoding of the trec file; if null or blank the
 *          reader is created without an explicit encoding.
 * @param numberOfDocumentsToExtract extracts the specified number of
 *          documents from the trecweb file; -1 to indicate all files.
 * @param documentNamePrefix prefix for generated document names; it is
 *          trimmed and followed by "_". A null prefix yields no prefix at
 *          all.
 * @param mimeType the mime type which determines how the document is
 *          handled; not set on the document if null.
 * @param includeRootElement if false, the opening and closing root element
 *          tags are stripped from the document content.
 * @return total length of populated documents in the corpus in number
 *         of bytes. NOTE(review): the count uses String.getBytes() with no
 *         charset argument, i.e. the platform default charset rather than
 *         {@code encoding} - confirm this is intended.
 * @throws java.io.IOException if the file cannot be opened or read.
 */
public static long populate(Corpus corpus, URL singleConcatenatedFile, String documentRootElement, String encoding, int numberOfDocumentsToExtract, String documentNamePrefix, String mimeType, boolean includeRootElement) throws IOException {
    // May be null if no status listener is registered; guarded before each use.
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    // obtain the root element that user has provided
    // content between the start and end of root element is considered
    // for creating documents; lower-cased so it can be matched against
    // the lower-cased copy of each line
    documentRootElement = documentRootElement.toLowerCase();
    // document name prefix could be an empty string; any non-null prefix
    // (including an empty one) gets a trailing "_"
    documentNamePrefix = documentNamePrefix == null ? "" : documentNamePrefix.trim() + "_";
    // we start a new document when we find <documentRootElement> and
    // close it when we find </documentRootElement>
    BufferedReader br = null;
    try {
        // presumably 10485760 (10 * 1024 * 1024) is the reader's buffer size
        // - TODO confirm against the BomStrippingInputStreamReader constructors
        if (encoding != null && encoding.trim().length() != 0) {
            br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), encoding, 10485760);
        } else {
            br = new BomStrippingInputStreamReader(singleConcatenatedFile.openStream(), 10485760);
        }
        // reading line by line
        String line = br.readLine();
        // this is where we store document content
        StringBuilder documentString = new StringBuilder();
        // toggle switch to indicate search for start element
        boolean searchingForStartElement = true;
        // 1-based number of the next document to create; (count - 1) is the
        // number of documents successfully created so far
        int count = 1;
        // length in bytes read so far (to return)
        long lengthInBytes = 0;
        // continue until reached the end of file
        while (line != null) {
            // lowercase the line in order to match documentRootElement in any case
            String lowerCasedLine = line.toLowerCase();
            // if searching for startElement?
            if (searchingForStartElement) {
                // may be its with attributes
                int index = lowerCasedLine.indexOf("<" + documentRootElement + " ");
                // may be no attributes?
                if (index == -1) {
                    index = lowerCasedLine.indexOf("<" + documentRootElement + ">");
                }
                if (index != -1) {
                    // if found, that's the first line: drop everything before the
                    // start tag and re-process the same line in "search for end" mode
                    line = line.substring(index);
                    searchingForStartElement = false;
                } else {
                    // no start tag here: skip the current line and read the next one
                    line = br.readLine();
                }
            } else {
                // now searching for last element
                int index = lowerCasedLine.indexOf("</" + documentRootElement + ">");
                // if not found.. this is the content of a new document
                if (index == -1) {
                    documentString.append(line + "\n");
                    line = br.readLine();
                } else {
                    // found.. then end the document; the "+ 3" covers the "</" and
                    // ">" characters surrounding the element name in the end tag
                    documentString.append(line.substring(0, index + documentRootElement.length() + 3));
                    // getting ready for the next document
                    searchingForStartElement = true;
                    // here lets create a new document create the doc
                    if (sListener != null)
                        sListener.statusChanged("Creating Document Number :" + count);
                    String docName = documentNamePrefix + count + "_" + Gate.genSym();
                    String docContent = documentString.toString();
                    // strip the root element: keep only what lies between the first
                    // ">" and the last "<" of the collected text
                    if (!includeRootElement)
                        docContent = docContent.substring(docContent.indexOf(">") + 1, docContent.lastIndexOf("<"));
                    FeatureMap params = Factory.newFeatureMap();
                    if (mimeType != null)
                        params.put(Document.DOCUMENT_MIME_TYPE_PARAMETER_NAME, mimeType);
                    params.put(Document.DOCUMENT_STRING_CONTENT_PARAMETER_NAME, docContent);
                    if (encoding != null && encoding.trim().length() > 0)
                        params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
                    // calculate the length (added even if document creation fails below)
                    lengthInBytes += docContent.getBytes().length;
                    try {
                        Document doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName);
                        // only successfully created documents are counted
                        count++;
                        corpus.add(doc);
                        if (corpus.getLRPersistenceId() != null) {
                            // persistent corpus -> unload the document
                            corpus.unloadDocument(doc);
                            Factory.deleteResource(doc);
                        }
                        // already extracted requested num of documents?
                        // (never true for -1, so all documents are extracted)
                        if ((count - 1) == numberOfDocumentsToExtract)
                            break;
                    } catch (Throwable t) {
                        // a failing document is reported and skipped; processing
                        // continues with the rest of the file
                        String nl = Strings.getNl();
                        Err.prln("WARNING: Corpus.populate could not instantiate document" + nl + "  Document name was: " + docName + nl + "  Exception was: " + t + nl + nl);
                        t.printStackTrace();
                    }
                    // start collecting the next document from scratch
                    documentString = new StringBuilder();
                    if (sListener != null)
                        sListener.statusChanged(docName + " created!");
                    // keep whatever follows the end tag on this line, since it may
                    // contain the start of the next document
                    line = line.substring(index + documentRootElement.length() + 3);
                    if (line.trim().equals(""))
                        line = br.readLine();
                }
            }
        }
        return lengthInBytes;
    } finally {
        if (br != null)
            br.close();
    }
}
Also used : FeatureMap(gate.FeatureMap) BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) BufferedReader(java.io.BufferedReader) StatusListener(gate.event.StatusListener) Document(gate.Document)

Example 4 with BomStrippingInputStreamReader

use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP.

the class TestRepositioningInfo method testRepositioningInfo.

// tearDown
/**
 * This method tests if repositioning information works.
 * It creates a document using an xml file with preserveOriginalContent
 * and collectRepositioningInfo options set to true and which has all
 * sorts of special entities like &amp;amp;, &amp;quot; etc. + it contains both
 * kinds of unix and dos style new line characters.  It then saves the
 * document to a temporary location on the disk using the
 * "save preserving document format" option and then compares the contents of
 * both the original and the temporary document to see if they are equal.
 * <p>
 * The readers are closed and the temporary file is deleted even when the
 * comparison fails, so a failing run does not leak file handles or leave
 * files behind.
 *
 * @throws java.lang.Exception if the document cannot be written or either
 *           stream cannot be read.
 */
public void testRepositioningInfo() throws Exception {
    // here we need to save the document to the file
    String encoding = ((DocumentImpl) doc).getEncoding();
    File outputFile = File.createTempFile("test-inline1", "xml");
    try {
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outputFile), encoding);
        try {
            writer.write(doc.toXml(null, true));
            writer.flush();
        } finally {
            writer.close();
        }
        Reader readerForSource = new BomStrippingInputStreamReader(new URL(testFile).openStream(), encoding);
        try {
            Reader readerForDesti = new BomStrippingInputStreamReader(new FileInputStream(outputFile), encoding);
            try {
                // compare both streams character by character
                while (true) {
                    int input1 = readerForSource.read();
                    int input2 = readerForDesti.read();
                    if (input1 < 0 || input2 < 0) {
                        // one stream is exhausted: both must end at the same point
                        assertTrue(input1 < 0 && input2 < 0);
                        break;
                    }
                    assertEquals(input1, input2);
                }
            } finally {
                readerForDesti.close();
            }
        } finally {
            readerForSource.close();
        }
    } finally {
        // best-effort cleanup; the result of delete() is deliberately ignored
        outputFile.delete();
    }
}
Also used : BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) FileOutputStream(java.io.FileOutputStream) Reader(java.io.Reader) BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) OutputStreamWriter(java.io.OutputStreamWriter) DocumentImpl(gate.corpora.DocumentImpl) File(java.io.File) URL(java.net.URL) FileInputStream(java.io.FileInputStream)

Example 5 with BomStrippingInputStreamReader

use of gate.util.BomStrippingInputStreamReader in project gate-core by GateNLP.

the class TestDocument method testLotsOfThings.

// testOriginalContentPreserving()
/**
 * A comprehensive test.
 * <p>
 * Currently it only checks that the first line of the test document served
 * from {@code testServer + testDocument1} is {@code <HTML>}. If the host
 * cannot be resolved (no network connection) the test returns silently.
 */
public void testLotsOfThings() {
    // check that the test URL is available
    URL u = null;
    try {
        u = new URL(testServer + testDocument1);
    } catch (Exception e) {
        e.printStackTrace(Err.getPrintWriter());
        // without a URL the rest of the test would only hit a
        // NullPointerException, so fail with the real cause instead
        fail("Could not build test URL: " + e);
    }
    // get some text out of the test URL
    BufferedReader uReader = null;
    try {
        uReader = new BomStrippingInputStreamReader(u.openStream());
        assertEquals("<HTML>", uReader.readLine());
    } catch (UnknownHostException e) {
        // no network connection
        return;
    } catch (IOException e) {
        fail(e.toString());
    } finally {
        if (uReader != null) {
            try {
                uReader.close();
            } catch (IOException ignored) {
                // best-effort close; the assertion above has already run
            }
        }
    }
/*
    Document doc = new TextualDocument(testServer + testDocument1);
    AnnotationGraph ag = new AnnotationGraphImpl();

    Tokeniser t = ...   doc.getContent()
    tokenise doc using java stream tokeniser

    add several thousand token annotation
    select a subset
    */
}
Also used : BomStrippingInputStreamReader(gate.util.BomStrippingInputStreamReader) UnknownHostException(java.net.UnknownHostException) URL(java.net.URL) UnknownHostException(java.net.UnknownHostException)

Aggregations

BomStrippingInputStreamReader (gate.util.BomStrippingInputStreamReader)5 BufferedReader (java.io.BufferedReader)3 FileInputStream (java.io.FileInputStream)2 URL (java.net.URL)2 Document (gate.Document)1 FeatureMap (gate.FeatureMap)1 DocumentImpl (gate.corpora.DocumentImpl)1 StatusListener (gate.event.StatusListener)1 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Reader (java.io.Reader)1 UnknownHostException (java.net.UnknownHostException)1 HashSet (java.util.HashSet)1 BuildException (org.apache.tools.ant.BuildException)1