Search in sources :

Example 1 with StreamSource

use of org.opengrok.indexer.analysis.StreamSource in project OpenGrok by OpenGrok.

the class StreamUtils method sourceFromEmbedded.

/**
 * Builds a {@code StreamSource} whose stream is read from a resource
 * located through the class loader of {@code StreamUtils}.
 * <p>
 * The resource is looked up each time {@code getStream()} is called, and
 * the lookup is asserted to be non-null at that point.
 * @param resourceName a required resource name
 * @return a stream source that reads from {@code resourceName}, wrapping
 * the resource stream in a {@code BufferedInputStream}
 */
public static StreamSource sourceFromEmbedded(String resourceName) {
    return new StreamSource() {

        @Override
        public InputStream getStream() {
            ClassLoader loader = StreamUtils.class.getClassLoader();
            InputStream embedded = loader.getResourceAsStream(resourceName);
            // Fail here rather than hand a null stream to the buffered wrapper.
            assertNotNull(embedded, "resource " + resourceName);
            return new BufferedInputStream(embedded);
        }
    };
}
Also used : BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) StreamSource(org.opengrok.indexer.analysis.StreamSource)

Example 2 with StreamSource

use of org.opengrok.indexer.analysis.StreamSource in project OpenGrok by OpenGrok.

the class SourceSplitterTest method shouldHandleStreamedDocsOfLongerLength.

/**
 * Verifies split count, split offsets, and line lookup for a CRLF-separated
 * document of five lines whose last line has no terminator.
 */
@Test
public void shouldHandleStreamedDocsOfLongerLength() throws IOException {
    // 0             0
    // 0-- -  5-- - -1--- - 5--- - 2-
    final String INPUT = "ab\r\ncde\r\nefgh\r\nijk\r\nlm";
    SourceSplitter splitter = new SourceSplitter();
    splitter.reset(StreamSource.fromString(INPUT));

    assertEquals(5, splitter.count(), "split count");

    // Expected start offset of each line, followed by the end-of-input offset.
    final int[] expectedOffsets = {0, 4, 9, 15, 20, 22};
    for (int line = 0; line < expectedOffsets.length; ++line) {
        assertEquals(expectedOffsets[line], splitter.getOffset(line), "split offset");
    }

    /*
     * Cross-check findLineIndex() for every character against a running
     * count of the LFs seen strictly before that character, so an LF
     * itself still belongs to the line that it terminates.
     */
    long lfSeenBefore = 0;
    for (int pos = 0; pos < splitter.originalLength(); ++pos) {
        char current = INPUT.charAt(pos);
        int lineIndex = splitter.findLineIndex(pos);
        assertEquals(lfSeenBefore, lineIndex, "split find-index of " + pos);
        if (current == '\n') {
            ++lfSeenBefore;
        }
    }
}
Also used : Test(org.junit.jupiter.api.Test) IOException(java.io.IOException) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) StreamSource(org.opengrok.indexer.analysis.StreamSource)

Example 3 with StreamSource

use of org.opengrok.indexer.analysis.StreamSource in project OpenGrok by OpenGrok.

the class LineBreakerTest method shouldSplitDocsWithNoLastLF.

/**
 * Verifies that a two-line document whose final line lacks a line feed is
 * still reported as two splits, with a closing offset at the input length.
 */
@Test
public void shouldSplitDocsWithNoLastLF() throws IOException {
    brkr.reset(StreamSource.fromString("abc\r\ndef"));

    assertEquals(2, brkr.count(), "split count");

    // Start of "abc", start of "def", then the end-of-input offset.
    final int[] expectedOffsets = {0, 5, 8};
    for (int line = 0; line < expectedOffsets.length; ++line) {
        assertEquals(expectedOffsets[line], brkr.getOffset(line), "split offset");
    }
}
Also used : StreamSource(org.opengrok.indexer.analysis.StreamSource) Test(org.junit.jupiter.api.Test)

Example 4 with StreamSource

use of org.opengrok.indexer.analysis.StreamSource in project OpenGrok by OpenGrok.

the class LineBreakerTest method shouldSplitEmptyStringIntoOneLine.

/**
 * Verifies that empty input counts as a single line starting at offset
 * zero, and that looking up a position past that line yields -1.
 */
@Test
public void shouldSplitEmptyStringIntoOneLine() throws IOException {
    final String emptyInput = "";
    brkr.reset(StreamSource.fromString(emptyInput));

    assertEquals(1, brkr.count(), "split count");
    assertEquals(0, brkr.getOffset(0), "split offset");

    // Position 0 maps to the sole line; position 1 is expected to be unmapped.
    assertEquals(0, brkr.findLineIndex(0), "split find-index");
    assertEquals(-1, brkr.findLineIndex(1), "split find-index");
}
Also used : StreamSource(org.opengrok.indexer.analysis.StreamSource) Test(org.junit.jupiter.api.Test)

Example 5 with StreamSource

use of org.opengrok.indexer.analysis.StreamSource in project OpenGrok by OpenGrok.

the class GZIPAnalyzer method analyze.

/**
 * Analyzes a document whose path ends in ".gz" (case-insensitively) by
 * delegating to whichever analyzer {@code AnalyzerGuru} selects for the
 * wrapped stream and the path with its ".gz" suffix removed.
 * <p>
 * Documents with no path, or with a path lacking the suffix, are left
 * untouched apart from the initial {@code wrap(src)} call.
 * NOTE(review): {@code wrap} presumably yields a decompressing source;
 * confirm against its definition.
 * @param doc the document being populated; its path field is read, and its
 * type field may be replaced
 * @param src the source of the original (wrapped) content
 * @param xrefOut passed through to the delegate analyzer
 * @throws IOException if reading the wrapped stream or the delegate fails
 * @throws InterruptedException if thrown by the delegate analyzer
 */
@Override
public void analyze(Document doc, StreamSource src, Writer xrefOut) throws IOException, InterruptedException {
    StreamSource gzSrc = wrap(src);
    String path = doc.get(QueryBuilder.PATH);
    if (path == null || !path.toLowerCase(Locale.ROOT).endsWith(".gz")) {
        return;
    }

    // Drop the three-character ".gz" suffix to get the name used for detection.
    String newname = path.substring(0, path.length() - 3);
    AbstractAnalyzer fa;
    try (InputStream gzis = gzSrc.getStream()) {
        fa = AnalyzerGuru.getAnalyzer(gzis, newname);
    }

    if (fa == null) {
        this.g = Genre.DATA;
        LOGGER.log(Level.WARNING, "Did not analyze {0}, detected as data.", newname);
        // TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
        return;
    }

    // Simple gzipped-file case: plain or xref-able content is exposed as xref-able.
    boolean xrefable = fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE;
    this.g = xrefable ? Genre.XREFABLE : Genre.DATA;

    fa.analyze(doc, gzSrc, xrefOut);

    if (doc.get(QueryBuilder.T) == null) {
        return;
    }
    // Replace the type field written by the delegate; it is re-added only for xref-able content.
    doc.removeField(QueryBuilder.T);
    if (g == Genre.XREFABLE) {
        doc.add(new Field(QueryBuilder.T, g.typeName(), AnalyzerGuru.string_ft_stored_nanalyzed_norms));
    }
}
Also used : Field(org.apache.lucene.document.Field) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) AbstractAnalyzer(org.opengrok.indexer.analysis.AbstractAnalyzer) StreamSource(org.opengrok.indexer.analysis.StreamSource)

Aggregations

StreamSource (org.opengrok.indexer.analysis.StreamSource)16 Test (org.junit.jupiter.api.Test)8 InputStream (java.io.InputStream)7 BufferedInputStream (java.io.BufferedInputStream)4 IOException (java.io.IOException)4 BufferedReader (java.io.BufferedReader)2 File (java.io.File)2 Reader (java.io.Reader)2 Field (org.apache.lucene.document.Field)2 AbstractAnalyzer (org.opengrok.indexer.analysis.AbstractAnalyzer)2 ExpandTabsReader (org.opengrok.indexer.analysis.ExpandTabsReader)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 InputStreamReader (java.io.InputStreamReader)1 StringWriter (java.io.StringWriter)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)1 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)1 Document (org.apache.lucene.document.Document)1 BytesRef (org.apache.lucene.util.BytesRef)1 CBZip2InputStream (org.apache.tools.bzip2.CBZip2InputStream)1