Search in sources :

Example 1 with ParsePathType

use of org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType in project lucene-solr by apache.

The method testTrecFeedDirAllTypes of the class TrecContentSourceTest.

/**
 * Runs a {@link TrecContentSource} over a directory unpacked from {@code trecdocs.zip}
 * and checks that each {@link ParsePathType} value is encountered, that every document
 * matches the fixture expected for its path type, and that the source stops with a
 * {@link NoMoreDataException} after exactly five documents.
 */
public void testTrecFeedDirAllTypes() throws Exception {
    // Unpack the fixture archive into a fresh temporary directory.
    Path trecDir = createTempDir("trecFeedAllTypes");
    TestUtil.unzip(getDataInputStream("trecdocs.zip"), trecDir);

    TrecContentSource source = new TrecContentSource();

    // Configure the source: quiet, single pass, doc names without the iteration suffix.
    Properties settings = new Properties();
    settings.setProperty("print.props", "false");
    settings.setProperty("content.source.verbose", "false");
    settings.setProperty("content.source.excludeIteration", "true");
    // Use forward slashes in the docs directory regardless of platform separator.
    String docsDir = trecDir.toRealPath().toString().replace('\\', '/');
    settings.setProperty("docs.dir", docsDir);
    settings.setProperty("trec.doc.parser", TrecParserByPath.class.getName());
    settings.setProperty("content.source.forever", "false");
    source.setConfig(new Config(settings));
    source.resetInputs();

    // Path types not yet observed; starts out holding every enum constant.
    HashSet<ParsePathType> pendingTypes = new HashSet<>(Arrays.asList(ParsePathType.values()));
    DocData doc = new DocData();
    int docCount = 0;
    boolean sourceExhausted = false;
    try {
        // Arbitrary upper bound so that a failing test cannot loop forever.
        while (docCount < 100) {
            doc = source.getNextDocData(doc);
            docCount++;
            assertNotNull("doc data " + docCount + " should not be null!", doc);
            pendingTypes.remove(source.currPathType);
            switch (source.currPathType) {
                case GOV2:
                    assertDocData(doc, "TEST-000", "TEST-000 title", "TEST-000 text",
                            source.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
                    break;
                case FBIS:
                    assertDocData(doc, "TEST-001", "TEST-001 Title", "TEST-001 text",
                            source.parseDate("1 January 1991"));
                    break;
                case FR94:
                    // This source does not extract a title for now, hence the null.
                    assertDocData(doc, "TEST-002", null, "DEPARTMENT OF SOMETHING",
                            source.parseDate("February 3, 1994"));
                    break;
                case FT:
                    assertDocData(doc, "TEST-003", "Test-003 title", "Some pub text",
                            source.parseDate("980424"));
                    break;
                case LATIMES:
                    assertDocData(doc, "TEST-004", "Test-004 Title", "Some paragraph",
                            source.parseDate("January 17, 1997, Sunday"));
                    break;
                default:
                    assertTrue("Should never get here!", false);
            }
        }
    } catch (NoMoreDataException e) {
        // Expected: signals that the source has been exhausted.
        sourceExhausted = true;
    }

    assertTrue("Should have gotten NoMoreDataException!", sourceExhausted);
    assertEquals("Wrong number of documents created by source!", 5, docCount);
    assertTrue("Did not see all types!", pendingTypes.isEmpty());
}
Also used : Path(java.nio.file.Path) Config(org.apache.lucene.benchmark.byTask.utils.Config) ParsePathType(org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType) Properties(java.util.Properties) HashSet(java.util.HashSet)

Example 2 with ParsePathType

use of org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType in project lucene-solr by apache.

The method getNextDocData of the class TrecContentSource.

/**
 * Reads the raw text of the next document while holding {@code lock}, then hands it
 * to {@code trecDocParser} outside the lock.
 *
 * @param docData passed through to the parser
 * @return whatever {@code trecDocParser.parse} returns for this document
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final StringBuilder rawDoc = getDocBuffer();
    final ParsePathType pathTypeOfDoc;
    String docName;
    // Only the reading of the input is done under the lock; the rest of the
    // method, i.e., parsing the content and returning the DocData, runs unprotected.
    synchronized (lock) {
        if (reader == null) {
            openNextFile();
        }

        // Step 1: skip ahead to the document start - required for all TREC formats.
        rawDoc.setLength(0);
        read(rawDoc, DOC, false, false);

        // Capture the current path type now so it can be handed to the parser after
        // the synchronized block, in case another thread opens another file meanwhile.
        pathTypeOfDoc = currPathType;

        // Step 2: the document name - required for all TREC formats.
        rawDoc.setLength(0);
        read(rawDoc, DOCNO, true, false);
        int nameStart = DOCNO.length();
        int nameEnd = rawDoc.indexOf(TERMINATING_DOCNO, nameStart);
        docName = rawDoc.substring(nameStart, nameEnd).trim();
        if (!excludeDocnameIteration) {
            docName += "_" + iteration;
        }

        // Step 3: read everything up to the end of the document.
        rawDoc.setLength(0);
        read(rawDoc, TERMINATING_DOC, false, true);
    }

    // Account for the char length of the text to be parsed (this may exceed the
    // length of the resulting plain document body text).
    addBytes(rawDoc.length());

    // This code segment relies on HtmlParser being thread safe. By this point
    // everything else is already private to the calling thread.
    DocData parsed = trecDocParser.parse(docData, docName, this, rawDoc, pathTypeOfDoc);
    addItem();
    return parsed;
}
Also used : ParsePathType(org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType)

Aggregations

ParsePathType (org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType): 2, Path (java.nio.file.Path): 1, HashSet (java.util.HashSet): 1, Properties (java.util.Properties): 1, Config (org.apache.lucene.benchmark.byTask.utils.Config): 1