use of org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType in project lucene-solr by apache.
the class TrecContentSourceTest method testTrecFeedDirAllTypes.
/**
 * Feeds a {@link TrecContentSource} from a directory unpacked from {@code trecdocs.zip} and
 * checks that every {@link ParsePathType} is encountered, that each document carries the
 * expected name, title, body and date, and that the source ends with
 * {@link NoMoreDataException} after exactly five documents.
 */
public void testTrecFeedDirAllTypes() throws Exception {
  // Unpack the sample documents into a fresh temporary directory.
  Path trecDir = createTempDir("trecFeedAllTypes");
  TestUtil.unzip(getDataInputStream("trecdocs.zip"), trecDir);

  // Configure the source: non-verbose, single pass, doc names without the iteration suffix.
  String docsDir = trecDir.toRealPath().toString().replace('\\', '/');
  Properties config = new Properties();
  config.setProperty("print.props", "false");
  config.setProperty("content.source.verbose", "false");
  config.setProperty("content.source.excludeIteration", "true");
  config.setProperty("docs.dir", docsDir);
  config.setProperty("trec.doc.parser", TrecParserByPath.class.getName());
  config.setProperty("content.source.forever", "false");

  TrecContentSource source = new TrecContentSource();
  source.setConfig(new Config(config));
  source.resetInputs();

  // Path types are removed from this set as they are seen; it must be empty at the end.
  HashSet<ParsePathType> pendingTypes = new HashSet<>();
  pendingTypes.addAll(Arrays.asList(ParsePathType.values()));

  // Arbitrary cap so that a broken source cannot make the test loop forever.
  final int maxDocs = 100;
  DocData doc = new DocData();
  int docCount = 0;
  boolean sourceExhausted = false;
  try {
    while (docCount < maxDocs) {
      doc = source.getNextDocData(doc);
      ++docCount;
      assertNotNull("doc data " + docCount + " should not be null!", doc);

      ParsePathType pathType = source.currPathType;
      pendingTypes.remove(pathType);
      switch (pathType) {
        case GOV2:
          assertDocData(doc, "TEST-000", "TEST-000 title", "TEST-000 text",
              source.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
          break;
        case FBIS:
          assertDocData(doc, "TEST-001", "TEST-001 Title", "TEST-001 text",
              source.parseDate("1 January 1991"));
          break;
        case FR94:
          // The expected title is null for this path type.
          assertDocData(doc, "TEST-002", null, "DEPARTMENT OF SOMETHING",
              source.parseDate("February 3, 1994"));
          break;
        case FT:
          assertDocData(doc, "TEST-003", "Test-003 title", "Some pub text",
              source.parseDate("980424"));
          break;
        case LATIMES:
          assertDocData(doc, "TEST-004", "Test-004 Title", "Some paragraph",
              source.parseDate("January 17, 1997, Sunday"));
          break;
        default:
          assertTrue("Should never get here!", false);
      }
    }
  } catch (NoMoreDataException e) {
    // Expected: this is how the source signals that it has run out of documents.
    sourceExhausted = true;
  }

  assertTrue("Should have gotten NoMoreDataException!", sourceExhausted);
  assertEquals("Wrong number of documents created by source!", 5, docCount);
  assertTrue("Did not see all types!", pendingTypes.isEmpty());
}
use of org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType in project lucene-solr by apache.
the class TrecContentSource method getNextDocData.
/**
 * Reads the next document from the current input and hands its raw text to the configured
 * {@code trecDocParser}.
 *
 * <p>Reading from the input happens while holding {@code lock}; parsing and the
 * byte/item accounting happen after the lock has been released.
 *
 * @param docData the instance passed on to the parser
 * @return whatever the parser returns for this document
 * @throws NoMoreDataException declared by this method; not thrown directly in this body
 * @throws IOException declared by this method; not thrown directly in this body
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
  final StringBuilder buffer = getDocBuffer();
  final ParsePathType pathTypeOfDoc;
  final String docName;

  // Only the reading part is guarded by the lock; parsing the collected text and
  // returning the DocData below runs outside of it.
  synchronized (lock) {
    if (reader == null) {
      openNextFile();
    }

    // Step 1: advance to the start-of-document marker (needed for every TREC format).
    buffer.setLength(0);
    read(buffer, DOC, false, false);

    // Capture the path type now, while still holding the lock, so that the parser
    // gets the value that was current while this document was being read.
    pathTypeOfDoc = currPathType;

    // Step 2: the document name (needed for every TREC format) sits between the
    // DOCNO marker and its terminator.
    buffer.setLength(0);
    read(buffer, DOCNO, true, false);
    final int nameStart = DOCNO.length();
    final int nameEnd = buffer.indexOf(TERMINATING_DOCNO, nameStart);
    final String rawName = buffer.substring(nameStart, nameEnd).trim();
    docName = excludeDocnameIteration ? rawName : rawName + "_" + iteration;

    // Step 3: collect everything up to the end-of-document marker.
    buffer.setLength(0);
    read(buffer, TERMINATING_DOC, false, true);
  }

  // Account for the raw character count handed to the parser; the resulting plain
  // body text may be shorter than this.
  addBytes(buffer.length());

  // From here on only per-call state is used. NOTE(review): the original comment says
  // this section relies on HtmlParser being thread safe - confirm against the parser.
  final DocData parsed = trecDocParser.parse(docData, docName, this, buffer, pathTypeOfDoc);
  addItem();
  return parsed;
}
Aggregations