Use of io.anserini.document.SourceDocument in project Anserini by castorini.
From the class LuceneDocumentGenerator, method createDocument.
/**
 * Builds a Lucene {@link Document} from a source document.
 *
 * <p>The resulting document carries the id in {@code FIELD_ID} (stored), the raw content in
 * {@code FIELD_RAW} when {@code args.storeRawDocs} is set, and the (optionally transformed)
 * content in {@code FIELD_BODY}, indexed according to {@code args.storePositions},
 * {@code args.storeDocvectors} and {@code args.storeTransformedDocs}.
 *
 * @param src the source document supplying the id and content
 * @return the populated document, or {@code null} when the document is skipped: either
 *     extracting/transforming the content threw (counted in {@code counters.errors}) or the
 *     resulting content was null or blank (counted in {@code counters.emptyDocuments})
 */
public Document createDocument(SourceDocument src) {
  String id = src.id();
  String raw;
  String contents;
  try {
    // Read the raw content once; it is reused below for the stored raw field.
    raw = src.content();
    // If there's a transform, use it.
    contents = transform != null ? transform.apply(raw) : raw;
  } catch (Exception e) {
    LOG.error("Error extracting document text, skipping document: " + id, e);
    counters.errors.incrementAndGet();
    return null;
  }

  // A null result (from the source or from the transform) is treated like an empty document
  // rather than being allowed to raise a NullPointerException outside the try block above.
  if (contents == null || contents.trim().isEmpty()) {
    LOG.info("Empty document: " + id);
    counters.emptyDocuments.incrementAndGet();
    return null;
  }

  // make a new, empty document
  Document document = new Document();

  // document id
  document.add(new StringField(FIELD_ID, id, Field.Store.YES));

  if (args.storeRawDocs) {
    document.add(new StoredField(FIELD_RAW, raw));
  }

  FieldType fieldType = new FieldType();
  fieldType.setStored(args.storeTransformedDocs);

  // Are we storing document vectors?
  if (args.storeDocvectors) {
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
  }

  // Are we building a "positional" or "count" index?
  if (args.storePositions) {
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  } else {
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  }

  document.add(new Field(FIELD_BODY, contents, fieldType));
  return document;
}
Aggregations