use of io.anserini.index.NotStoredException in project Anserini by castorini.
the class ExtractAverageDocumentLength method main.
/**
 * Prints two average-document-length estimates for the field given on the command line.
 *
 * <p>The "exact" figure divides the field's sum of total term frequencies by its document
 * count. The "lossy" figure sums the per-document norm values (decoded with
 * {@code SmallFloat.byte4ToInt}) and divides by the same document count.
 *
 * <p>The index must consist of exactly one leaf (segment); otherwise a hint is printed to
 * stderr and the method returns without producing any statistics.
 *
 * @param args command-line arguments, parsed into {@code ExtractAverageDocumentLength.Args}
 * @throws NotStoredException if the index has no norms for the requested field
 * @throws Exception if the index cannot be opened or read
 */
public static void main(String[] args) throws Exception {
  ExtractAverageDocumentLength.Args myArgs = new ExtractAverageDocumentLength.Args();
  CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println(String.format("Example: %s %s", ExtractAverageDocumentLength.class.getSimpleName(), parser.printExample(OptionHandlerFilter.REQUIRED)));
    return;
  }

  // try-with-resources releases the reader and then the directory on every exit path,
  // including the early return for multi-segment indexes and the NotStoredException below.
  try (Directory dir = FSDirectory.open(Paths.get(myArgs.index));
       IndexReader reader = DirectoryReader.open(dir)) {
    if (reader.leaves().size() != 1) {
      System.err.println("There should be only one leaf, index the collection using the -optimize flag.");
      return;
    }

    LeafReader leafReader = reader.leaves().get(0).reader();

    // Read the field statistics once; they are reused by both reports.
    long sumTotalTermFreq = leafReader.getSumTotalTermFreq(myArgs.field);
    int docCount = leafReader.getDocCount(myArgs.field);

    System.out.println("# Exact avg doclength");
    System.out.println("SumTotalTermFreq: " + sumTotalTermFreq);
    System.out.println("DocCount: " + docCount);
    System.out.println("avg doclength: " + ((float) sumTotalTermFreq / (float) docCount));

    long sumDoclengths = 0;
    for (LeafReaderContext context : reader.leaves()) {
      NumericDocValues docValues = context.reader().getNormValues(myArgs.field);
      if (docValues == null) {
        throw new NotStoredException("Norms do not appear to have been indexed!");
      }
      while (docValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        // The norm is stored as a single byte; decode it back to an (approximate) length.
        sumDoclengths += SmallFloat.byte4ToInt((byte) docValues.longValue());
      }
    }

    System.out.println("\n# Lossy avg doclength, based on sum of norms (lossy doclength) of each doc");
    System.out.println("SumTotalTermFreq: " + sumDoclengths);
    // With exactly one leaf, the document count is the same value reported above.
    System.out.println("DocCount: " + docCount);
    System.out.println("avg doclength: " + ((float) sumDoclengths / (float) docCount));
  }
}
use of io.anserini.index.NotStoredException in project Anserini by castorini.
the class EndToEndTest method checkIndex.
/**
 * Verifies the index at {@code indexPath} against the ground truth installed by
 * {@code setCheckIndexGroundTruth()}.
 *
 * <p>First, every document is compared against {@code referenceDocs} ("raw" and "contents"
 * entries, where present) and, when {@code referenceDocTokens} is non-empty, against the
 * expected token list. Then Lucene's {@code CheckIndex} is run and the statistics of the
 * first segment are compared with the expected field-norm, term-index and stored-field
 * counts.
 *
 * @throws IOException if the index cannot be opened, read or checked
 */
@Test
public void checkIndex() throws IOException {
  // Subclasses will override this method and provide the ground truth.
  setCheckIndexGroundTruth();

  ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);

  // The directory, reader and checker are scoped with try-with-resources so that a failed
  // assertion or an I/O error does not leave the index (or its write lock) open.
  try (Directory dir = FSDirectory.open(Paths.get(this.indexPath))) {

    try (IndexReader reader = DirectoryReader.open(dir)) {
      assertEquals(docCount, reader.maxDoc());
      for (int i = 0; i < reader.maxDoc(); i++) {
        String collectionDocid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);
        if (referenceDocs.get(collectionDocid).get("raw") != null) {
          assertEquals(referenceDocs.get(collectionDocid).get("raw"), IndexReaderUtils.documentRaw(reader, collectionDocid));
        }
        if (referenceDocs.get(collectionDocid).get("contents") != null) {
          assertEquals(referenceDocs.get(collectionDocid).get("contents"), IndexReaderUtils.documentContents(reader, collectionDocid));
        }
        // check list of tokens by calling document vector
        if (!referenceDocTokens.isEmpty()) {
          try {
            List<String> docTokens = IndexReaderUtils.getDocumentTokens(reader, collectionDocid);
            assertEquals(referenceDocTokens.get(collectionDocid).get("contents"), docTokens);
          } catch (NotStoredException e) {
            // NOTE(review): the token comparison is silently skipped when the tokens cannot
            // be retrieved; kept as best-effort to preserve existing pass/fail behavior.
            e.printStackTrace();
          }
        }
      }
    }

    // The reader is closed before the checker is created, as in the original flow.
    try (CheckIndex checker = new CheckIndex(dir)) {
      checker.setInfoStream(new PrintStream(bos, false, IOUtils.UTF_8));
      if (VERBOSE) {
        checker.setInfoStream(System.out);
      }
      CheckIndex.Status indexStatus = checker.checkIndex();
      if (!indexStatus.clean) {
        System.out.println("CheckIndex failed");
        System.out.println(bos.toString(IOUtils.UTF_8));
        fail();
      }

      final CheckIndex.Status.SegmentInfoStatus seg = indexStatus.segmentInfos.get(0);
      assertTrue(seg.openReaderPassed);

      assertNotNull(seg.diagnostics);

      assertNotNull(seg.fieldNormStatus);
      assertNull(seg.fieldNormStatus.error);
      assertEquals(this.fieldNormStatusTotalFields, seg.fieldNormStatus.totFields);

      assertNotNull(seg.termIndexStatus);
      assertNull(seg.termIndexStatus.error);
      assertEquals(this.termIndexStatusTermCount, seg.termIndexStatus.termCount);
      assertEquals(this.termIndexStatusTotFreq, seg.termIndexStatus.totFreq);
      assertEquals(this.termIndexStatusTotPos, seg.termIndexStatus.totPos);

      assertNotNull(seg.storedFieldStatus);
      assertNull(seg.storedFieldStatus.error);
      assertEquals(this.storedFieldStatusTotalDocCounts, seg.storedFieldStatus.docCount);
      assertEquals(this.storedFieldStatusTotFields, seg.storedFieldStatus.totFields);

      assertTrue(seg.diagnostics.size() > 0);

      // Re-run the check restricted to the first segment only.
      final List<String> onlySegments = new ArrayList<>();
      onlySegments.add("_0");
      assertTrue(checker.checkIndex(onlySegments).clean);
    }
  }
}
use of io.anserini.index.NotStoredException in project Anserini by castorini.
the class ExtractNorms method main.
/**
 * Dumps the norm of the "contents" field for every document that has one to a
 * tab-separated file.
 *
 * <p>The output starts with the header line {@code docid\tnorm}; each following line holds
 * the index-wide Lucene document id (segment-local id plus the segment's {@code docBase})
 * and the norm decoded with {@code SmallFloat.byte4ToInt}.
 *
 * @param args command-line arguments, parsed into {@code Args} (index path and output path)
 * @throws NotStoredException if a segment has no norms for the "contents" field
 * @throws Exception if the index cannot be read or the output file cannot be opened
 */
public static void main(String[] args) throws Exception {
  Args myArgs = new Args();
  CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println(String.format("Example: %s %s", ExtractNorms.class.getSimpleName(), parser.printExample(OptionHandlerFilter.REQUIRED)));
    return;
  }

  // Resources are closed in reverse order (output, reader, directory) on every exit path,
  // so a missing-norms failure no longer leaks the index handles or the output stream.
  try (Directory dir = FSDirectory.open(Paths.get(myArgs.index));
       IndexReader reader = DirectoryReader.open(dir);
       PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)))) {
    out.println("docid\tnorm");
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      NumericDocValues docValues = leafReader.getNormValues("contents");
      if (docValues == null) {
        throw new NotStoredException("Norms do not appear to have been indexed!");
      }
      while (docValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        // docBase turns the segment-local doc id into an index-wide one.
        out.println(String.format("%d\t%d", docValues.docID() + context.docBase, SmallFloat.byte4ToInt((byte) docValues.longValue())));
      }
    }
    out.flush();
  }
}
Aggregations