Search in sources :

Example 1 with NotStoredException

use of io.anserini.index.NotStoredException in project Anserini by castorini.

the class ExtractAverageDocumentLength method main.

/**
 * Prints two estimates of the average document length for the configured field of a Lucene index:
 * an exact one derived from the field's term statistics, and a lossy one derived from summing the
 * per-document norms decoded with {@code SmallFloat.byte4ToInt}.
 *
 * <p>The index must consist of exactly one leaf (segment); otherwise an error is printed and the
 * method returns without computing anything.
 *
 * @param args command-line arguments, parsed by args4j into {@code Args}
 * @throws Exception if the index cannot be opened or read, or (as {@code NotStoredException}) if
 *     the field has no norms
 */
public static void main(String[] args) throws Exception {
    ExtractAverageDocumentLength.Args myArgs = new ExtractAverageDocumentLength.Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println(String.format("Example: %s %s", ExtractAverageDocumentLength.class.getSimpleName(), parser.printExample(OptionHandlerFilter.REQUIRED)));
        return;
    }
    // try-with-resources guarantees the reader and directory are released on every exit path,
    // including the early return below and the NotStoredException thrown inside the loop.
    try (Directory dir = FSDirectory.open(Paths.get(myArgs.index));
         IndexReader reader = DirectoryReader.open(dir)) {
        if (reader.leaves().size() != 1) {
            System.err.println("There should be only one leaf, index the collection using the -optimize flag.");
            return;
        }
        LeafReader leafReader = reader.leaves().get(0).reader();
        System.out.println("# Exact avg doclength");
        System.out.println("SumTotalTermFreq: " + leafReader.getSumTotalTermFreq(myArgs.field));
        System.out.println("DocCount:         " + leafReader.getDocCount(myArgs.field));
        System.out.println("avg doclength:    " + ((float) leafReader.getSumTotalTermFreq(myArgs.field) / (float) leafReader.getDocCount(myArgs.field)));
        long sumDoclengths = 0;
        for (LeafReaderContext context : reader.leaves()) {
            leafReader = context.reader();
            NumericDocValues docValues = leafReader.getNormValues(myArgs.field);
            if (docValues == null) {
                throw new NotStoredException("Norms do not appear to have been indexed!");
            }
            // Each norm value is narrowed to a byte and decoded back into an (approximate) length.
            while (docValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                sumDoclengths += SmallFloat.byte4ToInt((byte) docValues.longValue());
            }
        }
        System.out.println("\n# Lossy avg doclength, based on sum of norms (lossy doclength) of each doc");
        System.out.println("SumTotalTermFreq: " + sumDoclengths);
        // leafReader is the last (and, given the single-leaf check above, only) leaf at this point.
        System.out.println("DocCount:         " + leafReader.getDocCount(myArgs.field));
        System.out.println("avg doclength:    " + ((float) sumDoclengths / (float) leafReader.getDocCount(myArgs.field)));
    }
}
Also used : NumericDocValues(org.apache.lucene.index.NumericDocValues) IndexArgs(io.anserini.index.IndexArgs) CmdLineParser(org.kohsuke.args4j.CmdLineParser) LeafReader(org.apache.lucene.index.LeafReader) NotStoredException(io.anserini.index.NotStoredException) IndexReader(org.apache.lucene.index.IndexReader) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) CmdLineException(org.kohsuke.args4j.CmdLineException) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory)

Example 2 with NotStoredException

use of io.anserini.index.NotStoredException in project Anserini by castorini.

the class EndToEndTest method checkIndex.

/**
 * Verifies the index at {@code indexPath} against the ground truth installed by
 * {@code setCheckIndexGroundTruth()}: first document by document (raw text, contents and, when
 * reference tokens are provided, the token list), then structurally through Lucene's
 * {@code CheckIndex}.
 *
 * @throws IOException if the index cannot be opened or read
 */
@Test
public void checkIndex() throws IOException {
    // Subclasses will override this method and provide the ground truth.
    setCheckIndexGroundTruth();
    ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
    // try-with-resources releases the directory, reader and checker even when an assertion fails.
    try (Directory dir = FSDirectory.open(Paths.get(this.indexPath))) {
        // The reader lives in its own scope so that it is closed before CheckIndex is created,
        // matching the original ordering.
        try (IndexReader reader = DirectoryReader.open(dir)) {
            assertEquals(docCount, reader.maxDoc());
            for (int i = 0; i < reader.maxDoc(); i++) {
                String collectionDocid = IndexReaderUtils.convertLuceneDocidToDocid(reader, i);
                if (referenceDocs.get(collectionDocid).get("raw") != null) {
                    assertEquals(referenceDocs.get(collectionDocid).get("raw"), IndexReaderUtils.documentRaw(reader, collectionDocid));
                }
                if (referenceDocs.get(collectionDocid).get("contents") != null) {
                    assertEquals(referenceDocs.get(collectionDocid).get("contents"), IndexReaderUtils.documentContents(reader, collectionDocid));
                }
                // check list of tokens by calling document vector
                if (!referenceDocTokens.isEmpty()) {
                    try {
                        List<String> docTokens = IndexReaderUtils.getDocumentTokens(reader, collectionDocid);
                        assertEquals(referenceDocTokens.get(collectionDocid).get("contents"), docTokens);
                    } catch (NotStoredException e) {
                        // NOTE(review): this skips the token comparison for the document instead of
                        // failing the test; kept as-is because it may be deliberate best-effort
                        // behavior -- confirm whether it should call fail() instead.
                        e.printStackTrace();
                    }
                }
            }
        }
        try (CheckIndex checker = new CheckIndex(dir)) {
            checker.setInfoStream(new PrintStream(bos, false, IOUtils.UTF_8));
            if (VERBOSE) {
                checker.setInfoStream(System.out);
            }
            CheckIndex.Status indexStatus = checker.checkIndex();
            if (!indexStatus.clean) {
                System.out.println("CheckIndex failed");
                System.out.println(bos.toString(IOUtils.UTF_8));
                fail();
            }
            // Only the first segment's statistics are compared against the ground truth.
            final CheckIndex.Status.SegmentInfoStatus seg = indexStatus.segmentInfos.get(0);
            assertTrue(seg.openReaderPassed);
            assertNotNull(seg.diagnostics);
            assertNotNull(seg.fieldNormStatus);
            assertNull(seg.fieldNormStatus.error);
            assertEquals(this.fieldNormStatusTotalFields, seg.fieldNormStatus.totFields);
            assertNotNull(seg.termIndexStatus);
            assertNull(seg.termIndexStatus.error);
            assertEquals(this.termIndexStatusTermCount, seg.termIndexStatus.termCount);
            assertEquals(this.termIndexStatusTotFreq, seg.termIndexStatus.totFreq);
            assertEquals(this.termIndexStatusTotPos, seg.termIndexStatus.totPos);
            assertNotNull(seg.storedFieldStatus);
            assertNull(seg.storedFieldStatus.error);
            assertEquals(this.storedFieldStatusTotalDocCounts, seg.storedFieldStatus.docCount);
            assertEquals(this.storedFieldStatusTotFields, seg.storedFieldStatus.totFields);
            assertTrue(seg.diagnostics.size() > 0);
            // Re-run the check restricted to the segment named "_0".
            final List<String> onlySegments = new ArrayList<>();
            onlySegments.add("_0");
            assertTrue(checker.checkIndex(onlySegments).clean);
        }
    }
}
Also used : PrintStream(java.io.PrintStream) ArrayList(java.util.ArrayList) ByteArrayOutputStream(java.io.ByteArrayOutputStream) NotStoredException(io.anserini.index.NotStoredException) IndexReader(org.apache.lucene.index.IndexReader) CheckIndex(org.apache.lucene.index.CheckIndex) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Test(org.junit.Test)

Example 3 with NotStoredException

use of io.anserini.index.NotStoredException in project Anserini by castorini.

the class ExtractNorms method main.

/**
 * Dumps the norm of every document in the {@code "contents"} field of a Lucene index to a
 * tab-separated file with the header {@code docid\tnorm}. The docid column is the leaf-local
 * document id plus the leaf's {@code docBase}; the norm column is the stored norm decoded with
 * {@code SmallFloat.byte4ToInt}.
 *
 * @param args command-line arguments, parsed by args4j into {@code Args}
 * @throws Exception if the index or output file cannot be opened, or (as
 *     {@code NotStoredException}) if a leaf has no norms for the field
 */
public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println(String.format("Example: %s %s", ExtractNorms.class.getSimpleName(), parser.printExample(OptionHandlerFilter.REQUIRED)));
        return;
    }
    // Resources are closed in reverse order (out, reader, dir) on every exit path, so the output
    // is flushed and the index released even if NotStoredException or an I/O error is thrown.
    try (Directory dir = FSDirectory.open(Paths.get(myArgs.index));
         IndexReader reader = DirectoryReader.open(dir);
         PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)))) {
        out.println("docid\tnorm");
        for (LeafReaderContext context : reader.leaves()) {
            LeafReader leafReader = context.reader();
            NumericDocValues docValues = leafReader.getNormValues("contents");
            if (docValues == null) {
                throw new NotStoredException("Norms do not appear to have been indexed!");
            }
            while (docValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                // docBase offsets the leaf-local doc id.
                out.println(String.format("%d\t%d", docValues.docID() + context.docBase, SmallFloat.byte4ToInt((byte) docValues.longValue())));
            }
        }
        out.flush();
    }
}
Also used : PrintStream(java.io.PrintStream) NumericDocValues(org.apache.lucene.index.NumericDocValues) CmdLineParser(org.kohsuke.args4j.CmdLineParser) LeafReader(org.apache.lucene.index.LeafReader) NotStoredException(io.anserini.index.NotStoredException) FileOutputStream(java.io.FileOutputStream) IndexReader(org.apache.lucene.index.IndexReader) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) File(java.io.File) CmdLineException(org.kohsuke.args4j.CmdLineException) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory)

Aggregations

NotStoredException (io.anserini.index.NotStoredException)3 IndexReader (org.apache.lucene.index.IndexReader)3 Directory (org.apache.lucene.store.Directory)3 FSDirectory (org.apache.lucene.store.FSDirectory)3 PrintStream (java.io.PrintStream)2 LeafReader (org.apache.lucene.index.LeafReader)2 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)2 NumericDocValues (org.apache.lucene.index.NumericDocValues)2 CmdLineException (org.kohsuke.args4j.CmdLineException)2 CmdLineParser (org.kohsuke.args4j.CmdLineParser)2 IndexArgs (io.anserini.index.IndexArgs)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 ArrayList (java.util.ArrayList)1 CheckIndex (org.apache.lucene.index.CheckIndex)1 Test (org.junit.Test)1