Search in sources :

Example 1 with IndexArgs

use of io.anserini.index.IndexArgs in project Anserini by castorini.

the class CoreGeneratorTest method setUp.

/**
 * Assembles a synthetic CORE JSON record, wraps it in a {@code CoreCollection.Document}, and runs
 * it through a {@code CoreGenerator} constructed from a fresh {@code IndexArgs}. The wrapped
 * record is stored in {@code coreDoc} and the generated document in {@code doc}.
 *
 * @throws Exception if building the source document or generating the output document fails
 */
@Before
public void setUp() throws Exception {
    ObjectMapper jsonMapper = new ObjectMapper();

    // Nested objects are assembled first so the top-level record reads as a flat field list.
    ObjectNode relations = jsonMapper.createObjectNode();
    relations.set("sample", TextNode.valueOf("text"));

    ObjectNode documentType = jsonMapper.createObjectNode();
    documentType.set("type", NullNode.getInstance());
    documentType.set("confidence", NullNode.getInstance());

    ObjectNode enrichments = jsonMapper.createObjectNode();
    enrichments.set("references", jsonMapper.createArrayNode());
    enrichments.set("documentType", documentType);

    ObjectNode root = jsonMapper.createObjectNode();

    // Identifiers.
    root.set("coreId", TextNode.valueOf("id_text"));
    root.set("doi", TextNode.valueOf("doi_text"));
    root.set("oai", TextNode.valueOf("oai_text"));

    // Bibliographic content.
    root.set("title", TextNode.valueOf("every startup ever"));
    root.set("abstract", TextNode.valueOf("machine learning blockchain quantum vr"));
    root.set("year", IntNode.valueOf(2020));
    root.set("authors", jsonMapper.createArrayNode().add("Elon Musk").add("Mark Zuckerberg"));
    root.set("contributors", jsonMapper.createArrayNode());
    root.set("publisher", NullNode.getInstance());
    root.set("datePublished", TextNode.valueOf("2020-01-01"));
    root.set("pdfHashValue", TextNode.valueOf("abc"));
    root.set("downloadUrl", NullNode.getInstance());

    // Classification and miscellaneous fields, including empty and null values.
    root.set("topics", jsonMapper.createArrayNode().add("Machine Learning").add("Blockchain"));
    root.set("subjects", jsonMapper.createArrayNode().add("Quantum").add("VR"));
    root.set("journals", jsonMapper.createArrayNode().add("journal"));
    root.set("identifiers", jsonMapper.createArrayNode());
    root.set("language", jsonMapper.createObjectNode());
    root.set("relations", relations);
    root.set("fullTextIdentifier", NullNode.getInstance());
    root.set("enrichments", enrichments);

    coreDoc = new CoreCollection.Document(root);
    IndexArgs defaultArgs = new IndexArgs();
    CoreGenerator coreGenerator = new CoreGenerator(defaultArgs);
    doc = coreGenerator.createDocument(coreDoc);
}
Also used : ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) CoreCollection(io.anserini.collection.CoreCollection) IndexArgs(io.anserini.index.IndexArgs) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Before(org.junit.Before)

Example 2 with IndexArgs

use of io.anserini.index.IndexArgs in project Anserini by castorini.

the class C4EndToEndTest method getIndexArgs.

/**
 * Supplies the indexing settings for this test: the default arguments from
 * {@code createDefaultIndexArgs()}, pointed at the C4 sample documents, using the C4 collection
 * and generator classes, with shard count 2 and current shard 1.
 *
 * @return the configured {@code IndexArgs}
 */
@Override
protected IndexArgs getIndexArgs() {
    IndexArgs c4Args = createDefaultIndexArgs();

    // Input location and the classes used to read and convert it.
    c4Args.input = "src/test/resources/sample_docs/c4";
    c4Args.collectionClass = C4Collection.class.getSimpleName();
    c4Args.generatorClass = C4Generator.class.getSimpleName();

    // Shard selection.
    c4Args.shardCount = 2;
    c4Args.shardCurrent = 1;

    return c4Args;
}
Also used : C4Generator(io.anserini.index.generator.C4Generator) C4Collection(io.anserini.collection.C4Collection) IndexArgs(io.anserini.index.IndexArgs)

Example 3 with IndexArgs

use of io.anserini.index.IndexArgs in project Anserini by castorini.

the class EndToEndTest method createDefaultIndexArgs.

/**
 * Creates a new {@code IndexArgs} with positions, document vectors, contents and raw storage
 * enabled, and with the {@code optimize} and {@code quiet} flags switched on.
 *
 * @return a freshly constructed {@code IndexArgs} carrying these defaults
 */
protected IndexArgs createDefaultIndexArgs() {
    IndexArgs defaults = new IndexArgs();

    // Storage-related switches.
    defaults.storePositions = true;
    defaults.storeDocvectors = true;
    defaults.storeContents = true;
    defaults.storeRaw = true;

    // Run-behaviour switches.
    defaults.optimize = true;
    defaults.quiet = true;

    return defaults;
}
Also used : IndexArgs(io.anserini.index.IndexArgs)

Example 4 with IndexArgs

use of io.anserini.index.IndexArgs in project Anserini by castorini.

the class EndToEndTest method setUp.

/**
 * Builds an index before each test. The settings returned by {@code getIndexArgs()} are
 * translated into command-line arguments and handed to {@code IndexCollection.main}, so the
 * indexing run goes through the same entry point as a command-line invocation.
 *
 * @throws Exception if the superclass set-up or the indexing run fails
 */
@Override
@Before
public void setUp() throws Exception {
    Locale.setDefault(Locale.US);
    // Each test gets its own index.
    super.setUp();
    indexPath = "test-index" + RANDOM.nextInt(100000);
    cleanup.clear();

    // Subclasses override getIndexArgs() to supply their own settings.
    IndexArgs settings = getIndexArgs();

    // For an end-to-end test, the settings are turned back into command-line parameters for main.
    List<String> cli = new ArrayList<>(List.of(
        "-index", indexPath,
        "-input", settings.input,
        "-threads", "2",
        "-language", settings.language,
        "-collection", settings.collectionClass,
        "-generator", settings.generatorClass));

    if (settings.tweetMaxId != Long.MAX_VALUE) {
        cli.add("-tweet.maxId");
        cli.add(String.valueOf(settings.tweetMaxId));
    }
    if (settings.whitelist != null) {
        cli.add("-whitelist");
        cli.add(settings.whitelist);
    }

    // Storage flags.
    if (settings.storePositions) {
        cli.add("-storePositions");
    }
    if (settings.storeDocvectors) {
        cli.add("-storeDocvectors");
    }
    if (settings.storeContents) {
        cli.add("-storeContents");
    }
    if (settings.storeRaw) {
        cli.add("-storeRaw");
    }

    // Stopword handling.
    if (settings.keepStopwords) {
        cli.add("-keepStopwords");
    }
    if (settings.stopwords != null) {
        cli.add("-stopwords");
        cli.add(settings.stopwords);
    }

    if (settings.optimize) {
        cli.add("-optimize");
    }
    if (settings.quiet) {
        cli.add("-quiet");
    }

    // Shard parameters are only passed when more than one shard is configured.
    if (settings.shardCount > 1) {
        cli.add("-shard.count");
        cli.add(String.valueOf(settings.shardCount));
        cli.add("-shard.current");
        cli.add(String.valueOf(settings.shardCurrent));
    }
    if (settings.pretokenized) {
        cli.add("-pretokenized");
    }

    String[] argv = cli.toArray(new String[0]);
    IndexCollection.main(argv);
}
Also used : IndexArgs(io.anserini.index.IndexArgs) ArrayList(java.util.ArrayList) Before(org.junit.Before)

Example 5 with IndexArgs

use of io.anserini.index.IndexArgs in project Anserini by castorini.

the class TrecEndToEndPassageTest method getIndexArgs.

/**
 * Supplies the indexing settings for this test: the default arguments from
 * {@code createDefaultIndexArgs()}, with the input set to the {@code collection3} TREC sample
 * directory and the collection class set to {@code TrecCollection}.
 *
 * @return the configured {@code IndexArgs}
 */
@Override
protected IndexArgs getIndexArgs() {
    IndexArgs trecArgs = createDefaultIndexArgs();
    trecArgs.collectionClass = TrecCollection.class.getSimpleName();
    trecArgs.input = "src/test/resources/sample_docs/trec/collection3";
    return trecArgs;
}
Also used : TrecCollection(io.anserini.collection.TrecCollection) IndexArgs(io.anserini.index.IndexArgs)

Aggregations

IndexArgs (io.anserini.index.IndexArgs): 22; TrecCollection (io.anserini.collection.TrecCollection): 6; CoreCollection (io.anserini.collection.CoreCollection): 3; JsonCollection (io.anserini.collection.JsonCollection): 3; DefaultLuceneDocumentGenerator (io.anserini.index.generator.DefaultLuceneDocumentGenerator): 3; Before (org.junit.Before): 3; ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 2; ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode): 2; AclAnthology (io.anserini.collection.AclAnthology): 2; AclAnthologyGenerator (io.anserini.index.generator.AclAnthologyGenerator): 2; CoreGenerator (io.anserini.index.generator.CoreGenerator): 2; BibtexCollection (io.anserini.collection.BibtexCollection): 1; C4Collection (io.anserini.collection.C4Collection): 1; JsonVectorCollection (io.anserini.collection.JsonVectorCollection): 1; TweetCollection (io.anserini.collection.TweetCollection): 1; IndexCollection (io.anserini.index.IndexCollection): 1; BibtexGenerator (io.anserini.index.generator.BibtexGenerator): 1; C4Generator (io.anserini.index.generator.C4Generator): 1; TweetGenerator (io.anserini.index.generator.TweetGenerator): 1; SearchSolr (io.anserini.search.SearchSolr): 1