use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestElasticIndexWriter method testBackoffPolicy.
/**
 * Exercises the writer with a non-zero retry budget while the simulated
 * cluster is flagged as saturated, and expects the bulk request to be
 * reported as successful once the writer has been closed.
 *
 * @throws IOException propagated from the calls made in the test body
 */
@Test
public void testBackoffPolicy() throws IOException {
  // Configure a non-zero "max-retry" value; the saturation flag is raised further down.
  maxNumFailures = 5;
  conf.setInt(ElasticConstants.EXPONENTIAL_BACKOFF_RETRIES, maxNumFailures);

  final int bulkSize = 10;
  conf.setInt(ElasticConstants.MAX_BULK_DOCS, bulkSize);

  // The returned handle is never read; only the factory call itself is kept.
  Job jobHandle = Job.getInstance(conf);

  testIndexWriter.setConf(conf);
  testIndexWriter.open(conf, "name");

  NutchDocument sample = new NutchDocument();
  sample.add("id", "http://www.example.com");

  // Pretend the remote cluster is "saturated" before anything is written.
  clusterSaturated = true;
  Assert.assertFalse(bulkRequestSuccessful);

  // Write as many documents as MAX_BULK_DOCS was set to, enough for one bulk request.
  int remaining = bulkSize;
  while (remaining > 0) {
    testIndexWriter.write(sample);
    remaining--;
  }
  testIndexWriter.close();

  // Expectation carried over from the original test: the BulkProcessor retried
  // `maxNumFailures + 1` times and then succeeded. Only the final success flag
  // is actually asserted here.
  Assert.assertTrue(bulkRequestSuccessful);
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestElasticIndexWriter method testBulkMaxLength.
/**
 * Lowers MAX_BULK_LENGTH to a tenth of its configured value (falling back to
 * 2500500 when unset), writes a number of documents derived from that limit,
 * and expects the bulk request to be reported as successful after close().
 *
 * @throws IOException propagated from the calls made in the test body
 */
@Test
public void testBulkMaxLength() throws IOException {
  final String fieldName = "id";
  final String fieldValue = "http://www.example.com";

  // Check that MAX_BULK_LENGTH is honoured by shrinking it 10x.
  final int configuredLimit = conf.getInt(ElasticConstants.MAX_BULK_LENGTH, 2500500);
  final int reducedLimit = configuredLimit / 10;
  conf.setInt(ElasticConstants.MAX_BULK_LENGTH, reducedLimit);

  // Somewhat arbitrary document count, derived from:
  // - the reduced bulk length
  // - the approximate size of one document (field name plus value characters)
  final int approxDocSize = fieldName.length() + fieldValue.length();
  final int docCount = reducedLimit / approxDocSize;

  // The returned handle is never read; only the factory call itself is kept.
  Job jobHandle = Job.getInstance(conf);

  testIndexWriter.setConf(conf);
  testIndexWriter.open(conf, "name");

  NutchDocument sample = new NutchDocument();
  sample.add(fieldName, fieldValue);

  Assert.assertFalse(bulkRequestSuccessful);

  int written = 0;
  while (written < docCount) {
    testIndexWriter.write(sample);
    written++;
  }
  testIndexWriter.close();

  Assert.assertTrue(bulkRequestSuccessful);
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class MimeTypeIndexingFilterTest method testAllowOnlyImages.
/**
 * Points the filter at "allow-images.txt" and runs every entry of
 * {@code parses} through it: entries whose MIME_TYPES value contains
 * "image" must yield a document, all others must yield null.
 *
 * @throws Exception propagated from the filter call
 */
@Test
public void testAllowOnlyImages() throws Exception {
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
  filter.setConf(conf);

  for (int idx = 0; idx != parses.length; ++idx) {
    // Fresh arguments are built for every parse, in the same order as the original call.
    NutchDocument input = new NutchDocument();
    Text url = new Text("http://www.example.com/");
    NutchDocument filtered = filter.filter(input, parses[idx], url, new CrawlDatum(), new Inlinks());

    // Anything that is not an image type must have been rejected (null result).
    if (!MIME_TYPES[idx].contains("image")) {
      Assert.assertNull("Block everything else", filtered);
      continue;
    }
    Assert.assertNotNull("Allow only images", filtered);
  }
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class MimeTypeIndexingFilterTest method testBlockHTML.
/**
 * Points the filter at "block-html.txt" and runs every entry of
 * {@code parses} through it: entries whose MIME_TYPES value contains
 * "html" must yield null, all others must yield a document.
 *
 * @throws Exception propagated from the filter call
 */
@Test
public void testBlockHTML() throws Exception {
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
  filter.setConf(conf);

  for (int idx = 0; idx != parses.length; ++idx) {
    // Fresh arguments are built for every parse, in the same order as the original call.
    NutchDocument input = new NutchDocument();
    Text url = new Text("http://www.example.com/");
    NutchDocument filtered = filter.filter(input, parses[idx], url, new CrawlDatum(), new Inlinks());

    // Non-HTML types must pass through the filter (non-null result).
    if (!MIME_TYPES[idx].contains("html")) {
      Assert.assertNotNull("Allow everything else", filtered);
      continue;
    }
    Assert.assertNull("Block only HTML documents", filtered);
  }
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class MimeTypeIndexingFilterTest method testMissingConfigFile.
/**
 * Verifies the filter's behaviour when the regex-file property is absent:
 * the property must read back as the empty default, and every entry of
 * {@code parses} must then come out of the filter as a non-null document.
 *
 * @throws Exception propagated from the filter call
 */
@Test
public void testMissingConfigFile() throws Exception {
  // Precondition: the property is unset, so the "" default is what comes back.
  String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "");
  Assert.assertEquals(
      String.format("Property %s must not be present in the configuration file",
          MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE),
      "", file);

  filter.setConf(conf);

  // The property is not set, so in this case all documents must pass the filter.
  for (int i = 0; i < parses.length; i++) {
    NutchDocument doc = filter.filter(new NutchDocument(), parses[i],
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertNotNull("All documents must be allowed by default", doc);
  }
}
Aggregations