Use of org.apache.nutch.indexer.NutchField in the Apache Nutch project.
Class TestLinksIndexingFilter, method testIndexOnlyHostPart.
/**
 * Runs the filter with the inlinks-host, outlinks-host and only-hosts options
 * all set to "true" and checks that the indexed "outlinks" and "inlinks"
 * fields carry host names rather than full URLs, with the single "inlinks"
 * value being the host shared by the two www.test.com inlinks.
 */
@Test
public void testIndexOnlyHostPart() throws Exception {
    // Switch on every host-related option before handing the configuration over.
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    filter.setConf(conf);

    Outlink[] outlinks = generateOutlinks(true);

    // Two inlinks share the www.test.com host; the third comes from www.example.com.
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", "example"));

    // Build the filter inputs one by one instead of inside a single call expression.
    NutchDocument emptyDoc = new NutchDocument();
    ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks, metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text pageUrl = new Text("http://www.example.com/");
    CrawlDatum datum = new CrawlDatum();

    NutchDocument doc = filter.filter(emptyDoc, parse, pageUrl, datum, inlinks);

    String expectedHost = new URL("http://www.test.com").getHost();

    NutchField docOutlinks = doc.getField("outlinks");
    Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
        expectedHost, docOutlinks.getValues().get(0));

    NutchField docInlinks = doc.getField("inlinks");
    Assert.assertEquals("The inlinks coming from the same host must count only once",
        1, docInlinks.getValues().size());
    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
        expectedHost, doc.getFieldValue("inlinks"));
}
Use of org.apache.nutch.indexer.NutchField in the Apache Nutch project.
Class ReplaceIndexer, method doReplace.
/**
 * Applies the configured field replacements to a Nutch Document.
 * <p>
 * Each String value of the key field is tested against every pattern in the
 * replacement map. For each pattern that is found in the value, every
 * associated FieldReplacer whose source field exists in the document is run
 * over that field's String values, and the replacer's target field is then
 * rebuilt from the results. Non-String values are skipped.
 *
 * @param doc
 *          the document we are modifying; a null document is ignored
 * @param keyName
 *          either "host" or "url" -- the field that determines the
 *          replacement set used
 * @param replaceMap
 *          the FieldReplacers that apply to this keyName, grouped by the
 *          pattern that selects them; an empty map means nothing to do
 */
private void doReplace(NutchDocument doc, String keyName, Map<Pattern, List<FieldReplacer>> replaceMap) {
    if (doc == null || replaceMap.isEmpty()) {
        return;
    }

    Collection<String> presentFieldNames = doc.getFieldNames();

    NutchField selectorField = doc.getField(keyName);
    if (selectorField == null) {
        // No key field on this document, so no replacement set can be selected.
        return;
    }

    List<Object> selectorValues = selectorField.getValues();
    if (selectorValues.isEmpty()) {
        // The key field exists but holds no values to match against.
        return;
    }

    // Normally a single value is expected here, but every value is honoured.
    for (Object rawSelector : selectorValues) {
        if (!(rawSelector instanceof String)) {
            continue;
        }
        String selector = (String) rawSelector;

        for (Map.Entry<Pattern, List<FieldReplacer>> candidate : replaceMap.entrySet()) {
            // Only replacement sets whose pattern occurs in the key value apply.
            if (!candidate.getKey().matcher(selector).find()) {
                continue;
            }

            for (FieldReplacer replacer : candidate.getValue()) {
                String sourceFieldName = replacer.getFieldName();
                if (!presentFieldNames.contains(sourceFieldName)) {
                    // The replacer's source field is absent from this document.
                    continue;
                }

                // Collect the replaced values first, before the target field is touched.
                List<String> replacedValues = new ArrayList<String>();
                for (Object rawValue : doc.getField(sourceFieldName).getValues()) {
                    if (rawValue instanceof String) {
                        replacedValues.add(replacer.replace((String) rawValue));
                    }
                }

                // Drop whatever the target field held and repopulate it.
                String targetFieldName = replacer.getToFieldName();
                doc.removeField(targetFieldName);
                for (String replaced : replacedValues) {
                    doc.add(targetFieldName, replaced);
                }
            }
        }
    }
}
Use of org.apache.nutch.indexer.NutchField in the Apache Nutch project.
Class CloudSearchIndexWriter, method write.
/**
 * Builds a JSON "add" operation for the given document and passes it to
 * {@code addToBatch}.
 * <p>
 * The operation id is derived from the string form of the document's "url"
 * field. Fields whose cleaned name has no entry in {@code csfields} are
 * skipped (and logged) unless {@code dumpBatchFilesToTemp} is set. Date
 * values are formatted with {@code DATE_FORMAT}; String values are passed
 * through {@code CloudSearchUtils.stripNonCharCodepoints}; other values are
 * added unchanged.
 *
 * @param doc
 *          the document to serialise
 * @throws IOException
 *           declared by the method signature; a JSONException raised while
 *           building the JSON is caught and logged here, not rethrown
 */
@Override
public void write(NutchDocument doc) throws IOException {
    try {
        JSONObject operation = new JSONObject();
        operation.put("type", "add");

        // The id is generated from the url field's string representation.
        String url = doc.getField("url").toString();
        String docId = CloudSearchUtils.getID(url);
        operation.put("id", docId);

        JSONObject fields = new JSONObject();
        for (final Entry<String, NutchField> field : doc) {
            String fieldname = cleanFieldName(field.getKey());
            String type = csfields.get(fieldname);

            // A field with no type entry is only kept when dumping batch files to temp.
            boolean undefinedInIndex = (type == null);
            if (undefinedInIndex && !dumpBatchFilesToTemp) {
                LOG.info("Field {} not defined in CloudSearch domain for {} - skipping.", fieldname, url);
                continue;
            }

            for (Object rawValue : field.getValue().getValues()) {
                Object jsonValue = rawValue;
                if (rawValue instanceof Date) {
                    // Dates are written in their DATE_FORMAT text form.
                    jsonValue = DATE_FORMAT.format((Date) rawValue);
                } else if (rawValue instanceof String) {
                    // Strings are normalised before being written.
                    jsonValue = CloudSearchUtils.stripNonCharCodepoints((String) rawValue);
                }
                fields.accumulate(fieldname, jsonValue);
            }
        }

        operation.put("fields", fields);
        addToBatch(operation.toString(2), url);
    } catch (JSONException jsonFailure) {
        LOG.error("Exception caught while building JSON object", jsonFailure);
    }
}
Aggregations