Search in sources :

Example 6 with NutchField

use of org.apache.nutch.indexer.NutchField in project nutch by apache.

the class TestLinksIndexingFilter method testIndexOnlyHostPart.

/**
 * Checks the filter's output when all three host-related options
 * (inlinks host, outlinks host, only hosts) are set to "true".
 * <p>
 * Expectations asserted below: the first indexed outlink equals the host of
 * "http://www.test.com", exactly one inlink value is indexed, and that inlink
 * value is the same host.
 */
@Test
public void testIndexOnlyHostPart() throws Exception {
    // Switch on every host-related option before handing the configuration to the filter.
    String[] hostOptions = { LinksIndexingFilter.LINKS_INLINKS_HOST, LinksIndexingFilter.LINKS_OUTLINKS_HOST, LinksIndexingFilter.LINKS_ONLY_HOSTS };
    for (String option : hostOptions) {
        conf.set(option, "true");
    }
    filter.setConf(conf);

    Outlink[] outlinks = generateOutlinks(true);

    // Two inlinks share the www.test.com host; the third comes from the same
    // host as the document URL used below.
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", "example"));

    ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks, metadata);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", parseData), new Text("http://www.example.com/"), new CrawlDatum(), inlinks);

    String expectedHost = new URL("http://www.test.com").getHost();

    NutchField outlinkField = doc.getField("outlinks");
    Object firstOutlink = outlinkField.getValues().get(0);
    Assert.assertEquals("Only the host portion of the outlink URL must be indexed", expectedHost, firstOutlink);

    Assert.assertEquals("The inlinks coming from the same host must count only once", 1, doc.getField("inlinks").getValues().size());
    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", expectedHost, doc.getFieldValue("inlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchField(org.apache.nutch.indexer.NutchField) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) Test(org.junit.Test)

Example 7 with NutchField

use of org.apache.nutch.indexer.NutchField in project nutch by apache.

the class ReplaceIndexer method doReplace.

/**
 * Iterates through the replacement map provided, to update the fields in the
 * Nutch Document.
 * <p>
 * Every String value of the key field is tested against each pattern in
 * {@code replaceMap} with {@code find()}. For each pattern that matches, the
 * associated replacers are applied in list order. A replacer is applied only
 * if the document contains the replacer's source field: the String values of
 * that field are passed through {@code replace}, the target field is removed,
 * and the replaced values are added under the target field name. Non-String
 * values of the source field are skipped and never written to the target
 * field.
 *
 * @param doc
 *          the document we are modifying; nothing happens when it is
 *          {@code null}
 * @param keyName
 *          either "host" or "url" -- the field that determines the
 *          replacement set used
 * @param replaceMap
 *          maps a pattern to the list of FieldReplacers applied when the
 *          pattern matches a value of the key field; nothing happens when it
 *          is {@code null} or empty
 */
private void doReplace(NutchDocument doc, String keyName, Map<Pattern, List<FieldReplacer>> replaceMap) {
    if (doc == null || replaceMap == null || replaceMap.isEmpty()) {
        return;
    }
    Collection<String> docFieldNames = doc.getFieldNames();
    NutchField keyField = doc.getField(keyName);
    if (keyField == null) {
        // This document doesn't have the key field; no work to do.
        return;
    }
    // For every value of the keyField (one expected). An empty value list
    // simply means the loop body never runs.
    for (Object oKeyFieldValue : keyField.getValues()) {
        // instanceof is false for null, so no separate null test is needed.
        if (!(oKeyFieldValue instanceof String)) {
            continue;
        }
        String keyFieldValue = (String) oKeyFieldValue;
        // For each pattern that we have a replacement list for...
        for (Map.Entry<Pattern, List<FieldReplacer>> entry : replaceMap.entrySet()) {
            // Skip replacement sets whose pattern does not match this key.
            if (!entry.getKey().matcher(keyFieldValue).find()) {
                continue;
            }
            // For each field we will replace for this key...
            for (FieldReplacer fp : entry.getValue()) {
                String fieldName = fp.getFieldName();
                // Does this document contain the FieldReplacer's field?
                if (!docFieldNames.contains(fieldName)) {
                    continue;
                }
                List<Object> fieldValues = doc.getField(fieldName).getValues();
                // Collect all replaced values before touching the document,
                // because the target field may be the same as the source field.
                List<String> newFieldValues = new ArrayList<String>(fieldValues.size());
                for (Object oFieldValue : fieldValues) {
                    if (oFieldValue instanceof String) {
                        newFieldValues.add(fp.replace((String) oFieldValue));
                    }
                }
                // Remove the target field and add our replaced values.
                String targetFieldName = fp.getToFieldName();
                doc.removeField(targetFieldName);
                for (String newFieldValue : newFieldValues) {
                    doc.add(targetFieldName, newFieldValue);
                }
            }
        }
    }
}
Also used : Pattern(java.util.regex.Pattern) ArrayList(java.util.ArrayList) NutchField(org.apache.nutch.indexer.NutchField) ArrayList(java.util.ArrayList) List(java.util.List) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 8 with NutchField

use of org.apache.nutch.indexer.NutchField in project nutch by apache.

the class CloudSearchIndexWriter method write.

/**
 * Builds a JSON "add" operation for the given document and passes it to
 * {@code addToBatch} together with the document's url string.
 * <p>
 * The operation carries the keys "type" ("add"), "id" (derived from the url
 * via {@code CloudSearchUtils.getID}) and "fields". Field names are run
 * through {@code cleanFieldName}; a field with no entry in {@code csfields}
 * is skipped with an info log unless {@code dumpBatchFilesToTemp} is set.
 * A {@code JSONException} raised while building the object is logged and not
 * rethrown, in which case nothing is added to the batch.
 *
 * @param doc
 *          the document to convert
 * @throws IOException
 *           declared by the signature; no statement visible in this method
 *           throws it directly
 */
@Override
public void write(NutchDocument doc) throws IOException {
    try {
        // NOTE(review): this dereferences the "url" field without a null
        // check and uses the field's toString() form -- confirm both are intended.
        String url = doc.getField("url").toString();
        // generate the id from the url
        String docId = CloudSearchUtils.getID(url);

        JSONObject fields = new JSONObject();
        for (final Entry<String, NutchField> field : doc) {
            String fieldname = cleanFieldName(field.getKey());
            String type = csfields.get(fieldname);
            // undefined in index
            boolean undefinedInIndex = type == null;
            if (undefinedInIndex && !dumpBatchFilesToTemp) {
                LOG.info("Field {} not defined in CloudSearch domain for {} - skipping.", fieldname, url);
                continue;
            }
            // write the values
            for (Object raw : field.getValue().getValues()) {
                Object normalised = raw;
                if (raw instanceof Date) {
                    // Dates are rendered through the shared DATE_FORMAT.
                    normalised = DATE_FORMAT.format((Date) raw);
                } else if (raw instanceof String) {
                    // normalise strings
                    normalised = CloudSearchUtils.stripNonCharCodepoints((String) raw);
                }
                fields.accumulate(fieldname, normalised);
            }
        }

        // Assemble the operation envelope once all field values are collected.
        JSONObject addOperation = new JSONObject();
        addOperation.put("type", "add");
        addOperation.put("id", docId);
        addOperation.put("fields", fields);
        addToBatch(addOperation.toString(2), url);
    } catch (JSONException jsonFailure) {
        LOG.error("Exception caught while building JSON object", jsonFailure);
    }
}
Also used : JSONObject(com.amazonaws.util.json.JSONObject) NutchField(org.apache.nutch.indexer.NutchField) JSONException(com.amazonaws.util.json.JSONException) JSONObject(com.amazonaws.util.json.JSONObject) Date(java.util.Date)

Aggregations

NutchField (org.apache.nutch.indexer.NutchField)8 Date (java.util.Date)3 HashMap (java.util.HashMap)2 Map (java.util.Map)2 JSONException (com.amazonaws.util.json.JSONException)1 JSONObject (com.amazonaws.util.json.JSONObject)1 URL (java.net.URL)1 ArrayList (java.util.ArrayList)1 LinkedHashMap (java.util.LinkedHashMap)1 List (java.util.List)1 Pattern (java.util.regex.Pattern)1 JexlContext (org.apache.commons.jexl2.JexlContext)1 MapContext (org.apache.commons.jexl2.MapContext)1 Text (org.apache.hadoop.io.Text)1 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)1 Inlink (org.apache.nutch.crawl.Inlink)1 Inlinks (org.apache.nutch.crawl.Inlinks)1 IndexingException (org.apache.nutch.indexer.IndexingException)1 NutchDocument (org.apache.nutch.indexer.NutchDocument)1 Outlink (org.apache.nutch.parse.Outlink)1