Search in sources :

Example 1 with NutchField

use of org.apache.nutch.indexer.NutchField in project nutch-elasticsearch-indexer by ctjmorgan.

the class ElasticsearchWriter method write.

/**
 * Sends one Nutch document to the "nutch" index under type "index", using a
 * freshly generated random UUID as the document id.
 *
 * <p>Each field's values are copied into the request source. {@code Date}
 * values are first converted to text with the thread-local date format. A
 * field with exactly one value is stored as that value; a field with several
 * values is stored as a list so that later values no longer overwrite earlier
 * ones. A field with no values is omitted.
 *
 * @param doc the document whose fields are indexed
 * @throws IOException declared by the method signature
 */
@Override
public void write(NutchDocument doc) throws IOException {
    // The id is random, so the request never targets an id chosen by the caller.
    String uuid = UUID.randomUUID().toString();
    IndexRequestBuilder response = client.prepareIndex("nutch", "index", uuid);
    Map<String, Object> mp = new HashMap<String, Object>();
    for (final Entry<String, NutchField> e : doc) {
        // Fully qualified to avoid depending on imports outside this method.
        final java.util.List<Object> normalised = new java.util.ArrayList<Object>();
        for (final Object val : e.getValue().getValues()) {
            if (val instanceof Date) {
                // normalise the string representation for a Date
                normalised.add(DateUtil.getThreadLocalDateFormat().format(val));
            } else {
                normalised.add(val);
            }
        }
        if (normalised.size() == 1) {
            // Single-valued fields stay scalar, exactly as before.
            mp.put(e.getKey(), normalised.get(0));
        } else if (normalised.size() > 1) {
            // Multi-valued fields keep every value instead of only the last one.
            mp.put(e.getKey(), normalised);
        }
    }
    // insert the document into elasticsearch; the return value of execute() is not inspected
    response.setSource(mp);
    response.execute();
}
Also used : IndexRequestBuilder(org.elasticsearch.client.action.index.IndexRequestBuilder) NutchField(org.apache.nutch.indexer.NutchField) HashMap(java.util.HashMap) Date(java.util.Date)

Example 2 with NutchField

use of org.apache.nutch.indexer.NutchField in project nutch by apache.

the class JexlIndexingFilter method filter.

/**
 * Decides whether a document is kept by evaluating the configured JEXL
 * expression against a context built from the crawl datum, the parse result
 * and the document itself.
 *
 * <p>The context exposes the datum's status, fetch/modified times, retries,
 * interval, score and signature; the URL, parsed text and title; a nested
 * {@code httpStatus} context; the document, content and parse metadata; and a
 * nested {@code doc} context mapping each field name to its list of values.
 *
 * @param doc the document being filtered
 * @param parse the parse result supplying text, title, status and metadata
 * @param url the URL of the document
 * @param datum the crawl datum supplying fetch-related values
 * @param inlinks not read by this method
 * @return {@code doc} when the expression evaluates to {@code Boolean.TRUE};
 *         {@code null} otherwise, including when evaluation throws
 * @throws IndexingException declared by the method signature
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // Create a context and add data
    JexlContext jcontext = new MapContext();
    jcontext.set("status", CrawlDatum.getStatusName(datum.getStatus()));
    jcontext.set("fetchTime", (long) (datum.getFetchTime()));
    jcontext.set("modifiedTime", (long) (datum.getModifiedTime()));
    jcontext.set("retries", datum.getRetriesSinceFetch());
    // Integer.valueOf replaces the deprecated Integer constructor.
    jcontext.set("interval", Integer.valueOf(datum.getFetchInterval()));
    jcontext.set("score", datum.getScore());
    jcontext.set("signature", StringUtil.toHexString(datum.getSignature()));
    jcontext.set("url", url.toString());
    jcontext.set("text", parse.getText());
    jcontext.set("title", parse.getData().getTitle());

    // Parse status is exposed as a nested context under "httpStatus".
    JexlContext httpStatusContext = new MapContext();
    httpStatusContext.set("majorCode", parse.getData().getStatus().getMajorCode());
    httpStatusContext.set("minorCode", parse.getData().getStatus().getMinorCode());
    httpStatusContext.set("message", parse.getData().getStatus().getMessage());
    jcontext.set("httpStatus", httpStatusContext);

    jcontext.set("documentMeta", metadataToContext(doc.getDocumentMeta()));
    jcontext.set("contentMeta", metadataToContext(parse.getData().getContentMeta()));
    jcontext.set("parseMeta", metadataToContext(parse.getData().getParseMeta()));

    // Document fields are exposed under "doc", each name mapped to its value list.
    JexlContext context = new MapContext();
    for (Entry<String, NutchField> entry : doc) {
        context.set(entry.getKey(), entry.getValue().getValues());
    }
    jcontext.set("doc", context);

    try {
        if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
            return doc;
        }
    } catch (Exception e) {
        // Evaluation failures are logged and fall through to the null return below.
        LOG.warn("Failed evaluating JEXL {}", expr.getExpression(), e);
    }
    return null;
}
Also used : NutchField(org.apache.nutch.indexer.NutchField) JexlContext(org.apache.commons.jexl2.JexlContext) MapContext(org.apache.commons.jexl2.MapContext) IndexingException(org.apache.nutch.indexer.IndexingException)

Example 3 with NutchField

use of org.apache.nutch.indexer.NutchField in project nutch by apache.

the class ElasticIndexWriter method write.

/**
 * Builds an index request from the document's fields and hands it to the
 * bulk processor.
 *
 * <p>The request id is the document's "id" field value. The type is read from
 * the document metadata key "type" and falls back to "doc" when absent. A
 * field with more than one value is stored as the list of values; a field with
 * one value is stored as that value; a field with no values is skipped.
 *
 * @param doc the document to index
 * @throws IOException declared by the method signature
 */
@Override
public void write(NutchDocument doc) throws IOException {
    String id = (String) doc.getFieldValue("id");
    String type = doc.getDocumentMeta().get("type");
    if (type == null) {
        type = "doc";
    }
    // Add each field of this doc to the index source
    Map<String, Object> source = new HashMap<String, Object>();
    for (final Map.Entry<String, NutchField> e : doc) {
        final List<Object> values = e.getValue().getValues();
        if (values.isEmpty()) {
            // Nothing to index; values.get(0) would throw IndexOutOfBoundsException.
            continue;
        }
        if (values.size() > 1) {
            source.put(e.getKey(), values);
        } else {
            source.put(e.getKey(), values.get(0));
        }
    }
    IndexRequest request = new IndexRequest(defaultIndex, type, id).source(source);
    bulkProcessor.add(request);
}
Also used : NutchField(org.apache.nutch.indexer.NutchField) HashMap(java.util.HashMap) IndexRequest(org.elasticsearch.action.index.IndexRequest) Map(java.util.Map)

Example 4 with NutchField

use of org.apache.nutch.indexer.NutchField in project nutch by apache.

the class SolrIndexWriter method write.

/**
 * Converts a Nutch document into a Solr input document, queues it, and pushes
 * the queue once the batch threshold is reached.
 *
 * <p>For each field value: {@code Date} values are formatted with the
 * thread-local date format; values of the "content" and "title" fields are
 * passed through {@code SolrUtils.stripNonCharCodepoints}. The result is added
 * under the mapped key with the field's weight. When the copy-key mapping
 * yields a name different from the field's own key, the original value is also
 * added under that name.
 *
 * @param doc the document to queue for indexing
 * @throws IOException declared by the method signature
 */
public void write(NutchDocument doc) throws IOException {
    final SolrInputDocument inputDoc = new SolrInputDocument();
    for (final Entry<String, NutchField> e : doc) {
        final String key = e.getKey();
        final boolean stripNonChars = key.equals("content") || key.equals("title");
        for (final Object val : e.getValue().getValues()) {
            // normalise the string representation for a Date
            Object val2 = val;
            if (val instanceof Date) {
                val2 = DateUtil.getThreadLocalDateFormat().format(val);
            }
            if (stripNonChars) {
                val2 = SolrUtils.stripNonCharCodepoints((String) val);
            }
            inputDoc.addField(solrMapping.mapKey(key), val2, e.getValue().getWeight());
            String sCopy = solrMapping.mapCopyKey(key);
            // Compare string contents, not references: an equal but distinct
            // String instance must not be treated as a different copy target.
            if (!key.equals(sCopy)) {
                inputDoc.addField(sCopy, val);
            }
        }
    }
    inputDoc.setDocumentBoost(doc.getWeight());
    inputDocs.add(inputDoc);
    totalAdds++;
    // Pending adds and deletes together count toward the batch threshold.
    if (inputDocs.size() + numDeletes >= batchSize) {
        push();
    }
}
Also used : SolrInputDocument(org.apache.solr.common.SolrInputDocument) NutchField(org.apache.nutch.indexer.NutchField) Date(java.util.Date)

Example 5 with NutchField

use of org.apache.nutch.indexer.NutchField in project nutch by apache.

the class TLDScoringFilter method indexerScore.

/**
 * Scales the initial score by the boosts of the document's "tld" field values.
 *
 * <p>Each value of the "tld" field is looked up in {@code tldEntries} by its
 * string form; the boost of every entry found is multiplied into a factor
 * that starts at 1.0. Without a "tld" field the factor stays at 1.0.
 *
 * @param url not read by this method
 * @param doc the document whose "tld" field is inspected
 * @param dbDatum not read by this method
 * @param fetchDatum not read by this method
 * @param parse not read by this method
 * @param inlinks not read by this method
 * @param initScore the score to scale
 * @return {@code initScore} multiplied by the accumulated boost factor
 * @throws ScoringFilterException declared by the method signature
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
    float factor = 1.0f;
    final NutchField tldField = doc.getField("tld");
    if (tldField == null) {
        // No "tld" field: the factor remains neutral.
        return initScore * factor;
    }
    for (Object value : tldField.getValues()) {
        final DomainSuffix suffix = tldEntries.get(value.toString());
        if (suffix == null) {
            // Unknown suffixes contribute no boost.
            continue;
        }
        factor *= suffix.getBoost();
    }
    return initScore * factor;
}
Also used : NutchField(org.apache.nutch.indexer.NutchField) DomainSuffix(org.apache.nutch.util.domain.DomainSuffix)

Aggregations

NutchField (org.apache.nutch.indexer.NutchField)8 Date (java.util.Date)3 HashMap (java.util.HashMap)2 Map (java.util.Map)2 JSONException (com.amazonaws.util.json.JSONException)1 JSONObject (com.amazonaws.util.json.JSONObject)1 URL (java.net.URL)1 ArrayList (java.util.ArrayList)1 LinkedHashMap (java.util.LinkedHashMap)1 List (java.util.List)1 Pattern (java.util.regex.Pattern)1 JexlContext (org.apache.commons.jexl2.JexlContext)1 MapContext (org.apache.commons.jexl2.MapContext)1 Text (org.apache.hadoop.io.Text)1 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)1 Inlink (org.apache.nutch.crawl.Inlink)1 Inlinks (org.apache.nutch.crawl.Inlinks)1 IndexingException (org.apache.nutch.indexer.IndexingException)1 NutchDocument (org.apache.nutch.indexer.NutchDocument)1 Outlink (org.apache.nutch.parse.Outlink)1