use of org.apache.nutch.indexer.NutchField in project nutch-elasticsearch-indexer by ctjmorgan.
the class ElasticsearchWriter method write.
/**
 * Indexes a single Nutch document into Elasticsearch.
 *
 * <p>The document is stored in index {@code "nutch"} with type {@code "index"}
 * under a freshly generated random UUID. {@link Date} values are converted to
 * their string form via {@code DateUtil.getThreadLocalDateFormat()}.
 *
 * <p>A field with exactly one value is stored as that value; a field with
 * several values is stored as a list so that no value is lost. Fields without
 * any value are omitted.
 *
 * @param doc the document whose fields are copied into the index request
 * @throws IOException declared by the writer contract
 */
@Override
public void write(NutchDocument doc) throws IOException {
    // A new random id is generated for every call.
    String uuid = UUID.randomUUID().toString();
    IndexRequestBuilder request = client.prepareIndex("nutch", "index", uuid);

    Map<String, Object> source = new HashMap<String, Object>();
    for (final Entry<String, NutchField> e : doc) {
        // Gather all values of the field first: putting each value straight into
        // the map under the same key would keep only the last one.
        // (Fully-qualified names avoid relying on imports not visible here.)
        final java.util.List<Object> fieldValues = new java.util.ArrayList<Object>();
        for (final Object val : e.getValue().getValues()) {
            if (val instanceof Date) {
                // normalise the string representation for a Date
                fieldValues.add(DateUtil.getThreadLocalDateFormat().format(val));
            } else {
                fieldValues.add(val);
            }
        }

        if (fieldValues.size() == 1) {
            source.put(e.getKey(), fieldValues.get(0));
        } else if (!fieldValues.isEmpty()) {
            source.put(e.getKey(), fieldValues);
        }
    }

    // insert the document into elasticsearch
    // NOTE(review): the value returned by execute() is not inspected here, so a
    // failed index operation would go unnoticed by this method — confirm intended.
    request.setSource(source);
    request.execute();
}
use of org.apache.nutch.indexer.NutchField in project nutch by apache.
the class JexlIndexingFilter method filter.
/**
 * Decides whether a document is kept for indexing by evaluating the configured
 * JEXL expression against data taken from the crawl datum, the parse result and
 * the document itself.
 *
 * <p>Variables exposed to the expression: {@code status}, {@code fetchTime},
 * {@code modifiedTime}, {@code retries}, {@code interval}, {@code score},
 * {@code signature}, {@code url}, {@code text}, {@code title},
 * {@code httpStatus} (with {@code majorCode}, {@code minorCode},
 * {@code message}), {@code documentMeta}, {@code contentMeta},
 * {@code parseMeta} and {@code doc} (field name to list of values).
 *
 * @param doc     the document being filtered
 * @param parse   parse result supplying text, title, status and metadata
 * @param url     URL of the document
 * @param datum   crawl datum supplying fetch-related values
 * @param inlinks not used by this method
 * @return {@code doc} if the expression evaluates to {@code Boolean.TRUE};
 *         {@code null} otherwise, including when evaluation throws (the failure
 *         is logged at WARN level)
 * @throws IndexingException declared by the filter contract
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    // Create a context and add data
    JexlContext jcontext = new MapContext();

    // Values taken from the crawl datum
    jcontext.set("status", CrawlDatum.getStatusName(datum.getStatus()));
    jcontext.set("fetchTime", (long) (datum.getFetchTime()));
    jcontext.set("modifiedTime", (long) (datum.getModifiedTime()));
    jcontext.set("retries", datum.getRetriesSinceFetch());
    // Integer.valueOf replaces the deprecated boxing constructor.
    jcontext.set("interval", Integer.valueOf(datum.getFetchInterval()));
    jcontext.set("score", datum.getScore());
    jcontext.set("signature", StringUtil.toHexString(datum.getSignature()));

    // Values taken from the URL and the parse result
    jcontext.set("url", url.toString());
    jcontext.set("text", parse.getText());
    jcontext.set("title", parse.getData().getTitle());

    // Parse status is exposed as a nested context
    JexlContext httpStatusContext = new MapContext();
    httpStatusContext.set("majorCode", parse.getData().getStatus().getMajorCode());
    httpStatusContext.set("minorCode", parse.getData().getStatus().getMinorCode());
    httpStatusContext.set("message", parse.getData().getStatus().getMessage());
    jcontext.set("httpStatus", httpStatusContext);

    // Metadata sets
    jcontext.set("documentMeta", metadataToContext(doc.getDocumentMeta()));
    jcontext.set("contentMeta", metadataToContext(parse.getData().getContentMeta()));
    jcontext.set("parseMeta", metadataToContext(parse.getData().getParseMeta()));

    // Document fields: each field name maps to its list of values
    JexlContext context = new MapContext();
    for (Entry<String, NutchField> entry : doc) {
        context.set(entry.getKey(), entry.getValue().getValues());
    }
    jcontext.set("doc", context);

    try {
        // Only an exact Boolean TRUE keeps the document.
        if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
            return doc;
        }
    } catch (Exception e) {
        // Best effort: an evaluation failure is logged and the document is dropped.
        LOG.warn("Failed evaluating JEXL {}", expr.getExpression(), e);
    }
    return null;
}
use of org.apache.nutch.indexer.NutchField in project nutch by apache.
the class ElasticIndexWriter method write.
/**
 * Queues a Nutch document for bulk indexing.
 *
 * <p>The document id is read from the {@code "id"} field and the type from the
 * {@code "type"} entry of the document metadata, falling back to {@code "doc"}
 * when that entry is absent. A field with more than one value is added as a
 * list, a field with a single value as that value. Fields that carry no values
 * are skipped.
 *
 * @param doc the document to add to the bulk processor
 * @throws IOException declared by the writer contract
 */
@Override
public void write(NutchDocument doc) throws IOException {
    String id = (String) doc.getFieldValue("id");
    String type = doc.getDocumentMeta().get("type");
    if (type == null) {
        type = "doc";
    }

    // Add each field of this doc to the index source
    Map<String, Object> source = new HashMap<String, Object>();
    for (final Map.Entry<String, NutchField> e : doc) {
        final List<Object> values = e.getValue().getValues();
        if (values == null || values.isEmpty()) {
            // Nothing to index for this field; values.get(0) would throw here.
            continue;
        }
        if (values.size() > 1) {
            source.put(e.getKey(), values);
        } else {
            source.put(e.getKey(), values.get(0));
        }
    }

    IndexRequest request = new IndexRequest(defaultIndex, type, id).source(source);
    bulkProcessor.add(request);
}
use of org.apache.nutch.indexer.NutchField in project nutch by apache.
the class SolrIndexWriter method write.
/**
 * Converts a Nutch document into a {@code SolrInputDocument} and adds it to the
 * pending batch, pushing the batch once the number of pending adds plus
 * deletes reaches {@code batchSize}.
 *
 * <p>{@link Date} values are formatted via
 * {@code DateUtil.getThreadLocalDateFormat()}; values of the {@code "content"}
 * and {@code "title"} fields are passed through
 * {@code SolrUtils.stripNonCharCodepoints}. When the Solr mapping defines a
 * copy target for a field, the same normalised value is also added under the
 * copy key.
 *
 * @param doc the document to convert and queue
 * @throws IOException declared by the writer contract
 */
public void write(NutchDocument doc) throws IOException {
    final SolrInputDocument inputDoc = new SolrInputDocument();
    for (final Entry<String, NutchField> e : doc) {
        final String key = e.getKey();
        final boolean needsStripping = key.equals("content") || key.equals("title");
        for (final Object val : e.getValue().getValues()) {
            // normalise the string representation for a Date
            Object val2 = val;
            if (val instanceof Date) {
                val2 = DateUtil.getThreadLocalDateFormat().format(val);
            }
            if (needsStripping) {
                val2 = SolrUtils.stripNonCharCodepoints((String) val);
            }
            inputDoc.addField(solrMapping.mapKey(key), val2, e.getValue().getWeight());

            String sCopy = solrMapping.mapCopyKey(key);
            // Compare by content, not by reference: an equal but distinct String
            // instance must not be treated as a separate copy target.
            if (!key.equals(sCopy)) {
                // The copy receives the normalised value so it gets the same date
                // formatting / codepoint stripping as the primary field.
                inputDoc.addField(sCopy, val2);
            }
        }
    }
    inputDoc.setDocumentBoost(doc.getWeight());
    inputDocs.add(inputDoc);
    totalAdds++;
    if (inputDocs.size() + numDeletes >= batchSize) {
        push();
    }
}
use of org.apache.nutch.indexer.NutchField in project nutch by apache.
the class TLDScoringFilter method indexerScore.
/**
 * Scales the initial indexer score by the boosts registered for the document's
 * {@code "tld"} field values.
 *
 * <p>Each value of the field is looked up in {@code tldEntries} by its string
 * form; the boost of every entry found is multiplied into the result. Values
 * without an entry leave the score unchanged, as does a missing field.
 *
 * @param initScore the score to scale
 * @return {@code initScore} multiplied by the product of all matching boosts
 * @throws ScoringFilterException declared by the scoring filter contract
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
    final NutchField tldField = doc.getField("tld");
    if (tldField == null) {
        // No TLD information: the score passes through unchanged.
        return initScore;
    }

    float multiplier = 1.0f;
    for (Object value : tldField.getValues()) {
        final DomainSuffix suffix = tldEntries.get(value.toString());
        if (suffix == null) {
            continue;
        }
        multiplier *= suffix.getBoost();
    }
    return initScore * multiplier;
}
Aggregations