
Example 11 with MapWritable

use of org.apache.hadoop.io.MapWritable in project nutch by apache.

the class DOMContentUtils method getOutlinks.

/**
 * This method finds all anchors below the supplied DOM <code>node</code>, and
 * creates appropriate {@link Outlink} records for each (relative to the
 * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
 * {@link ArrayList}.
 *
 * <p>
 *
 * Links without inner structure (tags, text, etc) are discarded, as are links
 * which contain only single nested links and empty text nodes (this is a
 * common DOM-fixup artifact, at least with nekohtml).
 *
 * @param base the canonical {@link URL}
 * @param outlinks the {@link ArrayList} of {@link Outlink}'s associated
 * with the base URL
 * @param node a {@link Node} under which to discover anchors
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
        Node currentNode = walker.nextNode();
        String nodeName = currentNode.getNodeName();
        short nodeType = currentNode.getNodeType();
        NodeList children = currentNode.getChildNodes();
        int childLen = (children != null) ? children.getLength() : 0;
        if (nodeType == Node.ELEMENT_NODE) {
            nodeName = nodeName.toLowerCase();
            LinkParams params = (LinkParams) linkParams.get(nodeName);
            if (params != null) {
                if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
                    StringBuffer linkText = new StringBuffer();
                    getText(linkText, currentNode, true);
                    if (linkText.toString().trim().length() == 0) {
                        // try harder - use img alt if present
                        NodeWalker subWalker = new NodeWalker(currentNode);
                        while (subWalker.hasNext()) {
                            Node subNode = subWalker.nextNode();
                            if (subNode.getNodeType() == Node.ELEMENT_NODE) {
                                if (subNode.getNodeName().toLowerCase().equals("img")) {
                                    NamedNodeMap subAttrs = subNode.getAttributes();
                                    Node alt = subAttrs.getNamedItem("alt");
                                    if (alt != null) {
                                        String altTxt = alt.getTextContent();
                                        if (altTxt != null && altTxt.trim().length() > 0) {
                                            if (linkText.length() > 0)
                                                linkText.append(' ');
                                            linkText.append(altTxt);
                                        }
                                    }
                                } else {
                                    // ignore other types of elements
                                }
                            } else if (subNode.getNodeType() == Node.TEXT_NODE) {
                                String txt = subNode.getTextContent();
                                if (txt != null && txt.length() > 0) {
                                    if (linkText.length() > 0)
                                        linkText.append(' ');
                                    linkText.append(txt);
                                }
                            }
                        }
                    }
                    NamedNodeMap attrs = currentNode.getAttributes();
                    String target = null;
                    boolean noFollow = false;
                    boolean post = false;
                    for (int i = 0; i < attrs.getLength(); i++) {
                        Node attr = attrs.item(i);
                        String attrName = attr.getNodeName();
                        if (params.attrName.equalsIgnoreCase(attrName)) {
                            target = attr.getNodeValue();
                        } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
                            noFollow = true;
                        } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
                            post = true;
                        }
                    }
                    if (target != null && !noFollow && !post)
                        try {
                            URL url = URLUtil.resolveURL(base, target);
                            Outlink outlink = new Outlink(url.toString(), linkText.toString().trim());
                            outlinks.add(outlink);
                            // the outlink metadata
                            if (keepNodenames) {
                                MapWritable metadata = new MapWritable();
                                metadata.put(new Text(srcTagMetaName), new Text(nodeName));
                                outlink.setMetadata(metadata);
                            }
                        } catch (MalformedURLException e) {
                            // don't care
                        }
                }
                // this should not have any children, skip them
                if (params.childLen == 0)
                    continue;
            }
        }
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) NamedNodeMap(org.w3c.dom.NamedNodeMap) Node(org.w3c.dom.Node) NodeList(org.w3c.dom.NodeList) Text(org.apache.hadoop.io.Text) MapWritable(org.apache.hadoop.io.MapWritable) NodeWalker(org.apache.nutch.util.NodeWalker) URL(java.net.URL)
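
In this variant the only thing stored in the MapWritable is a single Text-to-Text entry mapping the configured srcTagMetaName key to the lower-cased tag name of the element the link came from. A minimal sketch of reading that back on the consuming side, relying only on the Outlink.getMetadata() accessor seen in Example 13; the key is passed in because the configured name is not shown here:

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.parse.Outlink;

public final class OutlinkTagReader {

    // Sketch: look up the source tag name that DOMContentUtils stored under
    // srcTagMetaName. Returns null if the outlink carries no metadata or no
    // entry for that key.
    public static String sourceTag(Outlink outlink, String srcTagMetaName) {
        MapWritable metadata = outlink.getMetadata();
        if (metadata == null) {
            return null;
        }
        Writable tag = metadata.get(new Text(srcTagMetaName));
        return (tag == null) ? null : tag.toString();
    }
}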

Example 12 with MapWritable

use of org.apache.hadoop.io.MapWritable in project nutch by apache.

the class DOMContentUtils method getOutlinks.

/**
 * This method finds all anchors below the supplied DOM <code>node</code>, and
 * creates appropriate {@link Outlink} records for each (relative to the
 * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
 * {@link ArrayList}.
 *
 * <p>
 *
 * Links without inner structure (tags, text, etc) are discarded, as are links
 * which contain only single nested links and empty text nodes (this is a
 * common DOM-fixup artifact, at least with nekohtml).
 *
 * @param base the canonical {@link URL}
 * @param outlinks the {@link ArrayList} of {@link Outlink}'s associated
 * with the base URL
 * @param node a {@link Node} under which to discover anchors
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
        Node currentNode = walker.nextNode();
        String nodeName = currentNode.getNodeName();
        short nodeType = currentNode.getNodeType();
        NodeList children = currentNode.getChildNodes();
        int childLen = (children != null) ? children.getLength() : 0;
        if (nodeType == Node.ELEMENT_NODE) {
            nodeName = nodeName.toLowerCase();
            LinkParams params = (LinkParams) linkParams.get(nodeName);
            if (params != null) {
                if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
                    StringBuffer linkText = new StringBuffer();
                    getText(linkText, currentNode, true);
                    NamedNodeMap attrs = currentNode.getAttributes();
                    String target = null;
                    boolean noFollow = false;
                    boolean post = false;
                    for (int i = 0; i < attrs.getLength(); i++) {
                        Node attr = attrs.item(i);
                        String attrName = attr.getNodeName();
                        if (params.attrName.equalsIgnoreCase(attrName)) {
                            target = attr.getNodeValue();
                        } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
                            noFollow = true;
                        } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
                            post = true;
                        }
                    }
                    if (target != null && !noFollow && !post)
                        try {
                            URL url = URLUtil.resolveURL(base, target);
                            Outlink outlink = new Outlink(url.toString(), linkText.toString().trim());
                            outlinks.add(outlink);
                            // the outlink metadata
                            if (keepNodenames) {
                                MapWritable metadata = new MapWritable();
                                metadata.put(new Text(srcTagMetaName), new Text(nodeName));
                                outlink.setMetadata(metadata);
                            }
                        } catch (MalformedURLException e) {
                            // don't care
                        }
                }
                // this should not have any children, skip them
                if (params.childLen == 0)
                    continue;
            }
        }
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) NamedNodeMap(org.w3c.dom.NamedNodeMap) Node(org.w3c.dom.Node) NodeList(org.w3c.dom.NodeList) Text(org.apache.hadoop.io.Text) MapWritable(org.apache.hadoop.io.MapWritable) NodeWalker(org.apache.nutch.util.NodeWalker) URL(java.net.URL)

Example 13 with MapWritable

use of org.apache.hadoop.io.MapWritable in project nutch by apache.

the class ParseOutputFormat method getRecordWriter.

@Override
public RecordWriter<Text, Parse> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String name = getUniqueFile(context, "part");
    Path dir = FileOutputFormat.getOutputPath(context);
    FileSystem fs = dir.getFileSystem(context.getConfiguration());
    if (conf.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(conf);
        exemptionFilters = new URLExemptionFilters(conf);
    }
    if (conf.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
    }
    this.scfilters = new ScoringFilters(conf);
    final int interval = conf.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
    final boolean ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
    final String ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
    // NUTCH-2435 - parameter "parser.store.text" allowing to choose whether to
    // store 'parse_text' directory or not:
    final boolean storeText = conf.getBoolean("parser.store.text", true);
    int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
    int maxOutlinkL = conf.getInt("db.max.outlink.length", 4096);
    final int maxOutlinkLength = (maxOutlinkL < 0) ? Integer.MAX_VALUE : maxOutlinkL;
    final boolean isParsing = conf.getBoolean("fetcher.parse", true);
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
    Path out = FileOutputFormat.getOutputPath(context);
    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
    final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "").split(" *, *");
    // textOut Options
    final MapFile.Writer textOut;
    if (storeText) {
        Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
        org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
        textOut = new MapFile.Writer(conf, text, tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
    } else {
        textOut = null;
    }
    // dataOut Options
    Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option dProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
    org.apache.hadoop.io.SequenceFile.Writer.Option dCompOpt = SequenceFile.Writer.compression(compType);
    final MapFile.Writer dataOut = new MapFile.Writer(conf, data, dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);
    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(conf, SequenceFile.Writer.file(crawl), SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(CrawlDatum.class), SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size", 4096)), SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)), SequenceFile.Writer.blockSize(1073741824), SequenceFile.Writer.compression(compType, new DefaultCodec()), SequenceFile.Writer.progressable((Progressable) context), SequenceFile.Writer.metadata(new Metadata()));
    return new RecordWriter<Text, Parse>() {

        @Override
        public void write(Text key, Parse parse) throws IOException {
            String fromUrl = key.toString();
            // host or domain name of the source URL
            String origin = null;
            if (textOut != null) {
                textOut.append(key, new ParseText(parse.getText()));
            }
            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }
            // see if the parse metadata contain things that we'd like
            // to pass to the metadata of the crawlDB entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);
            // need to determine origin (once for all outlinks)
            if (ignoreExternalLinks || ignoreInternalLinks) {
                URL originURL = new URL(fromUrl.toString());
                // based on domain?
                if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                    origin = URLUtil.getDomainName(originURL).toLowerCase();
                } else {
                    // use host
                    origin = originURL.getHost().toLowerCase();
                }
            }
            ParseStatus pstatus = parseData.getStatus();
            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                String newUrl = pstatus.getMessage();
                int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
                newUrl = filterNormalize(fromUrl, newUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers, URLNormalizers.SCOPE_FETCHER);
                if (newUrl != null) {
                    String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME);
                    CrawlDatum newDatum = new CrawlDatum();
                    newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                    if (reprUrl != null && !reprUrl.equals(newUrl)) {
                        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                    }
                    crawlOut.append(new Text(newUrl), newDatum);
                }
            }
            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();
                // only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    if (toUrl.length() > maxOutlinkLength) {
                        continue;
                    }
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }
                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                // see if the outlink has any metadata attached
                // and if so pass that to the crawldatum so that
                // the initial score or distribution can use that
                MapWritable outlinkMD = links[i].getMetadata();
                if (outlinkMD != null) {
                    target.getMetaData().putAll(outlinkMD);
                }
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                    target.setScore(0.0f);
                }
                targets.add(new SimpleEntry(targetUrl, target));
                // overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }
            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets, null, links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);
            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);
            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException {
            if (textOut != null)
                textOut.close();
            dataOut.close();
            crawlOut.close();
        }
    };
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Metadata(org.apache.hadoop.io.SequenceFile.Metadata) ArrayList(java.util.ArrayList) MapFile(org.apache.hadoop.io.MapFile) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) URL(java.net.URL) Entry(java.util.Map.Entry) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) URLFilters(org.apache.nutch.net.URLFilters) Path(org.apache.hadoop.fs.Path) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) MapWritable(org.apache.hadoop.io.MapWritable) MalformedURLException(java.net.MalformedURLException) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) IOException(java.io.IOException) Progressable(org.apache.hadoop.util.Progressable) Option(org.apache.hadoop.io.MapFile.Writer.Option) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType) URLExemptionFilters(org.apache.nutch.net.URLExemptionFilters) URLNormalizers(org.apache.nutch.net.URLNormalizers)
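
The putAll call above is what carries the outlink's MapWritable into the target CrawlDatum, and that metadata then has to survive the trip through the SequenceFile written via crawlOut. A rough round-trip sketch of that serialization step, using Hadoop's in-memory buffers; the key and value strings are chosen for illustration and are not Nutch constants:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumMetadataRoundTrip {

    public static void main(String[] args) throws Exception {
        // Build a datum the way ParseOutputFormat does for an outlink, then
        // attach a MapWritable entry (key and value are illustrative only).
        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_LINKED, 2592000);
        datum.getMetaData().put(new Text("outlink.src.tag"), new Text("a"));

        // Serialize with the Writable contract, as the default SequenceFile
        // serialization does.
        DataOutputBuffer out = new DataOutputBuffer();
        datum.write(out);

        // Deserialize into a fresh datum and read the metadata back.
        CrawlDatum copy = new CrawlDatum();
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        copy.readFields(in);

        MapWritable md = copy.getMetaData();
        System.out.println(md.get(new Text("outlink.src.tag"))); // prints: a
    }
}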

Example 14 with MapWritable

use of org.apache.hadoop.io.MapWritable in project wonderdog by infochimps-labs.

the class ElasticSearchIndex method putNext.

/**
 * Map a tuple object into a MapWritable object for Elasticsearch.
 */
@SuppressWarnings("unchecked")
@Override
public void putNext(Tuple t) throws IOException {
    UDFContext context = UDFContext.getUDFContext();
    Properties property = context.getUDFProperties(ResourceSchema.class);
    MapWritable record = new MapWritable();
    String[] fieldNames = property.getProperty(PIG_ES_FIELD_NAMES).split(COMMA);
    for (int i = 0; i < t.size(); i++) {
        if (i < fieldNames.length) {
            try {
                record.put(new Text(fieldNames[i]), new Text(t.get(i).toString()));
            } catch (NullPointerException e) {
                // LOG.info("Increment null field counter.");
            }
        }
    }
    try {
        writer.write(NullWritable.get(), record);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
}
Also used : UDFContext(org.apache.pig.impl.util.UDFContext) Text(org.apache.hadoop.io.Text) MapWritable(org.apache.hadoop.io.MapWritable) IOException(java.io.IOException) Properties(java.util.Properties)
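
MapWritable implements java.util.Map&lt;Writable, Writable&gt;, so the record assembled in putNext() can be walked like any other map. The sketch below is not wonderdog's actual indexing path, just an illustration of flattening such a record into plain strings of the kind an Elasticsearch document is built from:

import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class MapWritableFlattener {

    // Sketch: copy the Text->Text entries of a MapWritable into an ordinary
    // String map, e.g. as a precursor to building a JSON document.
    public static Map<String, String> toStringMap(MapWritable record) {
        Map<String, String> doc = new LinkedHashMap<>();
        for (Map.Entry<Writable, Writable> entry : record.entrySet()) {
            doc.put(entry.getKey().toString(), entry.getValue().toString());
        }
        return doc;
    }

    public static void main(String[] args) {
        MapWritable record = new MapWritable();
        record.put(new Text("title"), new Text("hello"));
        record.put(new Text("body"), new Text("world"));
        System.out.println(toStringMap(record)); // {title=hello, body=world}
    }
}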

Example 15 with MapWritable

use of org.apache.hadoop.io.MapWritable in project gora by apache.

the class WritableUtils method writeProperties.

public static final void writeProperties(DataOutput out, Properties props) throws IOException {
    MapWritable propsWritable = new MapWritable();
    for (Entry<Object, Object> prop : props.entrySet()) {
        Writable key = new Text(prop.getKey().toString());
        Writable value = new Text(prop.getValue().toString());
        propsWritable.put(key, value);
    }
    propsWritable.write(out);
}
Also used : Writable(org.apache.hadoop.io.Writable) MapWritable(org.apache.hadoop.io.MapWritable) Text(org.apache.hadoop.io.Text)
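
A natural counterpart reads the properties back from a DataInput. Gora ships its own reader; the sketch below is written against the MapWritable API rather than copied from the project, so treat it as illustrative:

import java.io.DataInput;
import java.io.IOException;
import java.util.Map.Entry;
import java.util.Properties;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Writable;

public final class WritablePropertiesReader {

    // Sketch: reverse of writeProperties() above. readFields() restores the
    // MapWritable that writeProperties() wrote, and its Text entries are
    // copied back into a Properties object.
    public static Properties readProperties(DataInput in) throws IOException {
        MapWritable propsWritable = new MapWritable();
        propsWritable.readFields(in);
        Properties props = new Properties();
        for (Entry<Writable, Writable> prop : propsWritable.entrySet()) {
            props.setProperty(prop.getKey().toString(), prop.getValue().toString());
        }
        return props;
    }
}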

Aggregations

MapWritable (org.apache.hadoop.io.MapWritable): 18 usages
Text (org.apache.hadoop.io.Text): 10 usages
Writable (org.apache.hadoop.io.Writable): 9 usages
BytesWritable (org.apache.hadoop.io.BytesWritable): 5 usages
IOException (java.io.IOException): 4 usages
MalformedURLException (java.net.MalformedURLException): 3 usages
URL (java.net.URL): 3 usages
ArrayWritable (org.apache.hadoop.io.ArrayWritable): 3 usages
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 3 usages
LongWritable (org.apache.hadoop.io.LongWritable): 3 usages
NullWritable (org.apache.hadoop.io.NullWritable): 3 usages
Test (org.junit.Test): 3 usages
HashMap (java.util.HashMap): 2 usages
Map (java.util.Map): 2 usages
Properties (java.util.Properties): 2 usages
InputTableConfig (org.apache.accumulo.core.client.mapreduce.InputTableConfig): 2 usages
ImmutableBytesWritable (org.apache.hadoop.hbase.io.ImmutableBytesWritable): 2 usages
ByteWritable (org.apache.hadoop.io.ByteWritable): 2 usages
DoubleWritable (org.apache.hadoop.io.DoubleWritable): 2 usages
FloatWritable (org.apache.hadoop.io.FloatWritable): 2 usages