Example 6 with CompressionType

use of org.apache.hadoop.io.SequenceFile.CompressionType in project hadoop by apache.

the class MapFileOutputFormat method getRecordWriter.

public RecordWriter<WritableComparable, Writable> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException {
    // get the path of the temporary output file 
    Path file = FileOutputFormat.getTaskOutputPath(job, name);
    FileSystem fs = file.getFileSystem(job);
    CompressionCodec codec = null;
    CompressionType compressionType = CompressionType.NONE;
    if (getCompressOutput(job)) {
        // find the kind of compression to do
        compressionType = SequenceFileOutputFormat.getOutputCompressionType(job);
        // find the right codec
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, DefaultCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, job);
    }
    // ignore the progress parameter, since MapFile is local
    final MapFile.Writer out =
        new MapFile.Writer(job, fs, file.toString(),
            job.getOutputKeyClass().asSubclass(WritableComparable.class),
            job.getOutputValueClass().asSubclass(Writable.class),
            compressionType, codec, progress);
    return new RecordWriter<WritableComparable, Writable>() {

        public void write(WritableComparable key, Writable value) throws IOException {
            out.append(key, value);
        }

        public void close(Reporter reporter) throws IOException {
            out.close();
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) WritableComparable(org.apache.hadoop.io.WritableComparable) FileSystem(org.apache.hadoop.fs.FileSystem) Writable(org.apache.hadoop.io.Writable) MapFile(org.apache.hadoop.io.MapFile) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType)
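
A minimal driver-side sketch of how these settings are usually supplied with the old mapred API; the wrapper class, job wiring, and chosen values are illustrative, not taken from the example above:

import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

// Hypothetical helper: wires a JobConf so that getRecordWriter above sees
// block compression with the default codec.
public class MapFileJobSetup {

    public static JobConf configureOutput(JobConf job) {
        job.setOutputFormat(MapFileOutputFormat.class);
        // makes getCompressOutput(job) return true
        FileOutputFormat.setCompressOutput(job, true);
        // read back by SequenceFileOutputFormat.getOutputCompressionType(job)
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        // read back by getOutputCompressorClass(job, DefaultCodec.class)
        FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        return job;
    }
}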

Example 7 with CompressionType

use of org.apache.hadoop.io.SequenceFile.CompressionType in project hadoop by apache.

the class TestSequenceFile method main.

/** For debugging and testing. */
public static void main(String[] args) throws Exception {
    int count = 1024 * 1024;
    int megabytes = 1;
    int factor = 10;
    boolean create = true;
    boolean rwonly = false;
    boolean check = false;
    boolean fast = false;
    boolean merge = false;
    String compressType = "NONE";
    String compressionCodec = "org.apache.hadoop.io.compress.DefaultCodec";
    Path file = null;
    int seed = new Random().nextInt();
    String usage = "Usage: testsequencefile " + "[-count N] " + "[-seed #] [-check] [-compressType <NONE|RECORD|BLOCK>] " + "-codec <compressionCodec> " + "[[-rwonly] | {[-megabytes M] [-factor F] [-nocreate] [-fast] [-merge]}] " + " file";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    FileSystem fs = null;
    try {
        for (int i = 0; i < args.length; ++i) {
            // parse command line
            if (args[i] == null) {
                continue;
            } else if (args[i].equals("-count")) {
                count = Integer.parseInt(args[++i]);
            } else if (args[i].equals("-megabytes")) {
                megabytes = Integer.parseInt(args[++i]);
            } else if (args[i].equals("-factor")) {
                factor = Integer.parseInt(args[++i]);
            } else if (args[i].equals("-seed")) {
                seed = Integer.parseInt(args[++i]);
            } else if (args[i].equals("-rwonly")) {
                rwonly = true;
            } else if (args[i].equals("-nocreate")) {
                create = false;
            } else if (args[i].equals("-check")) {
                check = true;
            } else if (args[i].equals("-fast")) {
                fast = true;
            } else if (args[i].equals("-merge")) {
                merge = true;
            } else if (args[i].equals("-compressType")) {
                compressType = args[++i];
            } else if (args[i].equals("-codec")) {
                compressionCodec = args[++i];
            } else {
                // file is required parameter
                file = new Path(args[i]);
            }
        }
        TestSequenceFile test = new TestSequenceFile();
        fs = file.getFileSystem(test.conf);
        LOG.info("count = " + count);
        LOG.info("megabytes = " + megabytes);
        LOG.info("factor = " + factor);
        LOG.info("create = " + create);
        LOG.info("seed = " + seed);
        LOG.info("rwonly = " + rwonly);
        LOG.info("check = " + check);
        LOG.info("fast = " + fast);
        LOG.info("merge = " + merge);
        LOG.info("compressType = " + compressType);
        LOG.info("compressionCodec = " + compressionCodec);
        LOG.info("file = " + file);
        if (rwonly && (!create || merge || fast)) {
            System.err.println(usage);
            System.exit(-1);
        }
        CompressionType compressionType = CompressionType.valueOf(compressType);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(test.conf.getClassByName(compressionCodec), test.conf);
        if (rwonly || (create && !merge)) {
            test.writeTest(fs, count, seed, file, compressionType, codec);
            test.readTest(fs, count, seed, file);
        }
        if (!rwonly) {
            if (merge) {
                test.mergeTest(fs, count, seed, file, compressionType, fast, factor, megabytes);
            } else {
                test.sortTest(fs, count, megabytes, factor, fast, file);
            }
        }
        if (check) {
            test.checkSort(fs, count, seed, file);
        }
    } finally {
        if (fs != null) {
            fs.close();
        }
    }
}
Also used : CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType)
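
The write/read round trip that the test drives from the command line, reduced to a standalone sketch; the file path, record types, and values are assumptions for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CompressionTypeRoundTrip {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // assumed location; TestSequenceFile takes this as its "file" argument
        Path file = new Path("/tmp/compression-type-demo.seq");
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(Text.class),
                SequenceFile.Writer.compression(CompressionType.BLOCK, new DefaultCodec()))) {
            for (int i = 0; i < 100; i++) {
                writer.append(new IntWritable(i), new Text("record-" + i));
            }
        }
        // The CompressionType is recorded in the file header, so the reader needs no hint.
        IntWritable key = new IntWritable();
        Text value = new Text();
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(file))) {
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        }
    }
}

BLOCK compression batches many keys and values before compressing and typically yields better ratios than RECORD, at the cost of coarser-grained reads.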

Example 8 with CompressionType

use of org.apache.hadoop.io.SequenceFile.CompressionType in project hadoop by apache.

the class SequenceFileOutputFormat method getSequenceWriter.

protected SequenceFile.Writer getSequenceWriter(TaskAttemptContext context, Class<?> keyClass, Class<?> valueClass) throws IOException {
    Configuration conf = context.getConfiguration();
    CompressionCodec codec = null;
    CompressionType compressionType = CompressionType.NONE;
    if (getCompressOutput(context)) {
        // find the kind of compression to do
        compressionType = getOutputCompressionType(context);
        // find the right codec
        Class<?> codecClass = getOutputCompressorClass(context, DefaultCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
    }
    // get the path of the temporary output file 
    Path file = getDefaultWorkFile(context, "");
    FileSystem fs = file.getFileSystem(conf);
    return SequenceFile.createWriter(fs, conf, file, keyClass, valueClass, compressionType, codec, context);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType)
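
A hedged driver sketch (the helper class and job name are illustrative) of the settings that getSequenceWriter reads through getCompressOutput, getOutputCompressionType, and getOutputCompressorClass in the new mapreduce API:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SeqFileJobSetup {

    public static Job configureOutput(Configuration conf, Path out) throws Exception {
        Job job = Job.getInstance(conf, "seqfile-output-demo"); // job name is illustrative
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, out);
        // makes getCompressOutput(context) return true
        FileOutputFormat.setCompressOutput(job, true);
        // read back by getOutputCompressionType(context)
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        // read back by getOutputCompressorClass(context, DefaultCodec.class)
        FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        return job;
    }
}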

Example 9 with CompressionType

use of org.apache.hadoop.io.SequenceFile.CompressionType in project nutch by apache.

the class FetcherOutputFormat method getRecordWriter.

@Override
public RecordWriter<Text, NutchWritable> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String name = getUniqueFile(context, "part", "");
    Path dir = FileOutputFormat.getOutputPath(context);
    FileSystem fs = dir.getFileSystem(context.getConfiguration());
    Path out = FileOutputFormat.getOutputPath(context);
    final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
    final Path content = new Path(new Path(out, Content.DIR_NAME), name);
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
    Option fKeyClassOpt = MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option fValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option fProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
    org.apache.hadoop.io.SequenceFile.Writer.Option fCompOpt = SequenceFile.Writer.compression(compType);
    final MapFile.Writer fetchOut = new MapFile.Writer(conf, fetch, fKeyClassOpt, fValClassOpt, fCompOpt, fProgressOpt);
    return new RecordWriter<Text, NutchWritable>() {

        private MapFile.Writer contentOut;

        private RecordWriter<Text, Parse> parseOut;

        {
            if (Fetcher.isStoringContent(conf)) {
                Option cKeyClassOpt = MapFile.Writer.keyClass(Text.class);
                org.apache.hadoop.io.SequenceFile.Writer.Option cValClassOpt = SequenceFile.Writer.valueClass(Content.class);
                org.apache.hadoop.io.SequenceFile.Writer.Option cProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
                org.apache.hadoop.io.SequenceFile.Writer.Option cCompOpt = SequenceFile.Writer.compression(compType);
                contentOut = new MapFile.Writer(conf, content, cKeyClassOpt, cValClassOpt, cCompOpt, cProgressOpt);
            }
            if (Fetcher.isParsing(conf)) {
                parseOut = new ParseOutputFormat().getRecordWriter(context);
            }
        }

        public void write(Text key, NutchWritable value) throws IOException, InterruptedException {
            Writable w = value.get();
            if (w instanceof CrawlDatum)
                fetchOut.append(key, w);
            else if (w instanceof Content && contentOut != null)
                contentOut.append(key, w);
            else if (w instanceof Parse && parseOut != null)
                parseOut.write(key, (Parse) w);
        }

        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            fetchOut.close();
            if (contentOut != null) {
                contentOut.close();
            }
            if (parseOut != null) {
                parseOut.close(context);
            }
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Parse(org.apache.nutch.parse.Parse) MapFile(org.apache.hadoop.io.MapFile) NutchWritable(org.apache.nutch.crawl.NutchWritable) Writable(org.apache.hadoop.io.Writable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) Content(org.apache.nutch.protocol.Content) FileSystem(org.apache.hadoop.fs.FileSystem) ParseOutputFormat(org.apache.nutch.parse.ParseOutputFormat) Option(org.apache.hadoop.io.MapFile.Writer.Option) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType)
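
A standalone sketch of the same option-based MapFile.Writer construction used for fetchOut and contentOut above, with an assumed output directory and records:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;

public class MapFileCompressionDemo {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/mapfile-compression-demo"); // assumed location
        MapFile.Writer writer = new MapFile.Writer(conf, dir,
                MapFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(Text.class),
                SequenceFile.Writer.compression(CompressionType.BLOCK));
        try {
            // MapFile requires keys to be appended in sorted order; the MapReduce
            // shuffle already guarantees this for the record writers above.
            writer.append(new Text("a"), new Text("first"));
            writer.append(new Text("b"), new Text("second"));
        } finally {
            writer.close();
        }
    }
}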

Example 10 with CompressionType

use of org.apache.hadoop.io.SequenceFile.CompressionType in project nutch by apache.

the class ParseOutputFormat method getRecordWriter.

public RecordWriter<Text, Parse> getRecordWriter(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String name = getUniqueFile(context, "part");
    Path dir = FileOutputFormat.getOutputPath(context);
    FileSystem fs = dir.getFileSystem(context.getConfiguration());
    if (conf.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(conf);
        exemptionFilters = new URLExemptionFilters(conf);
    }
    if (conf.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
    }
    this.scfilters = new ScoringFilters(conf);
    final int interval = conf.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
    final boolean ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
    final String ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
    // NUTCH-2435 - parameter "parser.store.text" allowing to choose whether to
    // store 'parse_text' directory or not:
    final boolean storeText = conf.getBoolean("parser.store.text", true);
    int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = conf.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
    Path out = FileOutputFormat.getOutputPath(context);
    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
    final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "").split(" *, *");
    // textOut Options
    final MapFile.Writer textOut;
    if (storeText) {
        Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
        org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
        org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
        textOut = new MapFile.Writer(conf, text, tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
    } else {
        textOut = null;
    }
    // dataOut Options
    Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class);
    org.apache.hadoop.io.SequenceFile.Writer.Option dProgressOpt = SequenceFile.Writer.progressable((Progressable) context);
    org.apache.hadoop.io.SequenceFile.Writer.Option dCompOpt = SequenceFile.Writer.compression(compType);
    final MapFile.Writer dataOut = new MapFile.Writer(conf, data, dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);
    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(crawl),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(CrawlDatum.class),
        SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size", 4096)),
        SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)),
        SequenceFile.Writer.blockSize(1073741824),
        SequenceFile.Writer.compression(compType, new DefaultCodec()),
        SequenceFile.Writer.progressable((Progressable) context),
        SequenceFile.Writer.metadata(new Metadata()));
    return new RecordWriter<Text, Parse>() {

        public void write(Text key, Parse parse) throws IOException {
            String fromUrl = key.toString();
            // host or domain name of the source URL
            String origin = null;
            if (textOut != null) {
                textOut.append(key, new ParseText(parse.getText()));
            }
            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }
            // see if the parse metadata contain things that we'd like
            // to pass to the metadata of the crawlDB entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);
            // need to determine origin (once for all outlinks)
            if (ignoreExternalLinks || ignoreInternalLinks) {
                URL originURL = new URL(fromUrl.toString());
                // based on domain?
                if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                    origin = URLUtil.getDomainName(originURL).toLowerCase();
                } else // use host
                {
                    origin = originURL.getHost().toLowerCase();
                }
            }
            ParseStatus pstatus = parseData.getStatus();
            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                String newUrl = pstatus.getMessage();
                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                newUrl = filterNormalize(fromUrl, newUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers, URLNormalizers.SCOPE_FETCHER);
                if (newUrl != null) {
                    String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME);
                    CrawlDatum newDatum = new CrawlDatum();
                    newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                    if (reprUrl != null && !reprUrl.equals(newUrl)) {
                        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                    }
                    crawlOut.append(new Text(newUrl), newDatum);
                }
            }
            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();
                // only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }
                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                // see if the outlink has any metadata attached
                // and if so pass that to the crawldatum so that
                // the initial score or distribution can use that
                MapWritable outlinkMD = links[i].getMetadata();
                if (outlinkMD != null) {
                    target.getMetaData().putAll(outlinkMD);
                }
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                    target.setScore(0.0f);
                }
                targets.add(new SimpleEntry(targetUrl, target));
                // overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }
            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets, null, links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);
            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);
            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        public void close(TaskAttemptContext context) throws IOException {
            if (textOut != null)
                textOut.close();
            dataOut.close();
            crawlOut.close();
        }
    };
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Metadata(org.apache.hadoop.io.SequenceFile.Metadata) ArrayList(java.util.ArrayList) MapFile(org.apache.hadoop.io.MapFile) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) URL(java.net.URL) Entry(java.util.Map.Entry) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) URLFilters(org.apache.nutch.net.URLFilters) Path(org.apache.hadoop.fs.Path) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) MapWritable(org.apache.hadoop.io.MapWritable) MalformedURLException(java.net.MalformedURLException) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) IOException(java.io.IOException) Progressable(org.apache.hadoop.util.Progressable) Option(org.apache.hadoop.io.MapFile.Writer.Option) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType) URLExemptionFilters(org.apache.nutch.net.URLExemptionFilters) URLNormalizers(org.apache.nutch.net.URLNormalizers)
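
A hedged sketch of how the CompressionType returned by SequenceFileOutputFormat.getOutputCompressionType(context) in the two Nutch examples can be selected through the job Configuration; it assumes the FileOutputFormat.COMPRESS_TYPE constant of the new mapreduce API and an illustrative helper class:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SegmentCompressionConfig {

    public static Job withBlockCompression(Configuration conf) throws Exception {
        // FileOutputFormat.COMPRESS_TYPE is the key that getOutputCompressionType
        // reads; if it is unset, the RECORD default applies.
        conf.set(FileOutputFormat.COMPRESS_TYPE, CompressionType.BLOCK.toString());
        Job job = Job.getInstance(conf, "nutch-segment-demo"); // job name is illustrative
        // getOutputCompressionType(job) should now resolve to BLOCK.
        CompressionType resolved = SequenceFileOutputFormat.getOutputCompressionType(job);
        System.out.println("compression type: " + resolved);
        return job;
    }
}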

Aggregations

CompressionType (org.apache.hadoop.io.SequenceFile.CompressionType): 11
FileSystem (org.apache.hadoop.fs.FileSystem): 7
Path (org.apache.hadoop.fs.Path): 7
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 7
Configuration (org.apache.hadoop.conf.Configuration): 4
MapFile (org.apache.hadoop.io.MapFile): 4
SequenceFile (org.apache.hadoop.io.SequenceFile): 4
Writable (org.apache.hadoop.io.Writable): 3
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter): 3
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 3
IOException (java.io.IOException): 2
Option (org.apache.hadoop.io.MapFile.Writer.Option): 2
Text (org.apache.hadoop.io.Text): 2
WritableComparable (org.apache.hadoop.io.WritableComparable): 2
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 2
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 2
MalformedURLException (java.net.MalformedURLException): 1
URL (java.net.URL): 1
FileSystemNotFoundException (java.nio.file.FileSystemNotFoundException): 1
ArrayList (java.util.ArrayList): 1