Search in sources :

Example 6 with PairOfWritables

use of tl.lin.data.pair.PairOfWritables in project Cloud9 by lintool.

In class AnalyzeBigramRelativeFrequencyTuple, method main.

/**
 * Loads (bigram, relative frequency) pairs via {@code SequenceFileUtils.readDirectory} from the
 * path given by the {@code INPUT} option, then prints the ten highest-valued entries whose first
 * tuple field is "light", followed by the ten whose first tuple field is "contain".
 *
 * <p>If the command line cannot be parsed, an error is written to standard error and the JVM
 * exits with status -1. If the {@code INPUT} option is missing, usage help is printed and the JVM
 * exits with status -1.
 *
 * @param args command-line arguments
 * @throws Exception propagated from reading the input or from accessing tuple fields
 */
@SuppressWarnings({ "static-access" })
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (!cmdline.hasOption(INPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        // Resolve the declaring class at runtime so the usage banner always names this tool;
        // the previous hard-coded class literal referred to a different class.
        formatter.printHelp(java.lang.invoke.MethodHandles.lookup().lookupClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }
    String inputPath = cmdline.getOptionValue(INPUT);
    System.out.println("input path: " + inputPath);
    List<PairOfWritables<Tuple, FloatWritable>> pairs = SequenceFileUtils.readDirectory(new Path(inputPath));
    List<PairOfWritables<Tuple, FloatWritable>> lightBigrams = Lists.newArrayList();
    List<PairOfWritables<Tuple, FloatWritable>> containBigrams = Lists.newArrayList();
    for (PairOfWritables<Tuple, FloatWritable> p : pairs) {
        Object firstField = p.getLeftElement().get(0);
        // Constant-first equals keeps a null first field from raising a NullPointerException.
        if ("light".equals(firstField)) {
            lightBigrams.add(p);
        }
        if ("contain".equals(firstField)) {
            containBigrams.add(p);
        }
    }
    printTopBigrams(lightBigrams, 10);
    printTopBigrams(containBigrams, 10);
}

/**
 * Sorts {@code entries} in place by descending right element, breaking ties by ascending left
 * element, and prints at most {@code limit} leading entries as {@code bigram<TAB>value} lines.
 *
 * @param entries the pairs to sort and print; reordered by this call
 * @param limit maximum number of entries to print
 */
private static void printTopBigrams(List<PairOfWritables<Tuple, FloatWritable>> entries, int limit) {
    Collections.sort(entries, new Comparator<PairOfWritables<Tuple, FloatWritable>>() {

        @Override
        @SuppressWarnings("unchecked")
        public int compare(PairOfWritables<Tuple, FloatWritable> e1, PairOfWritables<Tuple, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            // Operands swapped on purpose: larger values sort first.
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });
    Iterator<PairOfWritables<Tuple, FloatWritable>> top = Iterators.limit(entries.iterator(), limit);
    while (top.hasNext()) {
        PairOfWritables<Tuple, FloatWritable> p = top.next();
        System.out.println(p.getLeftElement() + "\t" + p.getRightElement());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Options(org.apache.commons.cli.Options) GnuParser(org.apache.commons.cli.GnuParser) HelpFormatter(org.apache.commons.cli.HelpFormatter) CommandLine(org.apache.commons.cli.CommandLine) FloatWritable(org.apache.hadoop.io.FloatWritable) PairOfWritables(tl.lin.data.pair.PairOfWritables) CommandLineParser(org.apache.commons.cli.CommandLineParser) ParseException(org.apache.commons.cli.ParseException) Tuple(org.apache.pig.data.Tuple)

Example 7 with PairOfWritables

use of tl.lin.data.pair.PairOfWritables in project Cloud9 by lintool.

In class LookupPostings, method lookupTerm.

/**
 * Looks up {@code term} in {@code reader} and prints its postings to standard output.
 *
 * <p>For each posting, the pair itself is printed followed by the first line read from the
 * collection file after seeking to the posting's left element (lines longer than 80 characters
 * are truncated with "..."). A histogram of the postings' right elements (labelled "tf" in the
 * output) is printed afterwards. If the term is not present in {@code reader}, a message saying so
 * is printed instead.
 *
 * @param term the term to look up
 * @param reader the MapFile reader queried for the term's postings
 * @param collectionPath path of the collection file the posting offsets seek into
 * @param fs file system used to open {@code collectionPath}
 * @throws IOException if opening, seeking, reading, or closing the collection fails, or if the
 *     lookup in {@code reader} fails
 */
public static void lookupTerm(String term, MapFile.Reader reader, String collectionPath, FileSystem fs) throws IOException {
    FSDataInputStream collection = fs.open(new Path(collectionPath));
    // try/finally guarantees the stream is closed on the "term not found" early return and on
    // any exception, not only on the success path.
    try {
        Text key = new Text();
        PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value = new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();
        key.set(term);
        Writable w = reader.get(key, value);
        if (w == null) {
            System.out.println("\nThe term '" + term + "' does not appear in the collection");
            return;
        }
        ArrayListWritable<PairOfInts> postings = value.getRightElement();
        System.out.println("\nComplete postings list for '" + term + "':");
        System.out.println("df = " + value.getLeftElement());
        Int2IntFrequencyDistribution hist = new Int2IntFrequencyDistributionEntry();
        for (PairOfInts pair : postings) {
            hist.increment(pair.getRightElement());
            System.out.print(pair);
            collection.seek(pair.getLeftElement());
            // A fresh reader per posting discards any read-ahead buffered before the seek.
            // NOTE(review): decodes with the platform default charset — confirm this matches
            // the collection's encoding.
            BufferedReader r = new BufferedReader(new InputStreamReader(collection));
            String d = r.readLine();
            if (d == null) {
                // readLine() returns null at end of stream; show an empty snippet instead of
                // failing with a NullPointerException.
                d = "";
            } else if (d.length() > 80) {
                d = d.substring(0, 80) + "...";
            }
            System.out.println(": " + d);
        }
        System.out.println("\nHistogram of tf values for '" + term + "'");
        for (PairOfInts pair : hist) {
            System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
        }
    } finally {
        collection.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayListWritable(tl.lin.data.array.ArrayListWritable) InputStreamReader(java.io.InputStreamReader) Int2IntFrequencyDistribution(tl.lin.data.fd.Int2IntFrequencyDistribution) PairOfInts(tl.lin.data.pair.PairOfInts) Writable(org.apache.hadoop.io.Writable) ArrayListWritable(tl.lin.data.array.ArrayListWritable) IntWritable(org.apache.hadoop.io.IntWritable) Text(org.apache.hadoop.io.Text) Int2IntFrequencyDistributionEntry(tl.lin.data.fd.Int2IntFrequencyDistributionEntry) PairOfWritables(tl.lin.data.pair.PairOfWritables) BufferedReader(java.io.BufferedReader) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) IntWritable(org.apache.hadoop.io.IntWritable)

Aggregations

PairOfWritables (tl.lin.data.pair.PairOfWritables)7 Path (org.apache.hadoop.fs.Path)6 CommandLine (org.apache.commons.cli.CommandLine)4 CommandLineParser (org.apache.commons.cli.CommandLineParser)4 GnuParser (org.apache.commons.cli.GnuParser)4 HelpFormatter (org.apache.commons.cli.HelpFormatter)4 Options (org.apache.commons.cli.Options)4 ParseException (org.apache.commons.cli.ParseException)4 IntWritable (org.apache.hadoop.io.IntWritable)4 Text (org.apache.hadoop.io.Text)4 FloatWritable (org.apache.hadoop.io.FloatWritable)3 ArrayListWritable (tl.lin.data.array.ArrayListWritable)3 PairOfInts (tl.lin.data.pair.PairOfInts)3 BufferedReader (java.io.BufferedReader)1 InputStreamReader (java.io.InputStreamReader)1 Configuration (org.apache.hadoop.conf.Configuration)1 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 MapFile (org.apache.hadoop.io.MapFile)1 Writable (org.apache.hadoop.io.Writable)1