Use of tl.lin.data.pair.PairOfWritables in project Cloud9 by lintool.
Class AnalyzeBigramRelativeFrequencyTuple, method main.
/**
 * Command-line entry point. Reads every (Tuple, FloatWritable) pair from the
 * directory given by the INPUT option, keeps the pairs whose tuple has
 * "light" or "contain" as its first field, and for each of the two groups
 * prints the ten pairs with the largest float value.
 *
 * Exits with status -1 if the command line cannot be parsed or if the INPUT
 * option is missing (a usage message is printed in the latter case).
 *
 * @param args command-line arguments; must supply the INPUT option
 * @throws Exception if reading the input directory fails
 */
@SuppressWarnings({ "static-access" })
public static void main(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));

  CommandLine cmdline = null;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    System.exit(-1);
  }

  if (!cmdline.hasOption(INPUT)) {
    System.out.println("args: " + Arrays.toString(args));
    HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth(120);
    // Report this tool's own name in the usage banner (previously printed
    // the name of the sibling ...Json class).
    formatter.printHelp(AnalyzeBigramRelativeFrequencyTuple.class.getName(), options);
    ToolRunner.printGenericCommandUsage(System.out);
    System.exit(-1);
  }

  String inputPath = cmdline.getOptionValue(INPUT);
  System.out.println("input path: " + inputPath);

  List<PairOfWritables<Tuple, FloatWritable>> pairs = SequenceFileUtils.readDirectory(new Path(inputPath));

  // Partition the pairs of interest by the first field of the tuple.
  List<PairOfWritables<Tuple, FloatWritable>> lightPairs = Lists.newArrayList();
  List<PairOfWritables<Tuple, FloatWritable>> containPairs = Lists.newArrayList();
  for (PairOfWritables<Tuple, FloatWritable> p : pairs) {
    Tuple bigram = p.getLeftElement();
    if (bigram.get(0).equals("light")) {
      lightPairs.add(p);
    }
    if (bigram.get(0).equals("contain")) {
      containPairs.add(p);
    }
  }

  printTopEntries(lightPairs, 10);
  printTopEntries(containPairs, 10);
}

/**
 * Sorts {@code entries} in place by descending float value, breaking ties by
 * ascending tuple order, then prints at most {@code limit} leading entries as
 * {@code tuple<TAB>value}, one per line.
 *
 * @param entries pairs to sort and print; reordered by this call
 * @param limit maximum number of entries to print
 */
private static void printTopEntries(List<PairOfWritables<Tuple, FloatWritable>> entries, int limit) {
  Collections.sort(entries, new Comparator<PairOfWritables<Tuple, FloatWritable>>() {
    @Override
    @SuppressWarnings("unchecked")
    public int compare(PairOfWritables<Tuple, FloatWritable> e1, PairOfWritables<Tuple, FloatWritable> e2) {
      // Operands are swapped so that larger values sort first.
      int byValue = e2.getRightElement().compareTo(e1.getRightElement());
      if (byValue != 0) {
        return byValue;
      }
      // Equal values: fall back to the tuples' own ordering.
      return e1.getLeftElement().compareTo(e2.getLeftElement());
    }
  });

  Iterator<PairOfWritables<Tuple, FloatWritable>> top = Iterators.limit(entries.iterator(), limit);
  while (top.hasNext()) {
    PairOfWritables<Tuple, FloatWritable> p = top.next();
    Tuple bigram = p.getLeftElement();
    System.out.println(bigram + "\t" + p.getRightElement());
  }
}
Use of tl.lin.data.pair.PairOfWritables in project Cloud9 by lintool.
Class LookupPostings, method lookupTerm.
/**
 * Looks up {@code term} in the index behind {@code reader} and prints its
 * complete postings list, followed by a histogram of the second element of
 * each posting (labelled "tf" in the output).
 *
 * For every posting, the first element is used as a byte offset into the
 * file at {@code collectionPath}; the line found at that offset is printed
 * next to the posting, truncated to 80 characters. If the term is not in the
 * index, a message saying so is printed instead.
 *
 * @param term term to look up
 * @param reader reader over the index mapping terms to (df, postings) pairs
 * @param collectionPath path of the collection file the postings point into
 * @param fs file system used to open {@code collectionPath}
 * @throws IOException if the index or the collection cannot be read
 */
public static void lookupTerm(String term, MapFile.Reader reader, String collectionPath, FileSystem fs) throws IOException {
  FSDataInputStream collection = fs.open(new Path(collectionPath));
  try {
    Text key = new Text();
    PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value = new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();
    key.set(term);

    Writable w = reader.get(key, value);
    if (w == null) {
      System.out.println("\nThe term '" + term + "' does not appear in the collection");
      return;
    }

    ArrayListWritable<PairOfInts> postings = value.getRightElement();
    System.out.println("\nComplete postings list for '" + term + "':");
    System.out.println("df = " + value.getLeftElement());

    Int2IntFrequencyDistribution hist = new Int2IntFrequencyDistributionEntry();
    for (PairOfInts pair : postings) {
      hist.increment(pair.getRightElement());
      System.out.print(pair);

      collection.seek(pair.getLeftElement());
      // A fresh reader per posting: anything buffered by an earlier reader
      // would belong to the previous stream position.
      BufferedReader r = new BufferedReader(new InputStreamReader(collection));
      String d = r.readLine();
      if (d == null) {
        // Offset is at or past end of stream; print an empty excerpt
        // rather than failing with a NullPointerException.
        d = "";
      }
      d = d.length() > 80 ? d.substring(0, 80) + "..." : d;
      System.out.println(": " + d);
    }

    System.out.println("\nHistogram of tf values for '" + term + "'");
    for (PairOfInts pair : hist) {
      System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
    }
  } finally {
    // Always release the collection stream, including on the early return
    // for an unknown term and on any exception.
    collection.close();
  }
}
Aggregations