Use of org.apache.nutch.parse.ParseImpl in the Apache Nutch project.
Class TextProfileSignature, method main.
/**
 * Command-line entry point: computes a signature for every regular file
 * directly inside the directory named by {@code args[0]} and prints one
 * line per file to standard output in the form
 * {@code <file path> TAB <StringUtil.toHexString(signature)>}.
 *
 * <p>Each file is decoded as UTF-8 and read line by line; the lines are
 * re-joined with a single {@code "\n"}, so the original line terminators
 * and any trailing newline are not part of the text that is signed.
 *
 * <p>Output order follows the iteration order of a {@link HashMap} and is
 * therefore unspecified.
 *
 * @param args {@code args[0]} is the path of the directory to scan
 * @throws IllegalArgumentException if no directory argument is given, or
 *         if the path cannot be listed as a directory
 * @throws Exception if a file cannot be opened or read, or if signature
 *         calculation fails
 */
public static void main(String[] args) throws Exception {
  if (args == null || args.length < 1) {
    throw new IllegalArgumentException(
        "Usage: TextProfileSignature <directory>");
  }

  File dir = new File(args[0]);
  // listFiles() returns null (not an empty array) when the path is not a
  // directory or an I/O error occurs; fail with a message instead of an NPE.
  File[] files = dir.listFiles();
  if (files == null) {
    throw new IllegalArgumentException(
        "Not a directory or not readable: " + dir);
  }

  TextProfileSignature sig = new TextProfileSignature();
  sig.setConf(NutchConfiguration.create());

  HashMap<String, byte[]> res = new HashMap<>();
  for (File file : files) {
    // Sub-directories and other non-regular entries cannot be opened as a
    // stream; skip them rather than aborting the whole run.
    if (!file.isFile()) {
      continue;
    }

    StringBuilder text = new StringBuilder();
    // try-with-resources guarantees the stream is closed even when
    // reading fails part-way through the file.
    try (FileInputStream fis = new FileInputStream(file);
        BufferedReader br = new BufferedReader(
            new InputStreamReader(fis, "UTF-8"))) {
      String line;
      while ((line = br.readLine()) != null) {
        if (text.length() > 0) {
          text.append("\n");
        }
        text.append(line);
      }
    }

    // Both the first argument to calculate() and the second ParseImpl
    // constructor argument are passed as null here; only the text is supplied.
    byte[] signature = sig.calculate(null, new ParseImpl(text.toString(), null));
    res.put(file.toString(), signature);
  }

  for (String name : res.keySet()) {
    byte[] signature = res.get(name);
    System.out.println(name + "\t" + StringUtil.toHexString(signature));
  }
}
Aggregations