use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.
the class EmailUtils method getYearWiseAttachments.
/**
 * Counts, per calendar year, the attachments carried by the given emails.
 *
 * Only attachments whose (lower-cased) extension does not match {@code EmailRenderer.EXCLUDED_EXT}
 * are counted. When {@code params} is non-null the count is further restricted to the extensions
 * returned by {@code IndexUtils.getAttachmentExtensionsOfInterest(params)}; otherwise every
 * non-excluded attachment is counted. Attachments without an extension are treated as "unidentified".
 *
 * A year is present in the returned map (possibly with a count of 0) as soon as one email of that
 * year has at least one attachment, even if all of its attachments were filtered out.
 *
 * NOTE(review): the archive is looked up through the "archiveID" entry of {@code params} before the
 * null check on {@code params}; whether {@code JSPHelper.getParam} tolerates a null map is not
 * visible here - confirm with the callers.
 *
 * @param docs   the emails to inspect; every element is cast to {@code EmailDocument}
 * @param params front-end query parameters (archive id and attachment types of interest)
 * @return a pair whose first element tells whether at least one counted attachment belongs to an
 *         email carrying a hacky date (1 January 1960), and whose second element maps each year to
 *         its attachment count, in order of first appearance. If the exclusion pattern cannot be
 *         compiled, the error is reported and {@code (false, empty map)} is returned.
 */
public static Pair<Boolean, Map<Integer, Integer>> getYearWiseAttachments(Collection<Document> docs, Multimap<String, String> params) {
    String archiveID = JSPHelper.getParam(params, "archiveID");
    Archive archive = ArchiveReaderWriter.getArchiveForArchiveID(archiveID);

    Map<Integer, Integer> result = new LinkedHashMap<>();
    boolean isHacky = false;

    Pattern pattern;
    try {
        pattern = Pattern.compile(EmailRenderer.EXCLUDED_EXT);
    } catch (Exception e) {
        // Without the exclusion pattern no attachment can be classified; report and return an
        // empty result instead of failing later with a NullPointerException.
        Util.report_exception(e);
        return new Pair<>(isHacky, result);
    }

    Set<String> attachmentTypes = null;
    if (params != null) {
        attachmentTypes = IndexUtils.getAttachmentExtensionsOfInterest(params);
    }

    // One calendar is enough: setTime() overwrites its state for every email.
    Calendar calendar = Calendar.getInstance();
    for (Document d : docs) {
        EmailDocument ed = (EmailDocument) d;
        List<Blob> attachments = ed.attachments;
        // Emails without attachments do not contribute a year entry.
        if (attachments == null || attachments.isEmpty()) {
            continue;
        }

        calendar.setTime(ed.date);
        int year = calendar.get(Calendar.YEAR);

        // Count only the attachments that survive the exclusion pattern and the type filter.
        int count = 0;
        for (Blob b : attachments) {
            String ext = Util.getExtension(archive.getBlobStore().get_URL_Normalized(b));
            if (ext == null) {
                ext = "Unidentified";
            }
            ext = ext.toLowerCase();
            if (pattern.matcher(ext).find()) {
                // excluded extension (e.g. of the form [0-9]+)
                continue;
            }
            if (attachmentTypes != null && !attachmentTypes.contains(ext)) {
                continue;
            }
            count++;
        }

        // Creates the year entry on first sight (even for a count of 0) and accumulates afterwards.
        result.merge(year, count, Integer::sum);

        // The year is only flagged as hacky when this email actually contributed to the count.
        if (count != 0 && ed.hackyDate) {
            isHacky = true;
        }
    }
    return new Pair<>(isHacky, result);
}
use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.
the class Indexer method moveDocAndAttachmentsToThisIndex.
/*
 Moves edoc from src to dest index. Here src is the source indexer and the current object (Indexer) is where
 the doc is going to be moved.
 The associated attachments are also copied from srcBlobStore to destBlobStore and re-indexed under the
 id they get in destBlobStore.
 Don't forget to call pack()[BlobStore] after calling this method for every new document that has been added.

 If the Lucene document of the email, or of any of its attachments, cannot be found in the source indexer,
 the method returns silently without touching this indexer's maps or writers. Blobs already copied into
 destBlobStore at that point are NOT rolled back.

 Throws IOException when opening a writer, reading a source blob or adding a document fails.
 */
public void moveDocAndAttachmentsToThisIndex(Indexer srcindexer, EmailDocument edoc, BlobStore srcBlobStore, BlobStore destBlobStore) throws IOException {
    // prepare writers if not done by the caller
    if (iwriter == null) {
        iwriter = openIndexWriter(directory);
    }
    if (iwriter_blob == null) {
        iwriter_blob = openIndexWriter(directory_blob);
    }

    // Lucene doc of the email itself, as stored by the source indexer.
    org.apache.lucene.document.Document dsrc = srcindexer.getLDoc(edoc.getUniqueId());
    if (dsrc == null) {
        // This should not happen.. @TODO debug it if it happens.
        return;
    }

    // create a new lucenedoc for adding to iwriter and copy all fields from dsrc into it
    org.apache.lucene.document.Document newdoc = new org.apache.lucene.document.Document();
    dsrc.getFields().forEach(newdoc::add);

    // Attachment docs and id->blob registrations are collected first and only published once every
    // attachment has been resolved, so an early return leaves this indexer's bookkeeping untouched.
    LinkedList<org.apache.lucene.document.Document> dattachments = new LinkedList<>();
    java.util.Map<String, Blob> newIdToBlob = new java.util.LinkedHashMap<>();

    if (edoc.attachments != null) {
        for (Blob b : edoc.attachments) {
            // NOTE: For now we are adding the blob irrespective of whether it was present in destBlobStore or not.
            // id of this blob in the source store, used to look up its Lucene doc
            String id = Integer.toString(srcBlobStore.index(b));
            org.apache.lucene.document.Document attachmentsrc = srcindexer.getLDocAttachment(id);
            if (attachmentsrc == null) {
                // This should not happen.. @TODO debug it if it happens.
                return;
            }

            // copy all fields except docId, which is replaced by the id the blob gets in destBlobStore
            org.apache.lucene.document.Document newattachmentdoc = new org.apache.lucene.document.Document();
            attachmentsrc.getFields().forEach(field -> {
                if (!"docId".equals(field.name())) {
                    newattachmentdoc.add(field);
                }
            });

            // Copy the blob content; the stream is closed here even if add() throws.
            URL url = new URL(srcBlobStore.get_URL_Normalized(b));
            try (java.io.InputStream in = url.openStream()) {
                destBlobStore.add(b, in);
            }

            String newid = Integer.toString(destBlobStore.index(b));
            newattachmentdoc.add(new Field("docId", newid, ft));
            dattachments.add(newattachmentdoc);
            newIdToBlob.put(newid, b);
        }
    }

    // Everything resolved: publish the bookkeeping, then add the docs to the writers.
    docIdToEmailDoc.put(edoc.getUniqueId(), edoc);
    newIdToBlob.forEach(attachmentDocIdToBlob::put);

    iwriter.addDocument(newdoc);
    for (org.apache.lucene.document.Document d : dattachments) {
        iwriter_blob.addDocument(d);
    }
}
use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.
the class IndexUtils method partitionAttachmentsByAttachmentType.
/**
 * Groups the attachments of the given emails by their lower-cased extension.
 *
 * This mirrors partitionDocsByAttachmentTypes, but each facet item counts attachments rather than
 * documents: extension -> number of attachments with that extension. To obtain that count, one
 * synthetic EmailDocument (with a unique message id) is added to the facet item per attachment,
 * so the document sets held by the returned items must only be used for counting.
 *
 * Attachments whose extension matches EmailRenderer.EXCLUDED_EXT are skipped, as are those not in
 * attachmentExtensionsOfInterest when that set is non-null. Documents that are not EmailDocuments
 * are ignored. If the exclusion pattern cannot be compiled, the error is reported and an empty map
 * is returned.
 */
private static Map<String, DetailedFacetItem> partitionAttachmentsByAttachmentType(Archive archive, Collection<? extends Document> docs, Set<String> attachmentExtensionsOfInterest) {
    Map<String, DetailedFacetItem> facetsByExtension = new LinkedHashMap<>();

    Pattern excludedExtensions;
    try {
        excludedExtensions = Pattern.compile(EmailRenderer.EXCLUDED_EXT);
    } catch (Exception e) {
        Util.report_exception(e);
        return facetsByExtension;
    }

    // Running suffix appended to the message id so that every synthetic document is distinct.
    int serial = 0;
    for (Document doc : docs) {
        if (!(doc instanceof EmailDocument)) {
            continue;
        }
        EmailDocument email = (EmailDocument) doc;
        List<Blob> blobs = email.attachments;
        if (blobs == null) {
            continue;
        }

        for (Blob blob : blobs) {
            String rawExt = Util.getExtension(archive.getBlobStore().get_URL_Normalized(blob));
            final String ext = (rawExt != null ? rawExt : "Unidentified").toLowerCase();

            // skip excluded extensions (e.g. of the form [0-9]+)
            if (excludedExtensions.matcher(ext).find()) {
                continue;
            }
            // skip extensions the caller did not ask for
            boolean ofInterest = attachmentExtensionsOfInterest == null || attachmentExtensionsOfInterest.contains(ext);
            if (!ofInterest) {
                continue;
            }

            DetailedFacetItem item = facetsByExtension.computeIfAbsent(ext,
                    key -> new DetailedFacetItem(ext, "number of " + ext + " attachments in this set of messages.", "attachmentExtension", ext));

            // One synthetic document per attachment; only its uniqueness matters.
            EmailDocument placeholder = new EmailDocument(email.id, email.emailSource, email.folderName, email.to, email.cc, email.bcc, email.from, email.getSubjectWithoutTitle(), email.messageID + serial, email.date);
            item.addDoc(placeholder);
            serial++;
        }
    }
    return facetsByExtension;
}
use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.
the class IndexUtils method getBlobsForAttachments.
/**
 * Collects the attachments (blobs) of the given emails whose URL tail, as computed by
 * Util.URLtail on the blob's relative URL in attachmentsStore, equals ANY one of the given tails.
 *
 * Returns an empty set when attachmentTails is null, or (after logging an error) when
 * attachmentsStore is null. Documents that are not EmailDocuments, or that have no attachment
 * list, are ignored. The result keeps the order in which matching blobs were encountered.
 */
public static Set<Blob> getBlobsForAttachments(Collection<? extends Document> docs, String[] attachmentTails, BlobStore attachmentsStore) {
    Set<Blob> matches = new LinkedHashSet<>();

    if (attachmentTails == null) {
        // nothing requested, nothing to return
        return matches;
    }
    if (attachmentsStore == null) {
        JSPHelper.log.error("No attachments store!");
        return matches;
    }

    // Set form of the requested tails for constant-time lookup.
    Set<String> wantedTails = new LinkedHashSet<>();
    for (String tail : attachmentTails) {
        wantedTails.add(tail);
    }

    for (Document doc : docs) {
        if (doc instanceof EmailDocument) {
            EmailDocument email = (EmailDocument) doc;
            if (email.attachments != null) {
                for (Blob blob : email.attachments) {
                    String tail = Util.URLtail(attachmentsStore.getRelativeURL(blob));
                    if (wantedTails.contains(tail)) {
                        matches.add(blob);
                    }
                }
            }
        }
    }
    return matches;
}
Aggregations