use of org.apache.poi.hsmf.datatypes.MAPIProperty in project poi by apache.
the class TypesLister method list.
/**
 * Prints a two-line summary of each property in {@code list} to {@code out}.
 * <p>The first line holds the property id as hex (zero-padded to at least
 * four digits) followed by the property name. The second line holds the
 * id in decimal, the name of the property's usual type, that type's id
 * (decimal, plus hex when the id is positive) and the {@code mapiProperty}
 * field.</p>
 *
 * @param list the properties to describe, in the order they should be printed
 * @param out  the stream that receives the listing
 */
private void list(ArrayList<MAPIProperty> list, PrintStream out) {
    for (MAPIProperty property : list) {
        // Left-pad the hex form of the id with zeros up to four digits
        StringBuilder hexId = new StringBuilder(Integer.toHexString(property.id));
        while (hexId.length() < 4) {
            hexId.insert(0, '0');
        }

        // Positive type ids are shown both in decimal and in hex
        int typeId = property.usualType.getId();
        String typeLabel = (typeId > 0)
                ? typeId + " / 0x" + Integer.toHexString(typeId)
                : Integer.toString(typeId);

        out.println("0x" + hexId + " - " + property.name);
        out.println(" " + property.id + " - " + property.usualType.getName() + " (" + typeLabel + ") - " + property.mapiProperty);
    }
}
use of org.apache.poi.hsmf.datatypes.MAPIProperty in project poi by apache.
the class MAPIMessage method guess7BitEncoding.
/**
 * Tries to identify the correct encoding for 7-bit (non-unicode)
 * strings in the file.
 * <p>Many messages store their strings as unicode, which is
 * nice and easy. Some use one-byte encodings for their
 * strings, but don't always store the encoding anywhere
 * helpful in the file.</p>
 * <p>This method checks for codepage properties, and failing that
 * looks at the headers for the message, and uses these to
 * guess the correct encoding for your file. If the headers do not
 * help either, the HTML body is searched for a Content-Type META
 * tag carrying a charset.</p>
 * <p>A charset of {@code utf-8} found in the Content-Type header ends
 * the search without changing the 7-bit encoding.</p>
 * <p>Bug #49441 has more on why this is needed</p>
 */
public void guess7BitEncoding() {
    // First choice is a codepage property
    for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) {
        List<PropertyValue> val = mainChunks.getProperties().get(prop);
        if (val != null && val.size() > 0) {
            int codepage = ((LongPropertyValue) val.get(0)).getValue();
            try {
                String encoding = CodePageUtil.codepageToEncoding(codepage, true);
                set7BitEncoding(encoding);
                return;
            } catch (UnsupportedEncodingException e) {
                // Unusable codepage: warn, then try the next property / strategy
                logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ", prop, ", ignoring");
            }
        }
    }

    // Second choice is a charset on a content type header
    try {
        String[] headers = getHeaders();
        if (headers != null && headers.length > 0) {
            // Look for a content type with a charset
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
            final String headerName = "Content-Type";
            for (String header : headers) {
                // Header names are matched ignoring case, in line with the
                // CASE_INSENSITIVE pattern; a case-sensitive prefix test would
                // discard e.g. "content-type:" before the pattern is tried.
                if (header.regionMatches(true, 0, headerName, 0, headerName.length())) {
                    Matcher m = p.matcher(header);
                    if (m.matches()) {
                        // Found it! Tell all the string chunks
                        String charset = m.group(1);
                        if (!charset.equalsIgnoreCase("utf-8")) {
                            set7BitEncoding(charset);
                        }
                        return;
                    }
                }
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // Deliberately ignored: the headers could not be read, so there is
        // nothing to inspect here. Fall through to the HTML body.
    }

    // Nothing suitable in the headers, try HTML
    try {
        String html = getHtmlBody();
        if (html != null && html.length() > 0) {
            // Look for a content type in the meta headers
            Pattern p = Pattern.compile("<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\"");
            Matcher m = p.matcher(html);
            if (m.find()) {
                // Found it! Tell all the string chunks
                String charset = m.group(1);
                set7BitEncoding(charset);
                return;
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // Deliberately ignored: the HTML body could not be read, so no guess
        // can be made from it. The encoding is left unchanged.
    }
}
use of org.apache.poi.hsmf.datatypes.MAPIProperty in project poi by apache.
the class HSMFDump method dump.
/**
 * Parses the chunk groups out of {@code fs} and writes a textual dump
 * of them to {@code out}.
 * <p>Each group is introduced by its simple class name and followed by a
 * blank line. A {@link PropertiesChunk} is expanded into its properties and
 * their values; any other chunk is printed as its property id and name
 * (or the raw chunk id with "(unknown)" when the id maps to
 * {@code MAPIProperty.UNKNOWN}), its type name, and the chunk itself.</p>
 *
 * @param out the stream that receives the dump
 * @throws IOException declared by the signature; presumably raised by parsing
 */
public void dump(PrintStream out) throws IOException {
    for (ChunkGroup group : POIFSChunkParser.parse(fs)) {
        out.println(group.getClass().getSimpleName());

        for (Chunk chunk : group.getChunks()) {
            MAPIProperty known = MAPIProperty.get(chunk.getChunkId());

            if (!(chunk instanceof PropertiesChunk)) {
                // Plain chunk: label it by property when recognised, else by raw id
                String label = (known == MAPIProperty.UNKNOWN)
                        ? chunk.getChunkId() + " - (unknown)"
                        : known.id + " - " + known.name;
                out.println(" " + label + " - " + chunk.getType().getName());
                out.println(" " + chunk);
                continue;
            }

            // Properties chunk: list every property along with all of its values
            PropertiesChunk propertiesChunk = (PropertiesChunk) chunk;
            out.println(" Properties - " + propertiesChunk.getProperties().size() + ":");
            for (MAPIProperty key : propertiesChunk.getProperties().keySet()) {
                out.println(" * " + key);
                for (PropertyValue value : propertiesChunk.getValues(key)) {
                    out.println(" = " + value);
                }
            }
        }

        out.println();
    }
}
use of org.apache.poi.hsmf.datatypes.MAPIProperty in project tika by apache.
the class OutlookExtractor method handleFromTo.
/**
 * Copies sender and recipient information from the message into
 * {@code metadata}.
 * <p>The display From/To/CC/BCC strings are set first, then the
 * "sent by server type" value when that chunk is present, then the first
 * chunk of each sender / sent-representing name and e-mail property, and
 * finally the name, display name and e-mail address of every recipient
 * returned by {@code buildRecipients()}, filed under the keys matching
 * its recipient type. Recipients of any other type are skipped.</p>
 *
 * @param headers  not read by this method
 * @param metadata the metadata object to populate
 * @throws ChunkNotFoundException declared by the signature; presumably raised
 *         by the message accessors when a required chunk is missing
 */
private void handleFromTo(Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException {
    String from = msg.getDisplayFrom();
    metadata.set(TikaCoreProperties.CREATOR, from);
    metadata.set(Metadata.MESSAGE_FROM, from);
    metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
    metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
    metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());

    Chunks chunks = msg.getMainChunks();
    StringChunk sentByServerType = chunks.getSentByServerType();
    if (sentByServerType != null) {
        metadata.set(Office.MAPI_SENT_BY_SERVER_TYPE, sentByServerType.getValue());
    }

    // Reuse the chunks fetched above rather than asking the message again
    Map<MAPIProperty, List<Chunk>> mainChunks = chunks.getAll();

    //sometimes in SMTP .msg files there is an email in the sender name field.
    setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), Office.MAPI_FROM_REPRESENTING_NAME, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL, metadata);
    setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), Office.MAPI_FROM_REPRESENTING_EMAIL, metadata);

    for (Recipient recipient : buildRecipients()) {
        switch (recipient.recipientType) {
            case TO:
                addEvenIfNull(Message.MESSAGE_TO_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_TO_EMAIL, recipient.emailAddress, metadata);
                break;
            case CC:
                addEvenIfNull(Message.MESSAGE_CC_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_CC_EMAIL, recipient.emailAddress, metadata);
                break;
            case BCC:
                addEvenIfNull(Message.MESSAGE_BCC_NAME, recipient.name, metadata);
                addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, recipient.displayName, metadata);
                addEvenIfNull(Message.MESSAGE_BCC_EMAIL, recipient.emailAddress, metadata);
                break;
            default:
                //log unknown or undefined?
                break;
        }
    }
}
use of org.apache.poi.hsmf.datatypes.MAPIProperty in project tika by apache.
the class OutlookExtractor method guess7BitEncoding.
/**
 * Tries to identify the correct encoding for 7-bit (non-unicode)
 * strings in the file.
 * <p>Many messages store their strings as unicode, which is
 * nice and easy. Some use one-byte encodings for their
 * strings, but don't always store the encoding anywhere
 * helpful in the file.</p>
 * <p>The candidates are tried in this order, stopping at the first one
 * that {@code tryToSet7BitEncoding} accepts: the codepage properties,
 * a charset on a Content-Type header, the result of running the
 * encoding detector over the HTML body, and finally a charset
 * detector run over the raw bytes of the text body (only when its
 * confidence is above 35).</p>
 * <p>Bug #49441 has more on why this is needed</p>
 * <p>This is adapted from POI (TIKA-1238)
 * as a temporary workaround to prevent unsupported encoding exceptions</p>
 *
 * @param msg the message whose 7-bit string encoding should be guessed
 */
private void guess7BitEncoding(MAPIMessage msg) {
    Chunks mainChunks = msg.getMainChunks();
    //sanity check
    if (mainChunks == null) {
        return;
    }

    Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
    if (props != null) {
        // First choice is a codepage property
        for (MAPIProperty prop : new MAPIProperty[] { MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID }) {
            List<PropertyValue> val = props.get(prop);
            if (val != null && val.size() > 0) {
                int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
                String encoding = null;
                try {
                    encoding = CodePageUtil.codepageToEncoding(codepage, true);
                } catch (UnsupportedEncodingException ignored) {
                    // Unknown codepage: encoding stays null and is offered as such below
                }
                if (tryToSet7BitEncoding(msg, encoding)) {
                    return;
                }
            }
        }
    }

    // Second choice is a charset on a content type header
    try {
        String[] headers = msg.getHeaders();
        if (headers != null && headers.length > 0) {
            // Look for a content type with a charset
            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
            final String headerName = "Content-Type";
            for (String header : headers) {
                // Header names are matched ignoring case, in line with the
                // CASE_INSENSITIVE pattern; a case-sensitive prefix test would
                // discard e.g. "content-type:" before the pattern is tried.
                if (header.regionMatches(true, 0, headerName, 0, headerName.length())) {
                    Matcher m = p.matcher(header);
                    if (m.matches()) {
                        // Found it! Tell all the string chunks
                        String charset = m.group(1);
                        if (tryToSet7BitEncoding(msg, charset)) {
                            return;
                        }
                    }
                }
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // Deliberately ignored: the headers could not be read, move on to the HTML body
    }

    // Third choice: run the encoding detector over the HTML body
    try {
        String html = msg.getHtmlBody();
        if (html != null && html.length() > 0) {
            Charset charset = null;
            try {
                charset = detector.detect(new ByteArrayInputStream(html.getBytes(UTF_8)), EMPTY_METADATA);
            } catch (IOException ignored) {
                // Detection failed: charset stays null and this step is skipped
            }
            if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
                return;
            }
        }
    } catch (ChunkNotFoundException ignored) {
        // Deliberately ignored: the HTML body could not be read, move on to the text body
    }

    //absolute last resort, try charset detector
    StringChunk text = mainChunks.getTextBodyChunk();
    if (text != null) {
        // Named differently from the "detector" used for the HTML body above,
        // so the two detectors cannot be confused
        CharsetDetector textDetector = new CharsetDetector();
        textDetector.setText(text.getRawValue());
        CharsetMatch match = textDetector.detect();
        if (match != null && match.getConfidence() > 35) {
            tryToSet7BitEncoding(msg, match.getName());
        }
    }
}
Aggregations