use of org.apache.jempbox.xmp.XMPMetadata in project jabref by JabRef.
the class XMPUtilMain method main.
/**
 * Command-line tool for working with XMP-data.
 *
 * Reads or writes XMP-metadata from or to a PDF file.
 *
 * Usage:
 * <dl>
 * <dt>xmpUtil PDF</dt>
 * <dd>Read from PDF and print as bibtex.</dd>
 * <dt>xmpUtil BIB</dt>
 * <dd>Read all entries from BIB and print them as XMP.</dd>
 * <dt>xmpUtil -x PDF</dt>
 * <dd>Read from PDF and print raw XMP.</dd>
 * <dt>xmpUtil KEY BIB PDF</dt>
 * <dd>Write the entry in BIB given by KEY to the PDF.</dd>
 * <dt>xmpUtil BIB PDF</dt>
 * <dd>Write all entries in BIB to the PDF.</dd>
 * </dl>
 *
 * Files are recognised by their ".pdf" / ".bib" suffix; any argument
 * combination not listed above falls through to {@code usage()}.
 *
 * @param args
 *            Command line strings passed to utility.
 * @throws IOException
 *             If any of the given files could not be read or written.
 * @throws TransformerException
 *             If the given BibEntry is malformed.
 */
public static void main(String[] args) throws IOException, TransformerException {
    // Don't forget to initialize the preferences
    if (Globals.prefs == null) {
        Globals.prefs = JabRefPreferences.getInstance();
    }
    XMPPreferences xmpPreferences = Globals.prefs.getXMPPreferences();
    ImportFormatPreferences importFormatPreferences = Globals.prefs.getImportFormatPreferences();
    switch (args.length) {
        case 0:
            usage();
            break;
        case 1:
            if (args[0].endsWith(".pdf")) {
                // Read from pdf and write as BibTex
                List<BibEntry> l = XMPUtil.readXMP(new File(args[0]), xmpPreferences);
                BibEntryWriter bibtexEntryWriter = new BibEntryWriter(new LatexFieldFormatter(Globals.prefs.getLatexFieldFormatterPreferences()), false);
                for (BibEntry entry : l) {
                    StringWriter sw = new StringWriter();
                    bibtexEntryWriter.write(entry, sw, BibDatabaseMode.BIBTEX);
                    System.out.println(sw.getBuffer());
                }
            } else if (args[0].endsWith(".bib")) {
                // Read from BIB and write as XMP
                try (FileReader fr = new FileReader(args[0])) {
                    ParserResult result = new BibtexParser(importFormatPreferences).parse(fr);
                    Collection<BibEntry> entries = result.getDatabase().getEntries();
                    if (entries.isEmpty()) {
                        System.err.println("Could not find BibEntry in " + args[0]);
                    } else {
                        System.out.println(XMPUtil.toXMP(entries, result.getDatabase(), xmpPreferences));
                    }
                }
            } else {
                usage();
            }
            break;
        case 2:
            if ("-x".equals(args[0]) && args[1].endsWith(".pdf")) {
                // Read from pdf and print the raw XMP document
                Optional<XMPMetadata> meta = XMPUtil.readRawXMP(new File(args[1]));
                if (meta.isPresent()) {
                    XMLUtil.save(meta.get().getXMPDocument(), System.out, StandardCharsets.UTF_8.name());
                } else {
                    System.err.println("The given pdf does not contain any XMP-metadata.");
                }
                break;
            }
            if (args[0].endsWith(".bib") && args[1].endsWith(".pdf")) {
                // Write all entries of the BIB file to the PDF.
                // The reader is closed before the PDF is touched so the handle is never leaked.
                ParserResult allEntriesResult;
                try (FileReader fr = new FileReader(args[0])) {
                    allEntriesResult = new BibtexParser(importFormatPreferences).parse(fr);
                }
                Collection<BibEntry> entries = allEntriesResult.getDatabase().getEntries();
                if (entries.isEmpty()) {
                    System.err.println("Could not find BibEntry in " + args[0]);
                } else {
                    XMPUtil.writeXMP(new File(args[1]), entries, allEntriesResult.getDatabase(), false, xmpPreferences);
                    System.out.println("XMP written.");
                }
                break;
            }
            usage();
            break;
        case 3:
            // KEY BIB PDF: both file arguments must carry the expected suffix.
            // (The previous '&&' only rejected the call when BOTH suffixes were wrong.)
            if (!args[1].endsWith(".bib") || !args[2].endsWith(".pdf")) {
                usage();
                break;
            }
            ParserResult result;
            try (FileReader fr = new FileReader(args[1])) {
                result = new BibtexParser(importFormatPreferences).parse(fr);
            }
            Optional<BibEntry> bibEntry = result.getDatabase().getEntryByKey(args[0]);
            if (bibEntry.isPresent()) {
                XMPUtil.writeXMP(new File(args[2]), bibEntry.get(), result.getDatabase(), xmpPreferences);
                System.out.println("XMP written.");
            } else {
                // args[0] is the key, args[1] is the BIB file that was searched.
                System.err.println("Could not find BibEntry " + args[0] + " in " + args[1]);
            }
            break;
        default:
            usage();
    }
}
use of org.apache.jempbox.xmp.XMPMetadata in project tika by apache.
the class JempboxExtractor method parse.
/**
 * Pulls the raw XMP bytes out of {@code file} via {@code scanner}, parses them
 * into a DOM and hands the resulting {@link XMPMetadata} to the Dublin Core and
 * XMP-MM extractors.
 *
 * Returns without touching the metadata when {@code scanner.parse} reports
 * {@code false}. A failure while parsing the bytes as XML is deliberately
 * ignored; in that case the extractors are still called, with {@code null}.
 *
 * @param file stream to scan
 * @throws IOException declared by the signature; may come from {@code scanner.parse}
 * @throws TikaException declared by the signature; may come from obtaining the document builder
 */
public void parse(InputStream file) throws IOException, TikaException {
    ByteArrayOutputStream rawPacket = new ByteArrayOutputStream();
    boolean packetFound = scanner.parse(file, rawPacket);
    if (!packetFound) {
        return;
    }

    XMPMetadata parsedXmp = null;
    try (InputStream packetStream = new ByteArrayInputStream(rawPacket.toByteArray())) {
        Document packetDom = new ParseContext().getDocumentBuilder().parse(packetStream);
        if (packetDom != null) {
            parsedXmp = new XMPMetadata(packetDom);
        }
    } catch (IOException | SAXException e) {
        // Best effort: an unparsable packet leaves parsedXmp as null.
    }

    // NOTE(review): both extractors receive null when parsing failed - confirm they tolerate it.
    extractDublinCore(parsedXmp, metadata);
    extractXMPMM(parsedXmp, metadata);
}
use of org.apache.jempbox.xmp.XMPMetadata in project tika by apache.
the class PDFParser method extractMetadata.
/**
 * Copies document-level metadata of a PDF into {@code metadata}.
 *
 * In order, this records: the current access permissions, XMP-derived values
 * (Dublin Core schema, XMP-MM, PDF/A id), the entries of the document
 * information dictionary (both under their core property and under the
 * PDF-specific property), any remaining custom info-dictionary keys, the PDF
 * version, and the Adobe extension level if the catalog declares one.
 *
 * @param document the loaded PDF
 * @param metadata target that receives all extracted values
 * @param context parse context, passed through to {@code loadDOM}
 * @throws TikaException declared by the signature; presumably raised by the helpers called here
 */
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context) throws TikaException {
    //first extract AccessPermissions
    AccessPermission ap = document.getCurrentAccessPermission();
    metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(ap.canExtractForAccessibility()));
    metadata.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(ap.canExtractContent()));
    metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, Boolean.toString(ap.canAssembleDocument()));
    metadata.set(AccessPermissions.FILL_IN_FORM, Boolean.toString(ap.canFillInForm()));
    metadata.set(AccessPermissions.CAN_MODIFY, Boolean.toString(ap.canModify()));
    metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, Boolean.toString(ap.canModifyAnnotations()));
    metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
    metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));

    //now go for the XMP
    Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), metadata, context);
    XMPMetadata xmp = null;
    if (dom != null) {
        xmp = new XMPMetadata(dom);
    }
    XMPSchemaDublinCore dcSchema = null;
    if (xmp != null) {
        try {
            dcSchema = xmp.getDublinCoreSchema();
        } catch (IOException e) {
            // Best effort: without a Dublin Core schema the calls below are simply passed a null dcSchema.
        }
        JempboxExtractor.extractXMPMM(xmp, metadata);
    }

    // Document information dictionary; dcSchema may be null here.
    PDDocumentInformation info = document.getDocumentInformation();
    metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
    extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_TITLE, info.getTitle());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_CREATOR, info.getAuthor());
    extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
    addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, PDF.DOC_INFO_CREATOR_TOOL, info.getCreator());
    addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
    addMetadata(metadata, PDF.DOC_INFO_KEY_WORDS, info.getKeywords());
    addMetadata(metadata, "producer", info.getProducer());
    addMetadata(metadata, PDF.DOC_INFO_PRODUCER, info.getProducer());
    extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
    addMetadata(metadata, PDF.DOC_INFO_SUBJECT, info.getSubject());
    // TODO: Move to description in Tika 2.0
    addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
    addMetadata(metadata, "trapped", info.getTrapped());
    addMetadata(metadata, PDF.DOC_INFO_TRAPPED, info.getTrapped());
    // TODO Remove these in Tika 2.0
    addMetadata(metadata, "created", info.getCreationDate());
    addMetadata(metadata, PDF.DOC_INFO_CREATED, info.getCreationDate());
    addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
    Calendar modified = info.getModificationDate();
    addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
    addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
    addMetadata(metadata, PDF.DOC_INFO_MODIFICATION_DATE, info.getModificationDate());

    // All remaining metadata is custom
    // Copy this over as-is
    List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped");
    for (COSName key : info.getCOSObject().keySet()) {
        String name = key.getName();
        if (!handledMetadata.contains(name)) {
            // Each custom key is stored twice: under its bare name and under the custom prefix.
            addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
            addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name, info.getCOSObject().getDictionaryObject(key));
        }
    }

    //try to get the various versions
    //Caveats:
    // there is currently a fair amount of redundancy
    // TikaCoreProperties.FORMAT can be multivalued
    // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
    metadata.set(PDF.PDF_VERSION, Float.toString(document.getDocument().getVersion()));
    metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=" + Float.toString(document.getDocument().getVersion()));
    try {
        if (xmp != null) {
            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
            if (pdfaxmp != null) {
                if (pdfaxmp.getPart() != null) {
                    metadata.set(PDF.PDFAID_PART, Integer.toString(pdfaxmp.getPart()));
                }
                if (pdfaxmp.getConformance() != null) {
                    metadata.set(PDF.PDFAID_CONFORMANCE, pdfaxmp.getConformance());
                    // NOTE(review): getPart() is not null-checked in this branch, so a missing part yields "A-null...".
                    String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
                    metadata.set(PDF.PDFA_VERSION, version);
                    metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
                }
            }
            // TODO WARN if this XMP version is inconsistent with document header version?
        }
    } catch (IOException e) {
        metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
    }

    //TODO: Let's try to move this into PDFBox.
    //Attempt to determine Adobe extension level, if present:
    COSDictionary root = document.getDocumentCatalog().getCOSObject();
    // Type-checked instead of blindly cast: a malformed PDF may store a non-dictionary
    // object under /Extensions, which would otherwise abort extraction with a ClassCastException.
    Object extensionsObject = root.getDictionaryObject(COSName.getPDFName("Extensions"));
    if (extensionsObject instanceof COSDictionary) {
        COSDictionary extensions = (COSDictionary) extensionsObject;
        for (COSName extName : extensions.keySet()) {
            // If it's an Adobe one, interpret it to determine the extension level:
            if (extName.equals(COSName.getPDFName("ADBE"))) {
                // Same guard as above for the nested /ADBE entry.
                Object adobeExtObject = extensions.getDictionaryObject(extName);
                if (adobeExtObject instanceof COSDictionary) {
                    COSDictionary adobeExt = (COSDictionary) adobeExtObject;
                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                    //-1 is sentinel value that something went wrong in getInt
                    if (el != -1) {
                        metadata.set(PDF.PDF_EXTENSION_VERSION, baseVersion + " Adobe Extension Level " + el);
                        metadata.add(TikaCoreProperties.FORMAT.getName(), MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\"");
                    }
                }
            } else {
                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'.
                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
            }
        }
    }
}
use of org.apache.jempbox.xmp.XMPMetadata in project OpenOLAT by OpenOLAT.
the class PdfDocument method addMetadata.
/**
 * Writes title, subject and author into the document information dictionary
 * and attaches an XMP metadata stream (PDF, Basic and Dublin Core schemas)
 * carrying the same values to the document catalog.
 *
 * A single timestamp taken at call time is used for every creation,
 * modification and metadata date. Producer and creator tool are fixed to
 * "OpenOLAT".
 *
 * @param title document title
 * @param subject document subject; also used as the Dublin Core description
 * @param author document author; also stored as the info-dictionary creator
 * @throws IOException declared by the signature
 * @throws TransformerException declared by the signature
 */
public void addMetadata(String title, String subject, String author) throws IOException, TransformerException {
    PDDocumentCatalog docCatalog = document.getDocumentCatalog();
    PDDocumentInformation docInfo = document.getDocumentInformation();
    // One timestamp shared by every date field written below.
    Calendar now = Calendar.getInstance();

    // Classic document information dictionary.
    docInfo.setAuthor(author);
    docInfo.setCreator(author);
    docInfo.setCreationDate(now);
    docInfo.setModificationDate(now);
    docInfo.setTitle(title);
    docInfo.setSubject(subject);

    // XMP packet built from the same values.
    XMPMetadata xmp = new XMPMetadata();
    xmp.addPDFSchema().setProducer("OpenOLAT");

    XMPSchemaBasic basic = xmp.addBasicSchema();
    basic.setModifyDate(now);
    basic.setCreateDate(now);
    basic.setCreatorTool("OpenOLAT");
    basic.setMetadataDate(now);

    XMPSchemaDublinCore dublinCore = xmp.addDublinCoreSchema();
    dublinCore.setTitle(title);
    dublinCore.addCreator(author);
    dublinCore.setDescription(subject);

    // Attach the packet to the catalog as the document's metadata stream.
    PDMetadata xmpStream = new PDMetadata(document);
    xmpStream.importXMPMetadata(xmp);
    docCatalog.setMetadata(xmpStream);
}
Aggregations