Use of com.joliciel.jochre.utils.pdf.PdfImageVisitor in project jochre by urieli.
From the class JochreIndexDirectory, method getMetaData.
/**
 * Returns the metadata for this document, loading it on the first call and
 * returning the cached map on every later call.
 * <p>
 * When {@code getMetaDataFile()} returns {@code null}, the metadata is taken
 * from the fields of the PDF file itself; otherwise it is read from the XML
 * metadata file.
 *
 * @return a map from index field to its (non-empty) value
 * @throws RuntimeException
 *           wrapping any I/O, XML parsing or XPath failure raised while
 *           reading the XML metadata file
 */
public Map<JochreIndexField, String> getMetaData() {
  if (this.metaData == null) {
    // Assign the field only once the map is completely built, so that a
    // failure part-way through never leaves a half-filled map cached.
    this.metaData = this.getMetaDataFile() == null ? this.readPdfMetaData() : this.readXmlMetaData();
  }
  return metaData;
}

/**
 * Builds the metadata map from the fields embedded in the PDF file.
 */
private Map<JochreIndexField, String> readPdfMetaData() {
  PdfImageVisitor pdfMetadataReader = new PdfImageVisitor(this.getPdfFile());
  Map<String, String> pdfMetaData;
  try {
    pdfMetaData = pdfMetadataReader.getFields();
  } finally {
    // Release the PDF even if reading the fields fails.
    pdfMetadataReader.close();
  }

  Map<JochreIndexField, String> fields = new HashMap<>();
  // TODO: hack for Yiddish - need to generalize this through
  // config settings
  String bookUrl = pdfMetaData.get("Keywords");
  String title = pdfMetaData.get("Title");
  String author = pdfMetaData.get("Author");
  if (bookUrl != null && bookUrl.length() > 0) {
    fields.put(JochreIndexField.url, bookUrl);
    // The id is whatever follows the last '/' of the URL.
    String id = bookUrl.substring(bookUrl.lastIndexOf('/') + 1);
    fields.put(JochreIndexField.id, id);
  }
  if (title != null && title.length() > 0) {
    fields.put(JochreIndexField.titleEnglish, DiacriticRemover.apply(title));
  }
  putIfNotEmpty(fields, JochreIndexField.authorEnglish, author);
  return fields;
}

/**
 * Builds the metadata map from the XML metadata file.
 */
private Map<JochreIndexField, String> readXmlMetaData() {
  try {
    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
    DocumentBuilder db = dbf.newDocumentBuilder();
    Document dom = db.parse(this.getMetaDataFile());
    Element docElement = dom.getDocumentElement();
    XPathFactory xpf = XPathFactory.newInstance();
    XPath xp = xpf.newXPath();
    String id = xp.evaluate("/metadata/identifier/text()", docElement);
    String bookUrl = xp.evaluate("/metadata/identifier-access/text()", docElement);
    String title = xp.evaluate("/metadata/title/text()", docElement);
    String author = xp.evaluate("/metadata/creator/text()", docElement);
    String publisher = xp.evaluate("/metadata/publisher/text()", docElement);
    String date = xp.evaluate("/metadata/date/text()", docElement);
    String authorLang = xp.evaluate("/metadata/creator-alt-script/text()", docElement);
    String titleLang = xp.evaluate("/metadata/title-alt-script/text()", docElement);
    String volume = xp.evaluate("/metadata/volume/text()", docElement);
    LOG.debug("id: " + id);
    LOG.debug("bookUrl: " + bookUrl);
    LOG.debug("title: " + title);
    LOG.debug("author: " + author);
    LOG.debug("publisher: " + publisher);
    LOG.debug("date: " + date);
    LOG.debug("authorLang: " + authorLang);
    LOG.debug("titleLang: " + titleLang);
    LOG.debug("volume: " + volume);

    Map<JochreIndexField, String> fields = new HashMap<>();
    putIfNotEmpty(fields, JochreIndexField.id, id);
    putIfNotEmpty(fields, JochreIndexField.url, bookUrl);
    putIfNotEmpty(fields, JochreIndexField.titleEnglish, title);
    putIfNotEmpty(fields, JochreIndexField.authorEnglish, author);
    putIfNotEmpty(fields, JochreIndexField.publisher, publisher);
    putIfNotEmpty(fields, JochreIndexField.date, date);
    putIfNotEmpty(fields, JochreIndexField.author, authorLang);
    putIfNotEmpty(fields, JochreIndexField.title, titleLang);
    putIfNotEmpty(fields, JochreIndexField.volume, volume);
    return fields;
  } catch (IOException | XPathExpressionException | ParserConfigurationException | SAXException e) {
    LOG.error("Failed to read metadata from " + this.getMetaDataFile().getAbsolutePath(), e);
    throw new RuntimeException(e);
  }
}

/**
 * Stores {@code value} under {@code field} unless it is null or empty.
 */
private static void putIfNotEmpty(Map<JochreIndexField, String> target, JochreIndexField field, String value) {
  if (value != null && value.length() > 0) {
    target.put(field, value);
  }
}
Use of com.joliciel.jochre.utils.pdf.PdfImageVisitor in project jochre by urieli.
From the class PdfImageReader, method readImage.
/**
 * Reads the image found on a single page of the PDF file.
 * <p>
 * A visitor restricted to the requested page is run with an internal
 * observer attached, and whatever image that observer holds afterwards is
 * returned.
 *
 * @param pageNumber
 *          the page whose image is wanted
 * @return the image reported by the internal observer
 */
public BufferedImage readImage(int pageNumber) {
  // Each PDF page is assumed to carry exactly one image.
  Set<Integer> requestedPages = new HashSet<>();
  requestedPages.add(pageNumber);

  PdfImageVisitor visitor = new PdfImageVisitor(this.pdfFile, requestedPages);
  PdfImageReaderInternal observer = new PdfImageReaderInternal();
  visitor.addImageObserver(observer);
  visitor.visitImages();

  return observer.getImage();
}
Use of com.joliciel.jochre.utils.pdf.PdfImageVisitor in project jochre by urieli.
From the class YiddishMetaFetcher, method fetchMetaData.
/**
 * Downloads the archive.org metadata for a PDF and writes a simplified XML
 * version of it to the supplied writer.
 * <p>
 * The PDF's "Keywords" field is read as the book URL. If it is missing or
 * does not start with "http", the method returns without writing anything.
 * Otherwise the text after the last '/' of that URL is used as the
 * archive.org reference, the matching {@code _meta.xml} file is downloaded,
 * and selected fields are written out as a {@code <metadata>} document.
 * Diacritics are removed from the title, creator and publisher.
 *
 * @param pdfFile
 *          the PDF whose fields identify the book
 * @param writer
 *          receives the generated XML; it is flushed but not closed
 * @throws Exception
 *           if the PDF cannot be read, the download fails, or the downloaded
 *           XML cannot be parsed or queried
 */
public void fetchMetaData(File pdfFile, Writer writer) throws Exception {
  PdfImageVisitor pdfMetadataReader = new PdfImageVisitor(pdfFile);
  Map<String, String> metadata;
  try {
    metadata = pdfMetadataReader.getFields();
  } finally {
    // Release the PDF even if reading the fields fails.
    pdfMetadataReader.close();
  }

  String url = metadata.get("Keywords");
  if (url == null || !url.startsWith("http")) {
    return;
  }

  String reference = url.substring(url.lastIndexOf('/') + 1);
  URL metaUrl = new URL("https://archive.org/download/" + reference + "/" + reference + "_meta.xml");

  // NOTE(review): this parses XML fetched over the network with a default
  // parser configuration; consider disabling DTDs / external entities.
  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
  DocumentBuilder db = dbf.newDocumentBuilder();
  Document dom;
  // Close the network stream whether or not parsing succeeds.
  try (java.io.InputStream metaStream = metaUrl.openStream()) {
    dom = db.parse(metaStream);
  }

  Element docElement = dom.getDocumentElement();
  XPathFactory xpf = XPathFactory.newInstance();
  XPath xp = xpf.newXPath();
  String id = xp.evaluate("/metadata/identifier/text()", docElement);
  String bookUrl = xp.evaluate("/metadata/identifier-access/text()", docElement);
  String title = xp.evaluate("/metadata/title/text()", docElement);
  String author = xp.evaluate("/metadata/creator/text()", docElement);
  String publisher = xp.evaluate("/metadata/publisher/text()", docElement);
  String date = xp.evaluate("/metadata/date/text()", docElement);
  String authorYid = xp.evaluate("/metadata/creator-alt-script/text()", docElement);
  String titleYid = xp.evaluate("/metadata/title-alt-script/text()", docElement);
  String pageCount = xp.evaluate("/metadata/imagecount/text()", docElement);
  String volume = xp.evaluate("/metadata/volume/text()", docElement);

  title = DiacriticRemover.apply(title);
  author = DiacriticRemover.apply(author);
  publisher = DiacriticRemover.apply(publisher);

  LOG.debug("id: " + id);
  LOG.debug("bookUrl: " + bookUrl);
  LOG.debug("title: " + title);
  LOG.debug("volume: " + volume);
  LOG.debug("author: " + author);
  LOG.debug("publisher: " + publisher);
  LOG.debug("date: " + date);
  LOG.debug("authorYid: " + authorYid);
  LOG.debug("titleYid: " + titleYid);
  LOG.debug("pageCount: " + pageCount);

  writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
  writer.write("<metadata>\n");
  writer.write(" <identifier>" + StringEscapeUtils.escapeXml10(id) + "</identifier>\n");
  writer.write(" <identifier-access>" + StringEscapeUtils.escapeXml10(bookUrl) + "</identifier-access>\n");
  writer.write(" <title>" + StringEscapeUtils.escapeXml10(title) + "</title>\n");
  writer.write(" <volume>" + StringEscapeUtils.escapeXml10(volume) + "</volume>\n");
  writer.write(" <creator>" + StringEscapeUtils.escapeXml10(author) + "</creator>\n");
  writer.write(" <publisher>" + StringEscapeUtils.escapeXml10(publisher) + "</publisher>\n");
  writer.write(" <date>" + StringEscapeUtils.escapeXml10(date) + "</date>\n");
  writer.write(" <creator-alt-script>" + StringEscapeUtils.escapeXml10(authorYid) + "</creator-alt-script>\n");
  writer.write(" <title-alt-script>" + StringEscapeUtils.escapeXml10(titleYid) + "</title-alt-script>\n");
  writer.write(" <imagecount>" + StringEscapeUtils.escapeXml10(pageCount) + "</imagecount>\n");
  writer.write("</metadata>\n");
  writer.flush();
}
Use of com.joliciel.jochre.utils.pdf.PdfImageVisitor in project jochre by urieli.
From the class Jochre, method doCommandExtractImages.
/**
 * Extracts the images from a PDF file.
 * <p>
 * Image observers for the {@code ImageExtractor} output format are obtained
 * for the file's base name and the output directory, registered on a PDF
 * visitor, and the visitor is then run.
 *
 * @param filename
 *          the path to the PDF file; must be non-empty and end in ".pdf"
 *          (case-insensitive)
 * @param outputDir
 *          handed to {@code getImageObservers} together with the base name
 * @param pages
 *          handed to the PDF visitor; presumably the pages to extract -
 *          confirm against PdfImageVisitor
 * @throws RuntimeException
 *           if the filename is empty or does not end in ".pdf"
 */
public void doCommandExtractImages(String filename, File outputDir, Set<Integer> pages) {
  if (filename.isEmpty()) {
    throw new RuntimeException("Missing argument: file");
  }

  boolean isPdf = filename.toLowerCase().endsWith(".pdf");
  if (!isPdf) {
    throw new RuntimeException("Unrecognised file extension");
  }

  File pdfFile = new File(filename);
  String baseName = this.getBaseName(pdfFile);
  List<PdfImageObserver> observers = this.getImageObservers(Arrays.asList(OutputFormat.ImageExtractor), baseName,
      outputDir);

  PdfImageVisitor visitor = new PdfImageVisitor(pdfFile, pages);
  for (PdfImageObserver observer : observers) {
    visitor.addImageObserver(observer);
  }
  visitor.visitImages();
}
Aggregations