Use of org.jabref.logic.xmp.EncryptedPdfsNotSupportedException in the project jabref by JabRef.
Shown below: the method importDatabase of the class PdfContentImporter.
/**
 * Builds a single {@link BibEntry} from the text of the first page of a PDF file.
 * <p>
 * If a DOI is found in the first-page text, the entry is fetched via {@link DoiFetcher} and
 * returned directly. Otherwise the page text is split into lines and scanned heuristically:
 * an optional leading conference/copyright line, then title, authors, abstract and keywords,
 * and finally the blocks at the end of the page (Springer "(Eds.)" line, DOI, IEEE footer).
 * <p>
 * The scan uses the instance fields {@code lines}, {@code i}, {@code curString} and
 * {@code year}, which are shared with the helper methods called from here.
 * NOTE(review): because of that shared state this method does not look safe for concurrent
 * calls on the same instance - confirm against the callers.
 *
 * @param filePath        path of the PDF file to read
 * @param defaultEncoding not used by this method
 * @return the parser result holding the extracted entry; an empty result if no line is left
 *         after skipping to the first non-empty line; or an error result if the PDF is
 *         encrypted, cannot be read, or the DOI lookup fails
 */
@Override
public ParserResult importDatabase(Path filePath, Charset defaultEncoding) {
    final ArrayList<BibEntry> result = new ArrayList<>(1);
    try (FileInputStream fileStream = new FileInputStream(filePath.toFile());
         PDDocument document = XMPUtil.loadWithAutomaticDecryption(fileStream)) {
        String firstPageContents = getFirstPageContents(document);

        Optional<DOI> doi = DOI.findInText(firstPageContents);
        if (doi.isPresent()) {
            ParserResult parserResult = new ParserResult(result);
            Optional<BibEntry> entry = new DoiFetcher(importFormatPreferences).performSearchById(doi.get().getDOI());
            entry.ifPresent(parserResult.getDatabase()::insertEntry);
            return parserResult;
        }

        // idea: split[] contains the different lines
        // blocks are separated by empty lines
        // treat each block
        // or do special treatment at authors (which are not broken)
        // therefore, we do a line-based and not a block-based splitting
        // i points to the current line
        // curString (mostly) contains the current block
        // the different lines are joined into one and thereby separated by " "
        lines = firstPageContents.split(System.lineSeparator());

        // Reset the parsing state kept in instance fields: the last-block loop at the end of
        // this method leaves i below 0 and year is never cleared, so a reused importer would
        // otherwise start from a stale cursor and could report the previous file's year.
        i = 0;
        year = null;

        proceedToNextNonEmptyLine();
        if (i >= lines.length) {
            // return empty list
            return new ParserResult();
        }

        // we start at the current line
        curString = lines[i];
        // i might get incremented later and curString modified, too
        i = i + 1;

        String author;
        String editor = null;
        String abstractT = null;
        String keywords = null;
        String title;
        String conference = null;
        // named extractedDoi so that it does not shadow the DOI class used above
        String extractedDoi = null;
        String series = null;
        String volume = null;
        String number = null;
        String pages = null;
        // year is a class variable as the method extractYear() uses it;
        String publisher = null;

        EntryType type = BibtexEntryTypes.INPROCEEDINGS;
        if (curString.length() > 4) {
            // special case: possibly conference as first line on the page
            extractYear();
            if (curString.contains("Conference")) {
                fillCurStringWithNonEmptyLines();
                conference = curString;
                curString = "";
            } else {
                // e.g. Copyright (c) 1998 by the Genetics Society of America
                // future work: get year using RegEx
                String lower = curString.toLowerCase(Locale.ROOT);
                if (lower.contains("copyright")) {
                    fillCurStringWithNonEmptyLines();
                    publisher = curString;
                    curString = "";
                }
            }
        }

        // start: title
        fillCurStringWithNonEmptyLines();
        title = streamlineTitle(curString);
        curString = "";
        // i points to the next non-empty line

        // after title: authors
        author = null;
        while ((i < lines.length) && !"".equals(lines[i])) {
            // author names are unlikely to be lines among different lines
            // treat them line by line
            curString = streamlineNames(lines[i]);
            if (author == null) {
                author = curString;
            } else if (!"".equals(curString)) {
                // if lines[i] is "and" then "" is returned by streamlineNames -> nothing to append
                author = author.concat(" and ").concat(curString);
            }
            i++;
        }
        curString = "";
        i++;

        // then, abstract and keywords follow
        while (i < lines.length) {
            curString = lines[i];
            if ((curString.length() >= "Abstract".length()) && "Abstract".equalsIgnoreCase(curString.substring(0, "Abstract".length()))) {
                if (curString.length() == "Abstract".length()) {
                    // only word "abstract" found -- skip line
                    curString = "";
                } else {
                    // skip the heading and the one separator character following it
                    curString = curString.substring("Abstract".length() + 1).trim().concat(System.lineSeparator());
                }
                i++;
                // whereas we need linebreak as separator
                while ((i < lines.length) && !"".equals(lines[i])) {
                    curString = curString.concat(lines[i]).concat(System.lineSeparator());
                    i++;
                }
                abstractT = curString.trim();
                i++;
            } else if ((curString.length() >= "Keywords".length()) && "Keywords".equalsIgnoreCase(curString.substring(0, "Keywords".length()))) {
                if (curString.length() == "Keywords".length()) {
                    // only word "Keywords" found -- skip line
                    curString = "";
                } else {
                    curString = curString.substring("Keywords".length() + 1).trim();
                }
                i++;
                fillCurStringWithNonEmptyLines();
                keywords = removeNonLettersAtEnd(curString);
            } else {
                String lower = curString.toLowerCase(Locale.ROOT);
                int pos = lower.indexOf("technical");
                if (pos >= 0) {
                    type = BibtexEntryTypes.TECHREPORT;
                    // Search and cut on the same (trimmed) string; the index of the last space
                    // in the trimmed text does not fit the untrimmed text if that one has
                    // leading whitespace.
                    String trimmed = curString.trim();
                    pos = trimmed.lastIndexOf(' ');
                    if (pos >= 0) {
                        number = trimmed.substring(pos + 1);
                    }
                }
                i++;
                proceedToNextNonEmptyLine();
            }
        }

        // scan the page backwards, block by block
        i = lines.length - 1;
        while (i >= 0) {
            readLastBlock();
            // i now points to the block before or is -1
            // curString contains the last block, separated by " "
            extractYear();

            int pos = curString.indexOf("(Eds.)");
            if ((pos >= 0) && (publisher == null)) {
                // looks like a Springer last line
                // e.g: A. Persson and J. Stirna (Eds.): PoEM 2009, LNBIP 39, pp. 161-175, 2009.
                publisher = "Springer";
                if (pos > 0) {
                    // pos - 1 drops the space in front of "(Eds.)"; nothing to read if the
                    // block starts with "(Eds.)"
                    editor = streamlineNames(curString.substring(0, pos - 1));
                }
                // +2 because of ":" after (Eds.) and the subsequent space;
                // clamped so that a block ending right after "(Eds.)" does not overrun
                int restStart = Math.min(pos + "(Eds.)".length() + 2, curString.length());
                curString = curString.substring(restStart);
                String[] springerSplit = curString.split(", ");
                if (springerSplit.length >= 4) {
                    conference = springerSplit[0];

                    String seriesData = springerSplit[1];
                    int lastSpace = seriesData.lastIndexOf(' ');
                    if (lastSpace >= 0) {
                        series = seriesData.substring(0, lastSpace);
                        volume = seriesData.substring(lastSpace + 1);
                    } else {
                        // no volume part present: keep the whole token as series
                        series = seriesData;
                    }

                    // strip the leading "pp. "; tokens too short to carry it are skipped
                    if (springerSplit[2].length() >= 4) {
                        pages = springerSplit[2].substring(4);
                    }

                    if (springerSplit[3].length() >= 4) {
                        year = springerSplit[3].substring(0, 4);
                    }
                }
            } else {
                if (extractedDoi == null) {
                    pos = curString.indexOf("DOI");
                    if (pos < 0) {
                        pos = curString.indexOf(FieldName.DOI);
                    }
                    if (pos >= 0) {
                        pos += 3;
                        // the marker may be the very last token of the block
                        if (pos < curString.length()) {
                            char delimiter = curString.charAt(pos);
                            if ((delimiter == ':') || (delimiter == ' ')) {
                                pos++;
                            }
                        }
                        int nextSpace = curString.indexOf(' ', pos);
                        if (nextSpace > 0) {
                            extractedDoi = curString.substring(pos, nextSpace);
                        } else {
                            extractedDoi = curString.substring(pos);
                        }
                    }
                }

                if ((publisher == null) && curString.contains("IEEE")) {
                    // IEEE has the conference things at the end
                    publisher = "IEEE";

                    if (conference == null) {
                        pos = curString.indexOf('$');
                        if (pos > 0) {
                            // we found the price
                            // before the price, the ISSN is stated
                            // skip that
                            pos -= 2;
                            while ((pos >= 0) && (curString.charAt(pos) != ' ')) {
                                pos--;
                            }
                            if (pos > 0) {
                                conference = curString.substring(0, pos);
                            }
                        }
                    }
                }
            }
        }

        BibEntry entry = new BibEntry();
        entry.setType(type);

        if (author != null) {
            entry.setField(FieldName.AUTHOR, author);
        }
        if (editor != null) {
            entry.setField(FieldName.EDITOR, editor);
        }
        if (abstractT != null) {
            entry.setField(FieldName.ABSTRACT, abstractT);
        }
        if (!Strings.isNullOrEmpty(keywords)) {
            entry.setField(FieldName.KEYWORDS, keywords);
        }
        if (title != null) {
            entry.setField(FieldName.TITLE, title);
        }
        if (conference != null) {
            entry.setField(FieldName.BOOKTITLE, conference);
        }
        if (extractedDoi != null) {
            entry.setField(FieldName.DOI, extractedDoi);
        }
        if (series != null) {
            entry.setField(FieldName.SERIES, series);
        }
        if (volume != null) {
            entry.setField(FieldName.VOLUME, volume);
        }
        if (number != null) {
            entry.setField(FieldName.NUMBER, number);
        }
        if (pages != null) {
            entry.setField(FieldName.PAGES, pages);
        }
        if (year != null) {
            entry.setField(FieldName.YEAR, year);
        }
        if (publisher != null) {
            entry.setField(FieldName.PUBLISHER, publisher);
        }

        result.add(entry);
    } catch (EncryptedPdfsNotSupportedException e) {
        return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
    } catch (IOException exception) {
        return ParserResult.fromError(exception);
    } catch (FetcherException e) {
        return ParserResult.fromErrorMessage(e.getMessage());
    }
    return new ParserResult(result);
}
Aggregations