Use of biblemulticonverter.format.paratext.ParatextBook.ParagraphStart in project BibleMultiConverter by schierlm.
From the class USFM, method doImportBook.
/**
 * Parses one USFM/SFM file into a {@link ParatextBook}.
 *
 * <p>The file is read completely using {@code charset}; every run of control and separator
 * characters is collapsed to a single space, and the resulting text is consumed one backslash
 * marker at a time. Paragraph, chapter and table-cell markers are added directly to the book
 * content; verse markers, character formatting and footnotes/cross references are added to a
 * stack of currently open character containers.
 *
 * @param inputFile file to import; only names ending in {@code .usfm} or {@code .sfm}
 *        (case-insensitive) are processed
 * @param charset charset used to decode the file. If an {@code \ide} marker declares a
 *        different charset and {@code charset} is UTF-8, the file is re-read using the
 *        declared charset.
 * @return the parsed book, or {@code null} if the file has another extension, does not start
 *         with {@code \id }, or carries a book id that {@code ParatextID} does not know
 * @throws IOException if the file cannot be read, or if {@code \ide} declares a charset that
 *         differs from a non-UTF-8 {@code charset}
 * @throws NumberFormatException if a {@code \c} marker is not followed by an integer
 * @throws Exception declared for compatibility with existing callers
 */
private ParatextBook doImportBook(File inputFile, Charset charset) throws Exception {
    // Every auto-closing tag must also be recognised by the trailing-space lookahead below.
    KNOWN_CHARACTER_TAGS.addAll(AUTO_CLOSING_TAGS.keySet());

    // Locale.ROOT keeps case mapping independent of the default locale (e.g. Turkish dotless i).
    String fileName = inputFile.getName().toLowerCase(java.util.Locale.ROOT);
    if (!fileName.endsWith(".usfm") && !fileName.endsWith(".sfm")) {
        return null;
    }

    // The "\$EOF$" sentinel guarantees that every marker is followed by another backslash.
    String data = new String(Files.readAllBytes(inputFile.toPath()), charset).replaceAll("[\\p{Cc}\\p{Z}]+", " ").trim() + "\\$EOF$";
    if (!data.startsWith("\\id ")) {
        System.out.println("WARNING: Skipping malformed file " + inputFile);
        return null;
    }
    int startPos = data.indexOf("\\", 2);
    int finalPos = data.length() - "\\$EOF$".length();

    // "\id <BOOK> <optional free text>"
    String[] idParts = data.substring(4, startPos).trim().split(" ", 2);
    ParatextID id = ParatextID.fromIdentifier(idParts[0].toUpperCase(java.util.Locale.ROOT));
    if (id == null) {
        System.out.println("WARNING: Skipping book with unknown ID: " + idParts[0]);
        return null;
    }
    ParatextBook result = new ParatextBook(id, idParts.length == 1 ? "" : idParts[1]);

    // Open character containers, innermost last; cleared whenever a paragraph-level marker starts.
    List<ParatextCharacterContentContainer> containerStack = new ArrayList<>();
    boolean ignoreAutoClosingTags = Boolean.getBoolean("biblemulticonverter.usfm.ignoreautoclosingtags");

    while (startPos < finalPos) {
        if (data.charAt(startPos) != '\\') {
            throw new IllegalStateException("Expected '\\' at position " + startPos + " in " + inputFile);
        }

        // Cut out everything between this backslash and the next one, then split it into
        // the marker name (up to and including the first space or '*') and the text after it.
        int pos = data.indexOf('\\', startPos + 1);
        String textPart = data.substring(startPos + 1, pos);
        startPos = pos;
        pos = Math.min(textPart.length(), 1 + Math.min((textPart + " ").indexOf(' '), (textPart + "*").indexOf('*')));
        String tag = textPart.substring(0, pos).trim().toLowerCase(java.util.Locale.ROOT);
        textPart = textPart.substring(pos);

        // A trailing space is kept only when the following marker is a known character tag;
        // otherwise it is dropped.
        if (textPart.endsWith(" ")) {
            String nextTag = data.substring(startPos + 1, Math.min(data.length(), startPos + 10)) + " *\\";
            pos = Math.min(nextTag.indexOf('\\'), Math.min(nextTag.indexOf(' '), nextTag.indexOf('*')));
            if (!KNOWN_CHARACTER_TAGS.contains(nextTag.substring(0, pos))) {
                textPart = textPart.substring(0, textPart.length() - 1);
            }
        }

        // Character-level markers need a root character container to attach to.
        if (containerStack.isEmpty() && (AUTO_CLOSING_TAGS.containsKey(tag) || tag.equals("v") || FOOTNOTE_XREF_TAGS.containsKey(tag))) {
            ParatextCharacterContent container = new ParatextCharacterContent();
            result.getContent().add(container);
            containerStack.add(container);
        }

        boolean closeCharacterAttributes = false;
        if (PARAGRAPH_TAGS.containsKey(tag)) {
            result.getContent().add(new ParagraphStart(PARAGRAPH_TAGS.get(tag)));
            closeCharacterAttributes = true;
        } else if (tag.endsWith("*")) {
            // Explicit end marker: pop auto-closing formatting until the matching one is on top.
            String rawTag = tag.substring(0, tag.length() - 1);
            while (!containerStack.isEmpty() && containerStack.get(containerStack.size() - 1) instanceof AutoClosingFormatting) {
                AutoClosingFormatting acc = (AutoClosingFormatting) containerStack.get(containerStack.size() - 1);
                if (acc.getUsedTag().equals(rawTag))
                    break;
                containerStack.remove(containerStack.size() - 1);
            }
            boolean found = false;
            boolean known = true;
            if (AUTO_CLOSING_TAGS.containsKey(rawTag)) {
                if (!containerStack.isEmpty() && containerStack.get(containerStack.size() - 1) instanceof AutoClosingFormatting) {
                    AutoClosingFormatting acc = (AutoClosingFormatting) containerStack.get(containerStack.size() - 1);
                    found = acc.getUsedTag().equals(rawTag);
                }
            } else if (FOOTNOTE_XREF_TAGS.containsKey(rawTag)) {
                if (!containerStack.isEmpty() && containerStack.get(containerStack.size() - 1) instanceof FootnoteXref) {
                    FootnoteXref fx = (FootnoteXref) containerStack.get(containerStack.size() - 1);
                    found = fx.getKind().getTag().equals(rawTag);
                }
            } else {
                known = false;
                System.out.println("WARNING: Skipping unknown end tag \\" + tag);
            }
            if (found) {
                containerStack.remove(containerStack.size() - 1);
            } else if (known) {
                // Unknown end tags were already reported above; do not warn twice.
                System.out.println("WARNING: Skipping mismatched end tag \\" + tag);
            }
        } else if (AUTO_CLOSING_TAGS.containsKey(tag)) {
            // A non-nested ('+'-less) character marker implicitly closes open auto-closing
            // formatting, unless that has been disabled via the system property.
            if (!tag.startsWith("+") && !ignoreAutoClosingTags) {
                while (!containerStack.isEmpty() && containerStack.get(containerStack.size() - 1) instanceof AutoClosingFormatting) {
                    containerStack.remove(containerStack.size() - 1);
                }
            }
            AutoClosingFormatting nextContainer = new AutoClosingFormatting(AUTO_CLOSING_TAGS.get(tag), tag.startsWith("+"));
            containerStack.get(containerStack.size() - 1).getContent().add(nextContainer);
            containerStack.add(nextContainer);

            // Attributes ("text|value" or "text|key=\"value\" ...") are only parsed when the
            // kind supports them and the marker is directly followed by its own end marker.
            if (nextContainer.getKind().getDefaultAttributes() != null && data.startsWith("\\" + tag + "*", startPos) && textPart.contains("|")) {
                String[] defaultAttributes = nextContainer.getKind().getDefaultAttributes();
                String[] parts = textPart.split("\\|");
                for (int i = 1; i < parts.length; i++) {
                    if (parts[i].contains("=")) {
                        String attList = parts[i];
                        while (attList.contains("=")) {
                            pos = attList.indexOf('=');
                            String key = attList.substring(0, pos).trim();
                            attList = attList.substring(pos + 1).trim();
                            if (attList.startsWith("\"")) {
                                pos = attList.indexOf('"', 1);
                                if (pos == -1) {
                                    // Missing closing quote: take the remainder as the value
                                    // instead of failing with StringIndexOutOfBoundsException.
                                    System.out.println("WARNING: Unterminated attribute value for " + key + " in \\" + tag);
                                    nextContainer.getAttributes().put(key, attList.substring(1));
                                    attList = "";
                                } else {
                                    nextContainer.getAttributes().put(key, attList.substring(1, pos));
                                    attList = attList.substring(pos + 1).trim();
                                }
                            } else {
                                nextContainer.getAttributes().put(key, attList);
                                attList = "";
                            }
                        }
                    } else if (i - 1 < defaultAttributes.length) {
                        // Unnamed value: assign it to the default attribute at the same position.
                        nextContainer.getAttributes().put(defaultAttributes[i - 1], parts[i]);
                    }
                }
                textPart = parts[0];
                if (textPart.endsWith(" ")) {
                    textPart = textPart.substring(0, textPart.length() - 1);
                }
            }
        } else if (tag.equals("v")) {
            // "\v <number> <text>"
            String[] parts = textPart.split(" ", 2);
            containerStack.get(containerStack.size() - 1).getContent().add(new VerseStart(parts[0]));
            textPart = parts.length == 1 ? "" : parts[1];
        } else if (tag.equals("c")) {
            // "\c <number> <text>"
            String[] parts = textPart.split(" ", 2);
            result.getContent().add(new ChapterStart(Integer.parseInt(parts[0])));
            closeCharacterAttributes = true;
            textPart = parts.length == 1 ? "" : parts[1];
        } else if (tag.matches("t[hc]r?[0-9]+")) {
            result.getContent().add(new TableCellStart(tag));
            closeCharacterAttributes = true;
        } else if (FOOTNOTE_XREF_TAGS.containsKey(tag)) {
            // "\f <caller> <text>": the first word is passed to FootnoteXref, the rest is content.
            String[] parts = textPart.split(" ", 2);
            FootnoteXref nextContainer = new FootnoteXref(FOOTNOTE_XREF_TAGS.get(tag), parts[0]);
            containerStack.get(containerStack.size() - 1).getContent().add(nextContainer);
            containerStack.add(nextContainer);
            textPart = parts.length == 1 ? "" : parts[1];
        } else if (tag.equals("id")) {
            System.out.println("WARNING: Skipping duplicate \\id tag");
            textPart = "";
        } else if (tag.equals("ide")) {
            // Encoding declaration: either "<codepage> - <description>" or a charset name.
            Charset correctCharset;
            try {
                if (textPart.matches("[0-9]+ - .*")) {
                    int codepage = Integer.parseInt(textPart.replaceAll(" - .*", ""));
                    correctCharset = codepage == 65001 ? StandardCharsets.UTF_8 : Charset.forName("windows-" + codepage);
                } else {
                    correctCharset = Charset.forName(textPart);
                }
            } catch (UnsupportedCharsetException | IllegalCharsetNameException ex) {
                System.out.println("WARNING: Unknown charset " + textPart + " specified, falling back to ISO-8859-1");
                correctCharset = StandardCharsets.ISO_8859_1;
            }
            if (!correctCharset.equals(charset)) {
                if (!charset.equals(StandardCharsets.UTF_8)) {
                    throw new IOException("Two charsets specified: " + charset + " and " + correctCharset);
                }
                // Start over, decoding the file with the charset it declares.
                return doImportBook(inputFile, correctCharset);
            }
            textPart = "";
        } else if (ATTRIBUTE_TAGS.contains(tag)) {
            result.getAttributes().put(tag, textPart);
            textPart = "";
        } else {
            System.out.println("WARNING: Skipping unknown tag \\" + tag);
        }

        // Paragraph-level markers end all open character formatting.
        if (closeCharacterAttributes) {
            containerStack.clear();
        }

        // Whatever text is left after the marker goes into the innermost open container.
        if (!textPart.isEmpty()) {
            if (containerStack.isEmpty()) {
                ParatextCharacterContent container = new ParatextCharacterContent();
                containerStack.add(container);
                result.getContent().add(container);
            }
            // " // " is replaced by a plain space and "~" by a no-break space.
            textPart = textPart.replace(" // ", " ").replace("~", "\u00A0");
            containerStack.get(containerStack.size() - 1).getContent().add(new ParatextCharacterContent.Text(textPart));
        }
    }
    return result;
}
Aggregations