use of org.jsoup.nodes.Element in project AozoraEpub3 by hmdev.
the class WebAozoraConverter method convertToAozoraText.
////////////////////////////////////////////////////////////////
/** Runs the conversion.
 * @param urlString URL of the index page to convert
 * @param cachePath directory used to cache downloaded pages
 * @param interval wait between page requests in milliseconds (minimum 500)
 * @param modifiedExpire cache files newer than this many hours are treated as updated
 * @param convertUpdated output only when the source has been updated
 * @param convertModifiedOnly convert only the added or updated chapters
 * @param convertModifiedTail convert only chapters contiguous with the latest one
 * @param beforeChapter convert only the last N chapters; 0 means no limit
 * @return the converted text file, or null if the conversion was skipped or canceled */
public File convertToAozoraText(String urlString, File cachePath, int interval, float modifiedExpire, boolean convertUpdated, boolean convertModifiedOnly, boolean convertModifiedTail, int beforeChapter) throws IOException {
    this.canceled = false;
    //always treated as updated when the date list cannot be retrieved
    this.updated = true;
    this.interval = Math.max(500, interval);
    this.modifiedExpire = Math.max(0, modifiedExpire);
    this.convertUpdated = convertUpdated;
    this.convertModifiedOnly = convertModifiedOnly;
    this.convertModifiedTail = convertModifiedTail;
    this.beforeChapter = beforeChapter;
    //check via redirect whether a trailing / is missing
    urlString = urlString.trim();
    if (!urlString.endsWith("/") && !urlString.endsWith(".html") && !urlString.endsWith(".htm") && urlString.indexOf("?") == -1) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(urlString + "/").openConnection();
            if (connection.getResponseCode() == 200) {
                urlString += "/";
                LogAppender.println("URL修正 : " + urlString);
            }
        } catch (Exception e) {
        } finally {
            if (connection != null)
                connection.disconnect();
        }
    }
    this.urlString = urlString;
    this.baseUri = urlString.substring(0, urlString.indexOf('/', urlString.indexOf("//") + 2));
    //String fqdn = baseUri.substring(baseUri.indexOf("//")+2);
    String listBaseUrl = urlString.substring(0, urlString.lastIndexOf('/') + 1);
    this.pageBaseUri = listBaseUrl;
    //strip the http:// prefix
    String urlFilePath = CharUtils.escapeUrlToFile(urlString.substring(urlString.indexOf("//") + 2));
    //compare using the string without the http:// prefix
    /*ExtractInfo[] extractInfos = this.queryMap.get(ExtractId.PAGE_REGEX);
    if (extractInfos != null) {
        if (!extractInfos[0].matches(urlString)) {
            LogAppender.println("読み込み可能なURLではありません");
            return null;
        }
    }*/
    String urlParentPath = urlFilePath;
    boolean isPath = false;
    if (urlFilePath.endsWith("/")) {
        isPath = true;
        urlFilePath += "index.html";
    } else
        urlParentPath = urlFilePath.substring(0, urlFilePath.lastIndexOf('/') + 1);
    //conversion output location
    this.dstPath = cachePath.getAbsolutePath() + "/";
    if (isPath)
        this.dstPath += urlParentPath;
    else
        this.dstPath += urlFilePath + "_converted/";
    File txtFile = new File(this.dstPath + "converted.txt");
    //the cover image is saved under the same name as the txt; the extension stays png since the image still displays
    File coverImageFile = new File(this.dstPath + "converted.png");
    //where the update info is stored
    File updateInfoFile = new File(this.dstPath + "update.txt");
    //if something other than a directory already exists there, delete it
    File parentFile = txtFile.getParentFile();
    if (parentFile.exists() && !parentFile.isDirectory()) {
        parentFile.delete();
    }
    parentFile.mkdirs();
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(txtFile), "UTF-8"));
    try {
        //cache the file at urlString
        File cacheFile = new File(cachePath.getAbsolutePath() + "/" + urlFilePath);
        try {
            LogAppender.append(urlString);
            cacheFile(urlString, cacheFile, null);
            LogAppender.println(" : List Loaded.");
        } catch (Exception e) {
            e.printStackTrace();
            LogAppender.println("一覧ページの取得に失敗しました。 ");
            if (!cacheFile.exists())
                return null;
            LogAppender.println("キャッシュファイルを利用します。");
        }
        //for a directory path, cache from the list.txt info, convert to Aozora txt, and join the chapters with page breaks
        Document doc = Jsoup.parse(cacheFile, null);
        //cover image
        Elements images = getExtractElements(doc, this.queryMap.get(ExtractId.COVER_IMG));
        if (images != null) {
            printImage(null, images.get(0), coverImageFile);
        }
        //title
        boolean hasTitle = false;
        String series = getExtractText(doc, this.queryMap.get(ExtractId.SERIES));
        if (series != null) {
            printText(bw, series);
            bw.append('\n');
            hasTitle = true;
        }
        String title = getExtractText(doc, this.queryMap.get(ExtractId.TITLE));
        if (title != null) {
            printText(bw, title);
            bw.append('\n');
            hasTitle = true;
        }
        if (!hasTitle) {
            LogAppender.println("SERIES/TITLE : タイトルがありません");
            return null;
        }
        //author
        String author = getExtractText(doc, this.queryMap.get(ExtractId.AUTHOR));
        if (author != null) {
            printText(bw, author);
        }
        bw.append('\n');
        //description, set off with Aozora separator-line and indent markup
        Element description = getExtractFirstElement(doc, this.queryMap.get(ExtractId.DESCRIPTION));
        if (description != null) {
            bw.append('\n');
            bw.append("[#区切り線]\n");
            bw.append('\n');
            bw.append("[#ここから2字下げ]\n");
            bw.append("[#ここから2字上げ]\n");
            printNode(bw, description, true);
            bw.append('\n');
            bw.append("[#ここで字上げ終わり]\n");
            bw.append("[#ここで字下げ終わり]\n");
            bw.append('\n');
            bw.append("[#区切り線]\n");
            bw.append('\n');
        }
        String contentsUpdate = getExtractText(doc, this.queryMap.get(ExtractId.UPDATE));
        //chapter title; printed when it changes
        String preChapterTitle = "";
        //holds the URL (full path) of each chapter
        Vector<String> chapterHrefs = new Vector<String>();
        Elements hrefs = getExtractElements(doc, this.queryMap.get(ExtractId.HREF));
        if (hrefs == null && this.queryMap.containsKey(ExtractId.HREF)) {
            LogAppender.println("HREF : 各話のリンク先URLが取得できません");
        }
        Vector<String> subtitles = getExtractStrings(doc, this.queryMap.get(ExtractId.SUBTITLE_LIST), true);
        if (subtitles == null && this.queryMap.containsKey(ExtractId.SUBTITLE_LIST)) {
            LogAppender.println("SUBTITLE_LIST : 各話タイトルが取得できません");
        }
        //holds the URLs (full paths) of chapters without updates
        //null means no cache refresh; empty means everything is refreshed
        HashSet<String> noUpdateUrls = null;
        String[] postDateList = null;
        if (hrefs == null) {
            //get the page count
            String pageNumString = getExtractText(doc, this.queryMap.get(ExtractId.PAGE_NUM));
            if (pageNumString == null && this.queryMap.containsKey(ExtractId.PAGE_NUM)) {
                LogAppender.println("PAGE_NUM : ページ数が取得できません");
            }
            int pageNum = -1;
            try {
                pageNum = Integer.parseInt(pageNumString);
            } catch (Exception e) {
            }
            Element pageUrlElement = getExtractFirstElement(doc, this.queryMap.get(ExtractId.PAGE_URL));
            if (pageUrlElement == null && this.queryMap.containsKey(ExtractId.PAGE_URL)) {
                LogAppender.println("PAGE_URL : ページ番号用のURLが取得できません");
            }
            if (pageNum > 0 && pageUrlElement != null) {
                ExtractInfo pageUrlExtractInfo = this.queryMap.get(ExtractId.PAGE_URL)[0];
                //generate links for pages 1 through pageNum
                for (int i = 1; i <= pageNum; i++) {
                    String pageUrl = pageUrlElement.attr("href");
                    if (pageUrl != null) {
                        pageUrl = pageUrlExtractInfo.replace(pageUrl + "\t" + i);
                        if (pageUrl != null) {
                            if (!pageUrl.startsWith("http")) {
                                if (pageUrl.charAt(0) == '/')
                                    pageUrl = baseUri + pageUrl;
                                else
                                    pageUrl = listBaseUrl + pageUrl;
                            }
                            chapterHrefs.add(pageUrl);
                        }
                    }
                }
            } else {
                Elements contentDivs = getExtractElements(doc, this.queryMap.get(ExtractId.CONTENT_ARTICLE));
                if (contentDivs != null) {
                    //no list links, but the page has body content
                    docToAozoraText(bw, doc, false, null, null);
                } else {
                    LogAppender.println("一覧のURLが取得できませんでした");
                    return null;
                }
            }
        } else {
            //to fetch only updated chapters, grab and store the date tag string (innerHTML) that corresponds to each href
            Elements updates = getExtractElements(doc, this.queryMap.get(ExtractId.SUB_UPDATE));
            if (updates == null && this.queryMap.containsKey(ExtractId.SUB_UPDATE)) {
                LogAppender.println("SUB_UPDATE : 更新確認情報が取得できません");
            }
            if (updates != null) {
                //used to check which URLs should not be refreshed
                noUpdateUrls = createNoUpdateUrls(updateInfoFile, urlString, listBaseUrl, contentsUpdate, hrefs, updates);
            }
            //collect every href from the list
            for (Element href : hrefs) {
                String hrefString = href.attr("href");
                if (hrefString == null || hrefString.length() == 0)
                    continue;
                //match against the pattern if one is configured
                ExtractInfo extractInfo = this.queryMap.get(ExtractId.HREF)[0];
                if (!extractInfo.hasPattern() || extractInfo.matches(hrefString)) {
                    String chapterHref = hrefString;
                    if (!hrefString.startsWith("http")) {
                        if (hrefString.charAt(0) == '/')
                            chapterHref = baseUri + hrefString;
                        else
                            chapterHref = listBaseUrl + hrefString;
                    }
                    chapterHrefs.add(chapterHref);
                }
            }
            postDateList = getPostDateList(doc, this.queryMap.get(ExtractId.CONTENT_UPDATE_LIST));
            if (postDateList == null && this.queryMap.containsKey(ExtractId.CONTENT_UPDATE_LIST)) {
                LogAppender.println("CONTENT_UPDATE_LIST : 一覧ページの更新日時情報が取得できません");
            }
        }
        if (chapterHrefs.size() > 0) {
            //check whether any chapter was updated or added
            updated = false;
            //cutoff for added/updated chapters; anything newer than this counts as added/updated
            long expire = System.currentTimeMillis() - (long) (this.modifiedExpire * 3600000);
            //used when outputting only the added/updated chapters
            HashSet<Integer> modifiedChapterIdx = null;
            //index of the last chapter without updates (0-based)
            int lastNoModifiedChapterIdx = -1;
            if (this.convertModifiedOnly) {
                modifiedChapterIdx = new HashSet<Integer>();
            }
            int chapterIdx = 0;
            for (String chapterHref : chapterHrefs) {
                if (this.canceled)
                    return null;
                if (chapterHref != null && chapterHref.length() > 0) {
                    //page path used when expanding image src attributes to full paths
                    this.pageBaseUri = chapterHref;
                    if (!chapterHref.endsWith("/")) {
                        int idx = chapterHref.indexOf('/', 7);
                        if (idx > -1)
                            this.pageBaseUri = chapterHref.substring(0, idx);
                    }
                    //fetch into the cache; wait 500ms (the configured interval) if it was loaded
                    String chapterPath = CharUtils.escapeUrlToFile(chapterHref.substring(chapterHref.indexOf("//") + 2));
                    File chapterCacheFile = new File(cachePath.getAbsolutePath() + "/" + chapterPath + (chapterPath.endsWith("/") ? "index.html" : ""));
                    //when hrefs are used, only the updated entries are in urls
                    boolean loaded = false;
                    //set to true if this chapter should be refreshed
                    boolean reload = false;
                    //reload when the set is non-null and the URL is not in the no-update set
                    if (noUpdateUrls != null && !noUpdateUrls.contains(chapterHref))
                        reload = true;
                    if (reload || !chapterCacheFile.exists()) {
                        LogAppender.append("[" + (chapterIdx + 1) + "/" + chapterHrefs.size() + "] " + chapterHref);
                        try {
                            try {
                                Thread.sleep(this.interval);
                            } catch (InterruptedException e) {
                            }
                            cacheFile(chapterHref, chapterCacheFile, urlString);
                            LogAppender.println(" : Loaded.");
                            //a freshly loaded file means there was an update
                            this.updated = true;
                            loaded = true;
                        } catch (Exception e) {
                            e.printStackTrace();
                            LogAppender.println("htmlファイルが取得できませんでした : " + chapterHref);
                        }
                    }
                    //a cached file within the expiry window also counts as updated
                    if (!loaded) {
                        if (this.modifiedExpire > 0 && (this.convertModifiedOnly || this.convertUpdated) && chapterCacheFile.lastModified() >= expire) {
                            LogAppender.append("[" + (chapterIdx + 1) + "/" + chapterHrefs.size() + "] " + chapterHref);
                            LogAppender.println(" : Modified.");
                            this.updated = true;
                        }
                    }
                    //bookkeeping for the modified-only output mode
                    if (this.convertModifiedOnly) {
                        //compare by the file's last-modified time
                        if (chapterCacheFile.lastModified() >= expire) {
                            modifiedChapterIdx.add(chapterIdx);
                        } else {
                            if (this.convertModifiedTail) {
                                //drop chapters not contiguous with the latest one
                                modifiedChapterIdx.clear();
                            }
                            lastNoModifiedChapterIdx = chapterIdx;
                        }
                    }
                }
                chapterIdx++;
            }
            //stop if nothing was updated and there is nothing to convert
            if (!this.updated) {
                LogAppender.append("「" + title + "」");
                LogAppender.println("の更新はありません");
                if (this.convertUpdated)
                    return null;
            }
            if (this.convertModifiedOnly) {
                //add the chapter numbers preceding the update; ascending, and the set rejects duplicates
                if (this.beforeChapter > 0) {
                    int startIdx = Math.max(0, lastNoModifiedChapterIdx - this.beforeChapter + 1);
                    if (modifiedChapterIdx.size() == 0) {
                        //no added chapters
                        int idx = chapterHrefs.size() - 1;
                        for (int i = 0; i < this.beforeChapter; i++) {
                            modifiedChapterIdx.add(idx--);
                        }
                    } else {
                        //added chapters exist
                        for (int i = startIdx; i <= lastNoModifiedChapterIdx; i++) {
                            modifiedChapterIdx.add(i);
                        }
                    }
                }
                if (modifiedChapterIdx.size() == 0) {
                    LogAppender.println("追加更新分はありません");
                    this.updated = false;
                    return null;
                }
            } else {
                //a fixed number of latest chapters was requested
                if (this.beforeChapter > 0) {
                    int idx = chapterHrefs.size() - 1;
                    modifiedChapterIdx = new HashSet<Integer>();
                    for (int i = 0; i < this.beforeChapter; i++) {
                        modifiedChapterIdx.add(idx--);
                    }
                }
            }
            //run the conversion
            chapterIdx = 0;
            for (String chapterHref : chapterHrefs) {
                if (this.canceled)
                    return null;
                if (modifiedChapterIdx == null || modifiedChapterIdx.contains(chapterIdx)) {
                    //get the cached file
                    String chapterPath = CharUtils.escapeUrlToFile(chapterHref.substring(chapterHref.indexOf("//") + 2));
                    File chapterCacheFile = new File(cachePath.getAbsolutePath() + "/" + chapterPath + (chapterPath.endsWith("/") ? "index.html" : ""));
                    //print the series title
                    Document chapterDoc = Jsoup.parse(chapterCacheFile, null);
                    String chapterTitle = getExtractText(chapterDoc, this.queryMap.get(ExtractId.CONTENT_CHAPTER));
                    boolean newChapter = false;
                    if (chapterTitle != null && !preChapterTitle.equals(chapterTitle)) {
                        newChapter = true;
                        preChapterTitle = chapterTitle;
                        bw.append("\n[#改ページ]\n");
                        bw.append("[#ここから大見出し]\n");
                        printText(bw, preChapterTitle);
                        bw.append('\n');
                        bw.append("[#ここで大見出し終わり]\n");
                        bw.append('\n');
                    }
                    //take the post date from the list page
                    String postDate = null;
                    if (postDateList != null && postDateList.length > chapterIdx) {
                        postDate = postDateList[chapterIdx];
                    }
                    String subTitle = null;
                    if (subtitles != null && subtitles.size() > chapterIdx)
                        subTitle = subtitles.get(chapterIdx);
                    docToAozoraText(bw, chapterDoc, newChapter, subTitle, postDate);
                }
                chapterIdx++;
            }
            //report which chapters were output
            if (modifiedChapterIdx != null) {
                StringBuilder buf = new StringBuilder();
                int preIdx = -1;
                boolean idxConnected = false;
                //build the 1-based chapter-number string, collapsing runs like 3,4,5 into 3-5
                for (int idx = 0; idx < chapterHrefs.size(); idx++) {
                    if (modifiedChapterIdx.contains(idx)) {
                        if (buf.length() == 0)
                            buf.append((idx + 1));
                        else {
                            if (preIdx == idx - 1) {
                                idxConnected = true;
                            } else {
                                if (idxConnected)
                                    buf.append("-" + (preIdx + 1));
                                idxConnected = false;
                                buf.append("," + (idx + 1));
                            }
                        }
                        preIdx = idx;
                    }
                }
                if (idxConnected)
                    buf.append("-" + (preIdx + 1));
                LogAppender.println(buf + "話を変換します");
            }
        }
        //append the source URL as the 底本 (source text) colophon
        bw.append("\n[#改ページ]\n");
        bw.append("底本: ");
        bw.append("<a href=\"");
        bw.append(urlString);
        bw.append("\">");
        bw.append(urlString);
        bw.append("</a>");
        bw.append('\n');
        bw.append("変換日時: ");
        bw.append(dateFormat.format(new Date()));
        bw.append('\n');
    } finally {
        bw.close();
    }
    this.canceled = false;
    return txtFile;
}
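For orientation, a call site might look like the sketch below. How a configured WebAozoraConverter instance is obtained is not shown on this page, so that line is a placeholder, and the URL and cache path are made-up values.

WebAozoraConverter converter = /* obtained elsewhere; construction is not shown on this page */ null;
File cachePath = new File("cache/web");
// wait 1000ms between requests, treat cache files newer than 24 hours as updated,
// and convert every chapter (all update flags false, no chapter limit)
File txtFile = converter.convertToAozoraText("http://example.com/novel/1234/", cachePath, 1000, 24f, false, false, false, 0);
if (txtFile == null) {
    LogAppender.println("conversion skipped or canceled");
}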
use of org.jsoup.nodes.Element in project plaid by nickbutcher.
the class DribbbleSearchConverter method convert.
@Override
public List<Shot> convert(ResponseBody value) throws IOException {
    final Elements shotElements = Jsoup.parse(value.string(), HOST).select("li[id^=screenshot]");
    final List<Shot> shots = new ArrayList<>(shotElements.size());
    for (Element element : shotElements) {
        final Shot shot = parseShot(element, DATE_FORMAT);
        if (shot != null) {
            shots.add(shot);
        }
    }
    return shots;
}
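The selector li[id^=screenshot] uses jsoup's attribute-prefix syntax: [id^=value] matches elements whose id attribute starts with value. A small self-contained sketch with made-up markup (jsoup on the classpath is assumed):

String html = "<ul>"
        + "<li id=\"screenshot-1\"><a href=\"/shots/1\">One</a></li>"
        + "<li id=\"pagination\">skip me</li>"
        + "</ul>";
Elements shotElements = Jsoup.parse(html, "https://dribbble.com").select("li[id^=screenshot]");
System.out.println(shotElements.size()); // 1 -- only the li whose id starts with "screenshot" matches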
use of org.jsoup.nodes.Element in project Java-readability by basis-technology-corp.
the class Readability method prepArticle.
/**
 * Prepare the article node for display. Clean out any inline styles, iframes, and forms, strip
 * extraneous {@code <p>} tags, etc. This takes an element in and cleans it in place; nothing is
 * returned.
 *
 * @param articleContent the article element to prepare
 **/
private void prepArticle(Element articleContent) {
    // we don't need to do this, we don't care
    cleanStyles(articleContent);
    // this replaces any break element or an nbsp with a plain break element.
    // not needed. We will deal with breaks as we deal with breaks
    // killBreaks(articleContent);
    /* Clean out junk from the article content */
    cleanConditionally(articleContent, "form");
    clean(articleContent, "object");
    clean(articleContent, "h1");
    /**
     * If there is only one h2, they are probably using it as a header and not a subheader, so remove
     * it since we already have a header.
     ***/
    if (articleContent.getElementsByTag("h2").size() == 1) {
        clean(articleContent, "h2");
    }
    clean(articleContent, "iframe");
    cleanHeaders(articleContent);
    /*
     * Do these last as the previous stuff may have removed junk that will affect these
     */
    cleanConditionally(articleContent, "table");
    cleanConditionally(articleContent, "ul");
    // could have no children, which would crash otherwise
    if (articleContent.children().size() != 0) {
        cleanConditionally(articleContent.child(0), "div");
    }
    /* Remove extra paragraphs */
    Elements articleParagraphs = articleContent.getElementsByTag("p");
    for (Element para : articleParagraphs) {
        int imgCount = para.getElementsByTag("img").size();
        int embedCount = para.getElementsByTag("embed").size();
        int objectCount = para.getElementsByTag("object").size();
        if (imgCount == 0 && embedCount == 0 && objectCount == 0 && para.text().matches("\\s*")) {
            para.remove();
        }
    }
    // "br + p" is a CSS sibling selector, so it needs select(), not getElementsByTag()
    Elements parasWithPreceedingBreaks = articleContent.select("br + p");
    for (Element pe : parasWithPreceedingBreaks) {
        Element brElement = pe.previousElementSibling();
        brElement.remove();
    }
}
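The clean and cleanConditionally helpers are not shown on this page. As a minimal sketch of what an unconditional clean could look like with jsoup (the project's real helper may apply extra scoring rules), removing every element with a given tag is:

private static void clean(Element root, String tagName) {
    // getElementsByTag returns a snapshot list, so removing nodes while iterating it is safe
    for (Element e : root.getElementsByTag(tagName)) {
        e.remove();
    }
}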
use of org.jsoup.nodes.Element in project Java-readability by basis-technology-corp.
the class Readability method init.
private void init() {
    removeScripts();
    convertNoscriptToDiv();
    // there should never be more than one ...
    Elements bodies = document.getElementsByTag("body");
    if (bodies.size() > 1) {
        LOG.warn("More than one <body/>");
    }
    body = bodies.get(0);
    /*
     * Make sure this document is added to the list of parsed pages first, so we don't double up on
     * the first page
     */
    parsedPages.add(normalizeTrailingSlash(givenUrl));
    // respect the readAllPages flag, very important if a stringPage
    if (readAllPages)
        nextPageLink = findNextPageLink(body);
    if (!notFirstPage) {
        title = getArticleTitle();
    }
    prepDocument();
    Element articleContent = grabArticle(null);
    if (articleContent == null && !notFirstPage) {
        // this happens when the content of the page is very short.
        // we don't believe in super-short next pages.
        articleText = body.text();
    } else {
        xmlImages.add(articleContent.outerHtml());
        articleText = getDisplayText(articleContent);
    }
}
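removeScripts() is also not shown here; a plausible one-loop sketch with jsoup (not necessarily the project's exact code) is:

private void removeScripts() {
    // drop every <script> element from the parsed document
    for (Element script : document.getElementsByTag("script")) {
        script.remove();
    }
}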
use of org.jsoup.nodes.Element in project Java-readability by basis-technology-corp.
the class Readability method getLinkDensity.
private double getLinkDensity(Element e) {
    Elements links = e.getElementsByTag("a");
    double textLength = e.text().length();
    // avoid returning NaN for elements with no text at all
    if (textLength == 0) {
        return 0;
    }
    double linkLength = 0;
    for (Element link : links) {
        linkLength += link.text().length();
    }
    return linkLength / textLength;
}
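As a standalone illustration of the same computation: link density is the share of an element's text that sits inside <a> tags. With jsoup on the classpath:

Element div = Jsoup.parse("<div><a href='#'>link</a> plus plain text</div>").selectFirst("div");
double textLength = div.text().length();
double linkLength = 0;
for (Element a : div.getElementsByTag("a")) {
    linkLength += a.text().length();
}
// text is "link plus plain text" (20 chars), anchor text is "link" (4 chars) => prints 0.2
System.out.println(linkLength / textLength);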