Search in sources :

Example 21 with Element

use of org.jsoup.nodes.Element in project AozoraEpub3 by hmdev.

the class WebAozoraConverter method convertToAozoraText.

////////////////////////////////////////////////////////////////
/**
 * Runs the conversion: fetches (or reuses the cached copy of) the list page at {@code urlString},
 * caches every chapter page it links to, and writes the result as a single Aozora-format text file.
 *
 * @param urlString URL of the list page (or single page) to convert
 * @param cachePath root directory under which cached HTML and the converted output are stored
 * @param interval wait in milliseconds before each chapter download; values below 500 are raised to 500
 * @param modifiedExpire cache files modified within this many hours are treated as updated
 * @param convertUpdated produce output only when something was updated
 * @param convertModifiedOnly convert only the chapters that were added or updated
 * @param convertModifiedTail convert only the updated chapters that are contiguous with the latest chapter
 * @param beforeChapter number of chapters to convert; 0 means not specified
 * @return the converted text file, or {@code null} when the conversion was skipped or cancelled
 * @throws IOException if the output file cannot be written or a cached page cannot be parsed
 */
public File convertToAozoraText(String urlString, File cachePath, int interval, float modifiedExpire, boolean convertUpdated, boolean convertModifiedOnly, boolean convertModifiedTail, int beforeChapter) throws IOException {
    this.canceled = false;
    // When no per-chapter date list can be obtained, always treat the work as updated
    this.updated = true;
    this.interval = Math.max(500, interval);
    this.modifiedExpire = Math.max(0, modifiedExpire);
    this.convertUpdated = convertUpdated;
    this.convertModifiedOnly = convertModifiedOnly;
    this.convertModifiedTail = convertModifiedTail;
    this.beforeChapter = beforeChapter;
    // Probe the URL with a trailing "/" appended; if the server answers 200, adopt that form
    urlString = urlString.trim();
    if (!urlString.endsWith("/") && !urlString.endsWith(".html") && !urlString.endsWith(".htm") && urlString.indexOf("?") == -1) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(urlString + "/").openConnection();
            if (connection.getResponseCode() == 200) {
                urlString += "/";
                LogAppender.println("URL修正 : " + urlString);
            }
        } catch (Exception e) {
            // Best-effort probe only: on any failure keep the URL exactly as given
        } finally {
            if (connection != null)
                connection.disconnect();
        }
    }
    this.urlString = urlString;
    // scheme://host part: everything before the first '/' that follows "//"
    this.baseUri = urlString.substring(0, urlString.indexOf('/', urlString.indexOf("//") + 2));
    String listBaseUrl = urlString.substring(0, urlString.lastIndexOf('/') + 1);
    this.pageBaseUri = listBaseUrl;
    // Cache-relative file path: the URL with everything through "//" stripped
    String urlFilePath = CharUtils.escapeUrlToFile(urlString.substring(urlString.indexOf("//") + 2));
    // (Disabled) A PAGE_REGEX check used to reject URLs not matching the configured pattern here.
    String urlParentPath = urlFilePath;
    boolean isPath = false;
    if (urlFilePath.endsWith("/")) {
        isPath = true;
        urlFilePath += "index.html";
    } else
        urlParentPath = urlFilePath.substring(0, urlFilePath.lastIndexOf('/') + 1);
    // Destination directory of the conversion result
    this.dstPath = cachePath.getAbsolutePath() + "/";
    if (isPath)
        this.dstPath += urlParentPath;
    else
        this.dstPath += urlFilePath + "_converted/";
    File txtFile = new File(this.dstPath + "converted.txt");
    // The cover image is saved next to the txt under the same base name; the extension is
    // always png, but it still displays, so it is left as is
    File coverImageFile = new File(this.dstPath + "converted.png");
    // Where the update information is stored
    File updateInfoFile = new File(this.dstPath + "update.txt");
    // If something other than a directory already occupies the destination, delete it
    File parentFile = txtFile.getParentFile();
    if (parentFile.exists() && !parentFile.isDirectory()) {
        parentFile.delete();
    }
    parentFile.mkdirs();
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(txtFile), "UTF-8"));
    try {
        // Cache the document at urlString
        File cacheFile = new File(cachePath.getAbsolutePath() + "/" + urlFilePath);
        try {
            LogAppender.append(urlString);
            cacheFile(urlString, cacheFile, null);
            LogAppender.println(" : List Loaded.");
        } catch (Exception e) {
            e.printStackTrace();
            LogAppender.println("一覧ページの取得に失敗しました。 ");
            // Fall back to a previously cached copy when one exists
            if (!cacheFile.exists())
                return null;
            LogAppender.println("キャッシュファイルを利用します。");
        }
        // For a list page: cache the linked pages, convert each to Aozora text and
        // output them joined with page breaks
        Document doc = Jsoup.parse(cacheFile, null);
        // Cover image
        Elements images = getExtractElements(doc, this.queryMap.get(ExtractId.COVER_IMG));
        if (images != null) {
            printImage(null, images.get(0), coverImageFile);
        }
        // Title (series and/or title; at least one is required)
        boolean hasTitle = false;
        String series = getExtractText(doc, this.queryMap.get(ExtractId.SERIES));
        if (series != null) {
            printText(bw, series);
            bw.append('\n');
            hasTitle = true;
        }
        String title = getExtractText(doc, this.queryMap.get(ExtractId.TITLE));
        if (title != null) {
            printText(bw, title);
            bw.append('\n');
            hasTitle = true;
        }
        if (!hasTitle) {
            LogAppender.println("SERIES/TITLE : タイトルがありません");
            return null;
        }
        // Author
        String author = getExtractText(doc, this.queryMap.get(ExtractId.AUTHOR));
        if (author != null) {
            printText(bw, author);
        }
        bw.append('\n');
        // Description, framed by separator lines and indent annotations
        Element description = getExtractFirstElement(doc, this.queryMap.get(ExtractId.DESCRIPTION));
        if (description != null) {
            bw.append('\n');
            bw.append("[#区切り線]\n");
            bw.append('\n');
            bw.append("[#ここから2字下げ]\n");
            bw.append("[#ここから2字上げ]\n");
            printNode(bw, description, true);
            bw.append('\n');
            bw.append("[#ここで字上げ終わり]\n");
            bw.append("[#ここで字下げ終わり]\n");
            bw.append('\n');
            bw.append("[#区切り線]\n");
            bw.append('\n');
        }
        String contentsUpdate = getExtractText(doc, this.queryMap.get(ExtractId.UPDATE));
        // Current chapter heading; a new heading is emitted whenever it changes
        String preChapterTitle = "";
        // Full URLs of every chapter page
        Vector<String> chapterHrefs = new Vector<String>();
        Elements hrefs = getExtractElements(doc, this.queryMap.get(ExtractId.HREF));
        if (hrefs == null && this.queryMap.containsKey(ExtractId.HREF)) {
            LogAppender.println("HREF : 各話のリンク先URLが取得できません");
        }
        Vector<String> subtitles = getExtractStrings(doc, this.queryMap.get(ExtractId.SUBTITLE_LIST), true);
        if (subtitles == null && this.queryMap.containsKey(ExtractId.SUBTITLE_LIST)) {
            LogAppender.println("SUBTITLE_LIST : 各話タイトルが取得できません");
        }
        // Full URLs of chapters that have NOT been updated.
        // null means no forced cache refresh; an empty set means every chapter is refreshed
        HashSet<String> noUpdateUrls = null;
        String[] postDateList = null;
        if (hrefs == null) {
            // Get the page count
            String pageNumString = getExtractText(doc, this.queryMap.get(ExtractId.PAGE_NUM));
            if (pageNumString == null && this.queryMap.containsKey(ExtractId.PAGE_NUM)) {
                LogAppender.println("PAGE_NUM : ページ数が取得できません");
            }
            int pageNum = -1;
            try {
                pageNum = Integer.parseInt(pageNumString);
            } catch (Exception e) {
                // Missing or non-numeric page count: pageNum stays -1 and the fallback below is used
            }
            Element pageUrlElement = getExtractFirstElement(doc, this.queryMap.get(ExtractId.PAGE_URL));
            if (pageUrlElement == null && this.queryMap.containsKey(ExtractId.PAGE_URL)) {
                LogAppender.println("PAGE_URL : ページ番号用のURLが取得できません");
            }
            if (pageNum > 0 && pageUrlElement != null) {
                ExtractInfo pageUrlExtractInfo = this.queryMap.get(ExtractId.PAGE_URL)[0];
                // Generate the links for pages 1..pageNum
                for (int i = 1; i <= pageNum; i++) {
                    String pageUrl = pageUrlElement.attr("href");
                    if (pageUrl != null) {
                        pageUrl = pageUrlExtractInfo.replace(pageUrl + "\t" + i);
                        if (pageUrl != null) {
                            if (!pageUrl.startsWith("http")) {
                                if (pageUrl.charAt(0) == '/')
                                    pageUrl = baseUri + pageUrl;
                                else
                                    pageUrl = listBaseUrl + pageUrl;
                            }
                            chapterHrefs.add(pageUrl);
                        }
                    }
                }
            } else {
                Elements contentDivs = getExtractElements(doc, this.queryMap.get(ExtractId.CONTENT_ARTICLE));
                if (contentDivs != null) {
                    // No chapter links, but the page itself contains the body text
                    docToAozoraText(bw, doc, false, null, null);
                } else {
                    LogAppender.println("一覧のURLが取得できませんでした");
                    return null;
                }
            }
        } else {
            // To fetch only updated chapters, collect the date-tag strings (innerHTML)
            // that correspond to each href so they can be compared and saved
            Elements updates = getExtractElements(doc, this.queryMap.get(ExtractId.SUB_UPDATE));
            if (updates == null && this.queryMap.containsKey(ExtractId.SUB_UPDATE)) {
                LogAppender.println("SUB_UPDATE : 更新確認情報が取得できません");
            }
            if (updates != null) {
                // Set used to check which URLs need no refresh
                noUpdateUrls = createNoUpdateUrls(updateInfoFile, urlString, listBaseUrl, contentsUpdate, hrefs, updates);
            }
            // Collect every href of the list
            for (Element href : hrefs) {
                String hrefString = href.attr("href");
                if (hrefString == null || hrefString.length() == 0)
                    continue;
                // When a pattern is configured, keep only matching links
                ExtractInfo extractInfo = this.queryMap.get(ExtractId.HREF)[0];
                if (!extractInfo.hasPattern() || extractInfo.matches(hrefString)) {
                    String chapterHref = hrefString;
                    if (!hrefString.startsWith("http")) {
                        if (hrefString.charAt(0) == '/')
                            chapterHref = baseUri + hrefString;
                        else
                            chapterHref = listBaseUrl + hrefString;
                    }
                    chapterHrefs.add(chapterHref);
                }
            }
            postDateList = getPostDateList(doc, this.queryMap.get(ExtractId.CONTENT_UPDATE_LIST));
            if (postDateList == null && this.queryMap.containsKey(ExtractId.CONTENT_UPDATE_LIST)) {
                LogAppender.println("CONTENT_UPDATE_LIST : 一覧ページの更新日時情報が取得できません");
            }
        }
        if (chapterHrefs.size() > 0) {
            // From here on, "updated" is determined by checking every chapter
            updated = false;
            // Threshold timestamp: cache files at least this new count as added/updated
            long expire = System.currentTimeMillis() - (long) (this.modifiedExpire * 3600000);
            // Chapter indexes to output; used when only added/updated chapters are converted
            HashSet<Integer> modifiedChapterIdx = null;
            // Index (0-based) of the last chapter that was not modified
            int lastNoModifiedChapterIdx = -1;
            if (this.convertModifiedOnly) {
                modifiedChapterIdx = new HashSet<Integer>();
            }
            int chapterIdx = 0;
            for (String chapterHref : chapterHrefs) {
                if (this.canceled)
                    return null;
                if (chapterHref != null && chapterHref.length() > 0) {
                    // Base used when turning image src values into full URLs
                    this.pageBaseUri = chapterHref;
                    if (!chapterHref.endsWith("/")) {
                        // First '/' after the authority. Computed relative to "//" so that
                        // https:// URLs work too (a fixed offset of 7 only fits http://)
                        int idx = chapterHref.indexOf('/', chapterHref.indexOf("//") + 2);
                        if (idx > -1)
                            this.pageBaseUri = chapterHref.substring(0, idx);
                    }
                    // Locate the cache file for this chapter
                    String chapterPath = CharUtils.escapeUrlToFile(chapterHref.substring(chapterHref.indexOf("//") + 2));
                    File chapterCacheFile = new File(cachePath.getAbsolutePath() + "/" + chapterPath + (chapterPath.endsWith("/") ? "index.html" : ""));
                    // Set to true once the page has actually been downloaded
                    boolean loaded = false;
                    // Set to true when the chapter must be re-downloaded
                    boolean reload = false;
                    // Reload when update info exists and this URL is not in the no-update set
                    if (noUpdateUrls != null && !noUpdateUrls.contains(chapterHref))
                        reload = true;
                    if (reload || !chapterCacheFile.exists()) {
                        LogAppender.append("[" + (chapterIdx + 1) + "/" + chapterHrefs.size() + "] " + chapterHref);
                        try {
                            // Wait this.interval ms before each download
                            try {
                                Thread.sleep(this.interval);
                            } catch (InterruptedException e) {
                                // Interruption only shortens the wait; the download still proceeds
                            }
                            cacheFile(chapterHref, chapterCacheFile, urlString);
                            LogAppender.println(" : Loaded.");
                            // A downloaded file means there was an update
                            this.updated = true;
                            loaded = true;
                        } catch (Exception e) {
                            e.printStackTrace();
                            LogAppender.println("htmlファイルが取得できませんでした : " + chapterHref);
                        }
                    }
                    // A cached file modified within the configured window also counts as updated
                    if (!loaded) {
                        if (this.modifiedExpire > 0 && (this.convertModifiedOnly || this.convertUpdated) && chapterCacheFile.lastModified() >= expire) {
                            LogAppender.append("[" + (chapterIdx + 1) + "/" + chapterHrefs.size() + "] " + chapterHref);
                            LogAppender.println(" : Modified.");
                            this.updated = true;
                        }
                    }
                    // Bookkeeping for "convert only modified chapters"
                    if (this.convertModifiedOnly) {
                        // Decide by the cache file's modification time
                        if (chapterCacheFile.lastModified() >= expire) {
                            modifiedChapterIdx.add(chapterIdx);
                        } else {
                            if (this.convertModifiedTail) {
                                // Drop chapters that are not contiguous with the latest one
                                modifiedChapterIdx.clear();
                            }
                            lastNoModifiedChapterIdx = chapterIdx;
                        }
                    }
                }
                chapterIdx++;
            }
            // Nothing updated: log it, and stop when output is wanted only on update
            if (!this.updated) {
                LogAppender.append("「" + title + "」");
                LogAppender.println("の更新はありません");
                if (this.convertUpdated)
                    return null;
            }
            if (this.convertModifiedOnly) {
                // Add the chapters preceding the update; the set removes duplicates
                if (this.beforeChapter > 0) {
                    int startIdx = Math.max(0, lastNoModifiedChapterIdx - this.beforeChapter + 1);
                    if (modifiedChapterIdx.size() == 0) {
                        // No added chapters: take the last beforeChapter chapters
                        int idx = chapterHrefs.size() - 1;
                        for (int i = 0; i < this.beforeChapter; i++) {
                            modifiedChapterIdx.add(idx--);
                        }
                    } else {
                        // Added chapters exist: prepend up to beforeChapter unmodified ones
                        for (int i = startIdx; i <= lastNoModifiedChapterIdx; i++) {
                            modifiedChapterIdx.add(i);
                        }
                    }
                }
                if (modifiedChapterIdx.size() == 0) {
                    LogAppender.println("追加更新分はありません");
                    this.updated = false;
                    return null;
                }
            } else {
                // Only the latest beforeChapter chapters were requested
                if (this.beforeChapter > 0) {
                    int idx = chapterHrefs.size() - 1;
                    modifiedChapterIdx = new HashSet<Integer>();
                    for (int i = 0; i < this.beforeChapter; i++) {
                        modifiedChapterIdx.add(idx--);
                    }
                }
            }
            // Run the conversion
            chapterIdx = 0;
            for (String chapterHref : chapterHrefs) {
                if (this.canceled)
                    return null;
                if (modifiedChapterIdx == null || modifiedChapterIdx.contains(chapterIdx)) {
                    // Locate the cache file
                    String chapterPath = CharUtils.escapeUrlToFile(chapterHref.substring(chapterHref.indexOf("//") + 2));
                    File chapterCacheFile = new File(cachePath.getAbsolutePath() + "/" + chapterPath + (chapterPath.endsWith("/") ? "index.html" : ""));
                    // Emit the chapter heading when it differs from the previous one
                    Document chapterDoc = Jsoup.parse(chapterCacheFile, null);
                    String chapterTitle = getExtractText(chapterDoc, this.queryMap.get(ExtractId.CONTENT_CHAPTER));
                    boolean newChapter = false;
                    if (chapterTitle != null && !preChapterTitle.equals(chapterTitle)) {
                        newChapter = true;
                        preChapterTitle = chapterTitle;
                        bw.append("\n[#改ページ]\n");
                        bw.append("[#ここから大見出し]\n");
                        printText(bw, preChapterTitle);
                        bw.append('\n');
                        bw.append("[#ここで大見出し終わり]\n");
                        bw.append('\n');
                    }
                    // Post date taken from the list page, when available
                    String postDate = null;
                    if (postDateList != null && postDateList.length > chapterIdx) {
                        postDate = postDateList[chapterIdx];
                    }
                    String subTitle = null;
                    if (subtitles != null && subtitles.size() > chapterIdx)
                        subTitle = subtitles.get(chapterIdx);
                    docToAozoraText(bw, chapterDoc, newChapter, subTitle, postDate);
                }
                chapterIdx++;
            }
            // Log which chapters were output, as 1-based numbers with runs collapsed (e.g. "1-3,6")
            if (modifiedChapterIdx != null) {
                StringBuilder buf = new StringBuilder();
                int preIdx = -1;
                boolean idxConnected = false;
                // Build the chapter-number list
                for (int idx = 0; idx < chapterHrefs.size(); idx++) {
                    if (modifiedChapterIdx.contains(idx)) {
                        if (buf.length() == 0)
                            buf.append((idx + 1));
                        else {
                            if (preIdx == idx - 1) {
                                idxConnected = true;
                            } else {
                                if (idxConnected)
                                    buf.append("-" + (preIdx + 1));
                                idxConnected = false;
                                // 1-based like every other number in this message
                                buf.append("," + (idx + 1));
                            }
                        }
                        preIdx = idx;
                    }
                }
                if (idxConnected)
                    buf.append("-" + (preIdx + 1));
                LogAppender.println(buf + "話を変換します");
            }
        }
        // Append the source URL and the conversion timestamp as a colophon
        bw.append("\n[#改ページ]\n");
        bw.append("底本: ");
        bw.append("<a href=\"");
        bw.append(urlString);
        bw.append("\">");
        bw.append(urlString);
        bw.append("</a>");
        bw.append('\n');
        bw.append("変換日時: ");
        bw.append(dateFormat.format(new Date()));
        bw.append('\n');
    } finally {
        bw.close();
    }
    this.canceled = false;
    return txtFile;
}
Also used : Element(org.jsoup.nodes.Element) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements) URL(java.net.URL) IOException(java.io.IOException) Date(java.util.Date) BufferedWriter(java.io.BufferedWriter) HttpURLConnection(java.net.HttpURLConnection) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) Vector(java.util.Vector)

Example 22 with Element

use of org.jsoup.nodes.Element in project plaid by nickbutcher.

the class DribbbleSearchConverter method convert.

/**
 * Parses a search-result page into shots. Every {@code li} element whose id starts with
 * "screenshot" is passed to {@code parseShot}; entries for which it returns null are dropped.
 */
@Override
public List<Shot> convert(ResponseBody value) throws IOException {
    final String html = value.string();
    final Elements candidates = Jsoup.parse(html, HOST).select("li[id^=screenshot]");
    final int count = candidates.size();
    final List<Shot> parsed = new ArrayList<>(count);
    for (int i = 0; i < count; i++) {
        final Shot candidate = parseShot(candidates.get(i), DATE_FORMAT);
        if (candidate == null) {
            continue;
        }
        parsed.add(candidate);
    }
    return parsed;
}
Also used : Element(org.jsoup.nodes.Element) ArrayList(java.util.ArrayList) Elements(org.jsoup.select.Elements) Shot(io.plaidapp.data.api.dribbble.model.Shot)

Example 23 with Element

use of org.jsoup.nodes.Element in project Java-readability by basis-technology-corp.

the class Readability method prepArticle.

/**
     * Prepare the article node for display: clean out inline styles, forms, objects, iframes,
     * redundant headers, empty paragraphs and line breaks that directly precede a paragraph.
     * The element is modified in place; nothing is returned.
     *
     * @param articleContent the article root element to clean up
     **/
private void prepArticle(Element articleContent) {
    cleanStyles(articleContent);
    // Break elements are deliberately not normalised here (killBreaks is not used);
    // breaks are dealt with where they are encountered.
    /* Clean out junk from the article content */
    cleanConditionally(articleContent, "form");
    clean(articleContent, "object");
    clean(articleContent, "h1");
    /*
     * If there is only one h2, they are probably using it as a header and not a subheader, so remove it
     * since we already have a header.
     */
    if (articleContent.getElementsByTag("h2").size() == 1) {
        clean(articleContent, "h2");
    }
    clean(articleContent, "iframe");
    cleanHeaders(articleContent);
    /*
     * Do these last as the previous stuff may have removed junk that will affect these
     */
    cleanConditionally(articleContent, "table");
    cleanConditionally(articleContent, "ul");
    // child(0) throws when there are no children, so guard it
    if (articleContent.children().size() != 0) {
        cleanConditionally(articleContent.child(0), "div");
    }
    /* Remove extra paragraphs: those with no img/embed/object and only whitespace text */
    Elements articleParagraphs = articleContent.getElementsByTag("p");
    for (Element para : articleParagraphs) {
        int imgCount = para.getElementsByTag("img").size();
        int embedCount = para.getElementsByTag("embed").size();
        int objectCount = para.getElementsByTag("object").size();
        if (imgCount == 0 && embedCount == 0 && objectCount == 0 && para.text().matches("\\s*")) {
            para.remove();
        }
    }
    // "br + p" is a CSS selector, not a tag name: it must go through select().
    // getElementsByTag("br + p") looks for a tag literally named "br + p" and never matches.
    Elements parasWithPreceedingBreaks = articleContent.select("br + p");
    for (Element pe : parasWithPreceedingBreaks) {
        Element brElement = pe.previousElementSibling();
        // Defensive: the sibling may already be gone if the tree changed during this loop
        if (brElement != null) {
            brElement.remove();
        }
    }
}
Also used : Element(org.jsoup.nodes.Element) Elements(org.jsoup.select.Elements)

Example 24 with Element

use of org.jsoup.nodes.Element in project Java-readability by basis-technology-corp.

the class Readability method init.

/**
 * Processes the parsed document: strips scripts, picks the body, records the page as parsed,
 * optionally finds the next-page link, determines the title on the first page, and extracts
 * the article text into {@code articleText}.
 */
private void init() {
    removeScripts();
    convertNoscriptToDiv();
    // there should never be more than one body element
    Elements bodies = document.getElementsByTag("body");
    if (bodies.size() > 1) {
        LOG.warn("More than one <body/>");
    }
    body = null;
    body = bodies.get(0);
    /*
     * Make sure this document is added to the list of parsed pages first, so we don't double up on the
     * first page
     */
    parsedPages.add(normalizeTrailingSlash(givenUrl));
    // respect the readAllPages flag, very important if a stringPage
    if (readAllPages)
        nextPageLink = findNextPageLink(body);
    if (!notFirstPage) {
        title = getArticleTitle();
    }
    prepDocument();
    Element articleContent = grabArticle(null);
    if (articleContent == null) {
        // This happens when the content of the page is very short.
        // On the first page fall back to the whole body text. We don't believe in
        // super-short next pages, so a follow-up page contributes no text. Previously
        // that case dereferenced the null article and threw a NullPointerException.
        articleText = notFirstPage ? "" : body.text();
    } else {
        xmlImages.add(articleContent.outerHtml());
        articleText = getDisplayText(articleContent);
    }
}
Also used : Element(org.jsoup.nodes.Element) Elements(org.jsoup.select.Elements)

Example 25 with Element

use of org.jsoup.nodes.Element in project Java-readability by basis-technology-corp.

the class Readability method getLinkDensity.

/**
 * Returns the length of the text inside {@code a} elements found via
 * {@code e.getElementsByTag("a")}, divided by the length of the element's whole text.
 * The division is done in double arithmetic with no zero guard, so an element with
 * no text yields a non-finite result (NaN for 0/0).
 */
private double getLinkDensity(Element e) {
    double anchorChars = 0;
    for (Element anchor : e.getElementsByTag("a")) {
        anchorChars += anchor.text().length();
    }
    double totalChars = e.text().length();
    return anchorChars / totalChars;
}
Also used : Element(org.jsoup.nodes.Element) Elements(org.jsoup.select.Elements)

Aggregations

Element (org.jsoup.nodes.Element)343 Document (org.jsoup.nodes.Document)152 Elements (org.jsoup.select.Elements)95 ElementHandlerImpl (org.asqatasun.ruleimplementation.ElementHandlerImpl)87 IOException (java.io.IOException)63 File (java.io.File)62 ArrayList (java.util.ArrayList)45 Test (org.junit.Test)34 TestSolutionHandler (org.asqatasun.ruleimplementation.TestSolutionHandler)21 URL (java.net.URL)15 TestSolutionHandlerImpl (org.asqatasun.ruleimplementation.TestSolutionHandlerImpl)15 SimpleElementSelector (org.asqatasun.rules.elementselector.SimpleElementSelector)13 TestSolution (org.asqatasun.entity.audit.TestSolution)11 HashMap (java.util.HashMap)9 ElementSelector (org.asqatasun.rules.elementselector.ElementSelector)9 Node (org.jsoup.nodes.Node)9 InputStream (java.io.InputStream)8 EvidenceElement (org.asqatasun.entity.audit.EvidenceElement)8 SSPHandler (org.asqatasun.processor.SSPHandler)7 ProcessRemarkService (org.asqatasun.service.ProcessRemarkService)7