Use of com.biglybt.core.metasearch.SearchException in project BiglyBT by BiglySoftware.
The class WebEngine, method getWebPageContentSupport:
private pageDetails getWebPageContentSupport(Proxy proxy, String proxy_host, String searchURL, SearchParameter[] searchParameters, Map<String, String> searchContext, String headers, boolean only_if_modified) throws SearchException {
    try {
        TorrentUtils.setTLSDescription("Search: " + getName());
        if (requiresLogin()) {
            throw new SearchLoginException("login required");
        }
        boolean vuze_file = searchURL.toLowerCase().startsWith("vuze:");
        if (!vuze_file) {
            String[] from_strs = new String[searchParameters.length];
            String[] to_strs = new String[searchParameters.length];
            for (int i = 0; i < searchParameters.length; i++) {
                SearchParameter parameter = searchParameters[i];
                from_strs[i] = "%" + parameter.getMatchPattern();
                to_strs[i] = URLEncoder.encode(parameter.getValue(), "UTF-8");
            }
            searchURL = GeneralUtils.replaceAll(searchURL, from_strs, to_strs);
            Iterator<Map.Entry<String, String>> it = searchContext.entrySet().iterator();
            while (it.hasNext()) {
                Map.Entry<String, String> entry = it.next();
                String key = entry.getKey();
                if (supportsContext(key)) {
                    if (searchURL.indexOf('?') == -1) {
                        searchURL += "?";
                    } else {
                        searchURL += "&";
                    }
                    String value = entry.getValue();
                    searchURL += key + "=" + URLEncoder.encode(value, "UTF-8");
                }
            }
        }
        // System.out.println(searchURL);
        // hack to support POST by encoding the POST parameters into the URL after an
        // "azmethod=" marker - see the standalone sketch after this method. Example:
        // http://xxxx/index.php?main=search&azmethod=post_basic:SearchString1=%s&SearchString=&search=Search
        ResourceDownloaderFactory rdf = StaticUtilities.getResourceDownloaderFactory();
        URL initial_url;
        ResourceDownloader initial_url_rd;
        int post_pos = searchURL.indexOf("azmethod=");
        if (post_pos > 0) {
            String post_params = searchURL.substring(post_pos + 9);
            searchURL = searchURL.substring(0, post_pos - 1);
            debugLog("search_url: " + searchURL + ", post=" + post_params);
            initial_url = new URL(searchURL);
            int sep = post_params.indexOf(':');
            String type = post_params.substring(0, sep);
            if (!type.equals("post_basic")) {
                throw (new SearchException("Only basic type supported"));
            }
            post_params = post_params.substring(sep + 1);
            if (proxy == null) {
                initial_url_rd = rdf.create(initial_url, post_params);
            } else {
                initial_url_rd = rdf.create(initial_url, post_params, proxy);
            }
            initial_url_rd.setProperty("URL_Content-Type", "application/x-www-form-urlencoded");
        } else {
            debugLog("search_url: " + searchURL);
            initial_url = new URL(searchURL);
            if (proxy == null) {
                initial_url_rd = rdf.create(initial_url);
            } else {
                initial_url_rd = rdf.create(initial_url, proxy);
            }
        }
        if (proxy_host != null) {
            initial_url_rd.setProperty("URL_HOST", proxy_host);
        }
        setHeaders(initial_url_rd, headers);
        if (needsAuth && local_cookies != null) {
            initial_url_rd.setProperty("URL_Cookie", local_cookies);
        } else if (fullCookies != null && fullCookies.length() > 0) {
            initial_url_rd.setProperty("URL_Cookie", fullCookies);
        }
        if (only_if_modified) {
            String last_modified = getLocalString(LD_LAST_MODIFIED);
            String etag = getLocalString(LD_ETAG);
            if (last_modified != null) {
                initial_url_rd.setProperty("URL_If-Modified-Since", last_modified);
            }
            if (etag != null) {
                initial_url_rd.setProperty("URL_If-None-Match", etag);
            }
        }
        InputStream is = null;
        try {
            String content_charset = "UTF-8";
            ResourceDownloader mr_rd = null;
            if (initial_url.getProtocol().equalsIgnoreCase("file")) {
                // handle file://c:/ - map to file:/c:/
                String str = initial_url.toExternalForm();
                if (initial_url.getAuthority() != null) {
                    str = str.replaceFirst("://", ":/");
                }
                int pos = str.indexOf('?');
                if (pos != -1) {
                    str = str.substring(0, pos);
                }
                is = new FileInputStream(new File(new URL(str).toURI()));
            } else {
                if (proxy == null) {
                    initial_url_rd.setProperty("URL_Connect_Timeout", 10 * 1000);
                    initial_url_rd.setProperty("URL_Read_Timeout", 10 * 1000);
                }
                mr_rd = rdf.getMetaRefreshDownloader(initial_url_rd);
                try {
                    is = mr_rd.download();
                } catch (ResourceDownloaderException e) {
                    Long response = (Long) mr_rd.getProperty("URL_HTTP_Response");
                    if (response != null && response.longValue() == 304) {
                        return (new pageDetails(initial_url, initial_url, ""));
                    } else {
                        throw (e);
                    }
                }
                if (needsAuth) {
                    List cookies_list = (List) mr_rd.getProperty("URL_Set-Cookie");
                    List cookies_set = new ArrayList();
                    if (cookies_list != null) {
                        for (int i = 0; i < cookies_list.size(); i++) {
                            String[] cookies = ((String) cookies_list.get(i)).split(";");
                            for (int j = 0; j < cookies.length; j++) {
                                String cookie = cookies[j].trim();
                                if (cookie.indexOf('=') != -1) {
                                    cookies_set.add(cookie);
                                }
                            }
                        }
                    }
                    // Not much we can do with the cookies anyway, as in general the ones
                    // set here are the missing/expired ones, not the existing ones. That is,
                    // we can't deduce anything from the fact that a required cookie is not
                    // 'set' here. The most we could do is catch a server that explicitly
                    // deletes invalid cookies by expiring them, but that is unlikely to be
                    // common practice. Also note the complexity of cookie syntax:
                    // Set-Cookie: old standard using expires=, newer using Max-Age
                    // Set-Cookie2:
                    // Maybe use http://jcookie.sourceforge.net/ if needed
                }
                if (only_if_modified) {
                    String last_modified = extractProperty(mr_rd.getProperty("URL_Last-Modified"));
                    String etag = extractProperty(mr_rd.getProperty("URL_ETag"));
                    if (last_modified != null) {
                        setLocalString(LD_LAST_MODIFIED, last_modified);
                    }
                    if (etag != null) {
                        setLocalString(LD_ETAG, etag);
                    }
                }
                List cts = (List) mr_rd.getProperty("URL_Content-Type");
                if (cts != null && cts.size() > 0) {
                    String content_type = (String) cts.get(0);
                    int pos = content_type.toLowerCase().indexOf("charset");
                    if (pos != -1) {
                        content_type = content_type.substring(pos + 1);
                        pos = content_type.indexOf('=');
                        if (pos != -1) {
                            content_type = content_type.substring(pos + 1).trim();
                            pos = content_type.indexOf(';');
                            if (pos != -1) {
                                content_type = content_type.substring(0, pos).trim();
                            }
                            if (content_type.startsWith("\"")) {
                                content_type = content_type.substring(1).trim();
                            }
                            if (content_type.endsWith("\"")) {
                                content_type = content_type.substring(0, content_type.length() - 1).trim();
                            }
                            try {
                                if (Charset.isSupported(content_type)) {
                                    debugLog("charset: " + content_type);
                                    content_charset = content_type;
                                }
                            } catch (Throwable e) {
                                try {
                                    // handle lowercase 'utf-8' for example
                                    content_type = content_type.toUpperCase();
                                    if (Charset.isSupported(content_type)) {
                                        debugLog("charset: " + content_type);
                                        content_charset = content_type;
                                    }
                                } catch (Throwable f) {
                                    log("Content type '" + content_type + "' not supported", f);
                                }
                            }
                        }
                    }
                }
            }
            ByteArrayOutputStream baos = new ByteArrayOutputStream(8192);
            byte[] buffer = new byte[8192];
            while (true) {
                int len = is.read(buffer);
                if (len <= 0) {
                    break;
                }
                baos.write(buffer, 0, len);
            }
            byte[] data = baos.toByteArray();
            if (vuze_file) {
                try {
                    VuzeFileHandler vfh = VuzeFileHandler.getSingleton();
                    VuzeFile vf = vfh.loadVuzeFile(data);
                    vfh.handleFiles(new VuzeFile[] { vf }, VuzeFileComponent.COMP_TYPE_NONE);
                } catch (Throwable e) {
                    Debug.out(e);
                }
                return (new pageDetails(initial_url, initial_url, null));
            }
            String page = null;
            String content = new String(data, 0, Math.min(data.length, 2048), content_charset);
            String lc_content = content.toLowerCase();
            {
                // first look for an xml charset declaration
                // e.g. <?xml version="1.0" encoding="windows-1251" ?>
                // (a standalone sketch of this sniffing appears at the end of this page)
                int pos1 = lc_content.indexOf("<?xml");
                if (pos1 != -1) {
                    int pos2 = lc_content.indexOf("?>");
                    if (pos2 != -1) {
                        int pos3 = lc_content.indexOf("encoding", pos1);
                        if (pos3 != -1) {
                            pos3 = lc_content.indexOf("\"", pos3);
                        }
                        if (pos3 > pos1 && pos3 < pos2) {
                            pos3++;
                            int pos4 = lc_content.indexOf("\"", pos3);
                            if (pos4 > pos3 && pos4 < pos2) {
                                String encoding = content.substring(pos3, pos4).trim();
                                try {
                                    if (Charset.isSupported(encoding)) {
                                        debugLog("charset from xml tag: " + encoding);
                                        content_charset = encoding;
                                        // some feeds have crap at the start which makes pos2 mismatch for the above '?' - adjust if necessary
                                        int data_start = pos2;
                                        int max_skip = 64;
                                        while (data[data_start] != '?' && max_skip-- > 0) {
                                            data_start++;
                                        }
                                        page = content.substring(0, pos3) + "utf-8" + content.substring(pos4, pos2) + new String(data, data_start, data.length - data_start, content_charset);
                                    }
                                } catch (Throwable e) {
                                    log("Content type '" + encoding + "' not supported", e);
                                }
                            }
                        }
                    }
                }
            }
            if (page == null) {
                // next look for http-equiv charset
                // e.g. <meta http-equiv="Content-Type" content="text/html; charset=windows-1251" />
                int pos = 0;
                while (true) {
                    int pos1 = lc_content.indexOf("http-equiv", pos);
                    if (pos1 != -1) {
                        int pos2 = lc_content.indexOf(">", pos1);
                        if (pos2 != -1) {
                            int pos3 = lc_content.indexOf("charset", pos1);
                            if (pos3 != -1 && pos3 < pos2) {
                                pos3 = lc_content.indexOf("=", pos3);
                                if (pos3 != -1) {
                                    pos3++;
                                    int pos4 = lc_content.indexOf("\"", pos3);
                                    if (pos4 != -1) {
                                        int pos5 = lc_content.indexOf(";", pos3);
                                        if (pos5 != -1 && pos5 < pos4) {
                                            pos4 = pos5;
                                        }
                                        String encoding = content.substring(pos3, pos4).trim();
                                        try {
                                            if (Charset.isSupported(encoding)) {
                                                debugLog("charset from http-equiv : " + encoding);
                                                content_charset = encoding;
                                                // some feeds have crap at the start which makes pos2 mismatch for the above '?' - adjust if necessary
                                                int data_start = pos2;
                                                int max_skip = 64;
                                                while (data[data_start] != '?' && max_skip-- > 0) {
                                                    data_start++;
                                                }
                                                page = content.substring(0, pos3) + "utf-8" + content.substring(pos4, pos2) + new String(data, data_start, data.length - data_start, content_charset);
                                            }
                                        } catch (Throwable e) {
                                            log("Content type '" + encoding + "' not supported", e);
                                        }
                                        break;
                                    }
                                }
                            }
                            pos = pos2;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }
            if (page == null) {
                page = new String(data, content_charset);
            }
            debugLog("page:");
            debugLog(page);
            try {
                Matcher m = baseTagPattern.matcher(page);
                if (m.find()) {
                    basePage = m.group(1);
                    debugLog("base_page: " + basePage);
                }
            } catch (Exception e) {
                // No BASE tag in the page
            }
            URL final_url = initial_url;
            if (mr_rd != null) {
                URL x = (URL) mr_rd.getProperty("URL_URL");
                if (x != null) {
                    final_url = x;
                }
            }
            return (new pageDetails(initial_url, final_url, page));
        } finally {
            if (is != null) {
                is.close();
            }
        }
    } catch (SearchException e) {
        throw (e);
    } catch (Throwable e) {
        // e.printStackTrace();
        debugLog("Failed to load page: " + Debug.getNestedExceptionMessageAndStack(e));
        throw (new SearchException("Failed to load page", e));
    } finally {
        TorrentUtils.setTLSDescription(null);
    }
}
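
Aside: the "azmethod=post_basic:" URL convention above is worth illustrating. Below is a minimal, self-contained sketch (illustrative only, not BiglyBT code; the class name and template URL are hypothetical) of how the method splits such a URL into a GET URL and a POST body:

public class AzMethodSplitDemo {

    public static void main(String[] args) {
        // hypothetical template URL, mirroring the example in the comment above
        String searchURL = "http://xxxx/index.php?main=search&azmethod=post_basic:SearchString1=%s&SearchString=&search=Search";

        int post_pos = searchURL.indexOf("azmethod=");
        // drop the '&' (or '?') immediately preceding the marker, as the method does
        String get_url = searchURL.substring(0, post_pos - 1);
        String post_params = searchURL.substring(post_pos + "azmethod=".length());

        int sep = post_params.indexOf(':');
        String type = post_params.substring(0, sep);   // "post_basic" is the only supported type
        String body = post_params.substring(sep + 1);  // sent as application/x-www-form-urlencoded

        System.out.println("GET url  : " + get_url);   // http://xxxx/index.php?main=search
        System.out.println("type     : " + type);      // post_basic
        System.out.println("POST body: " + body);      // SearchString1=%s&SearchString=&search=Search
    }
}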
Use of com.biglybt.core.metasearch.SearchException in project BiglyBT by BiglySoftware.
The class WebEngine, method getWebPageContent:
protected pageDetails getWebPageContent(SearchParameter[] searchParameters, Map<String, String> searchContext, String headers, boolean only_if_modified, pageDetailsVerifier verifier) throws SearchException {
    String searchURL = searchURLFormat;
    String lc_url = searchURL.toLowerCase(Locale.US);
    boolean explicit_tor = lc_url.startsWith("tor:");
    boolean user_tor = false;
    if (!explicit_tor) {
        String test = Result.adjustLink(searchURL);
        if (test.startsWith("tor:")) {
            user_tor = true;
        }
    }
    if (explicit_tor || user_tor) {
        // strip out any stuff we probably don't want to send
        searchContext = new HashMap<>();
        String target_resource = explicit_tor ? searchURL.substring(4) : searchURL;
        URL location;
        try {
            location = new URL(target_resource);
        } catch (MalformedURLException e) {
            throw (new SearchException(e));
        }
        Map<String, Object> options = new HashMap<>();
        options.put(AEProxyFactory.PO_PEER_NETWORKS, new String[] { AENetworkClassifier.AT_TOR });
        PluginProxy plugin_proxy = AEProxyFactory.getPluginProxy("Web engine download of '" + target_resource + "'", location, options, true);
        if (plugin_proxy == null) {
            throw (new SearchException("No Tor plugin proxy available for '" + target_resource + "'"));
        }
        URL url = plugin_proxy.getURL();
        Proxy proxy = plugin_proxy.getProxy();
        boolean ok = false;
        try {
            String proxy_host = location.getHost() + (location.getPort() == -1 ? "" : (":" + location.getPort()));
            pageDetails details = getWebPageContentSupport(proxy, proxy_host, url.toExternalForm(), searchParameters, searchContext, headers, only_if_modified);
            if (verifier != null) {
                verifier.verify(details);
            }
            ok = true;
            return (details);
        } finally {
            plugin_proxy.setOK(ok);
        }
    }
    try {
        try {
            URL url = new URL(searchURL);
            if (AENetworkClassifier.categoriseAddress(url.getHost()) != AENetworkClassifier.AT_PUBLIC) {
                // strip out any stuff we probably don't want to send
                searchContext = new HashMap<>();
            }
        } catch (Throwable e) {
        }
        pageDetails details = getWebPageContentSupport(null, null, searchURL, searchParameters, searchContext, headers, only_if_modified);
        if (verifier != null) {
            verifier.verify(details);
        }
        return (details);
    } catch (SearchException e) {
        try {
            URL original_url = new URL(searchURL);
            PluginProxy plugin_proxy = AEProxyFactory.getPluginProxy("getting search results ", original_url);
            if (plugin_proxy == null) {
                throw (e);
            } else {
                URL url = plugin_proxy.getURL();
                Proxy proxy = plugin_proxy.getProxy();
                boolean ok = false;
                try {
                    String proxy_host = original_url.getHost() + (original_url.getPort() == -1 ? "" : (":" + original_url.getPort()));
                    pageDetails details = getWebPageContentSupport(proxy, proxy_host, url.toExternalForm(), searchParameters, searchContext, headers, only_if_modified);
                    if (verifier != null) {
                        verifier.verify(details);
                    }
                    ok = true;
                    return (details);
                } finally {
                    plugin_proxy.setOK(ok);
                }
            }
        } catch (Throwable f) {
            throw (e);
        }
    }
}
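
Aside: a hedged, self-contained sketch (illustrative only, not BiglyBT code; the class name and helper method are hypothetical) of the XML-declaration charset sniff that getWebPageContentSupport performs above. It probes the first bytes for <?xml ... encoding="..." ?> and, if the declared charset is supported, re-decodes the raw bytes with it, falling back to UTF-8 as the method does:

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class XmlCharsetSniffDemo {

    static String decode(byte[] data) {
        // decode a bounded prefix with a single-byte charset so that indexes
        // into the probe string line up with byte offsets
        String probe = new String(data, 0, Math.min(data.length, 2048), StandardCharsets.ISO_8859_1);
        String lc = probe.toLowerCase();
        int p1 = lc.indexOf("<?xml");
        if (p1 != -1) {
            int p2 = lc.indexOf("?>", p1);
            int p3 = lc.indexOf("encoding=\"", p1);
            if (p2 != -1 && p3 != -1 && p3 < p2) {
                p3 += "encoding=\"".length();
                int p4 = lc.indexOf('"', p3);
                if (p4 != -1 && p4 < p2) {
                    String encoding = probe.substring(p3, p4).trim();
                    try {
                        if (Charset.isSupported(encoding)) {
                            return new String(data, Charset.forName(encoding));
                        }
                    } catch (Throwable e) {
                        // illegal charset name - fall through to the UTF-8 default
                    }
                }
            }
        }
        return new String(data, StandardCharsets.UTF_8);
    }

    public static void main(String[] args) {
        byte[] data = "<?xml version=\"1.0\" encoding=\"windows-1251\"?><rss/>"
                .getBytes(StandardCharsets.ISO_8859_1);
        System.out.println(decode(data));
    }
}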