Search in sources :

Example 1 with Result

use of com.google.refine.importing.UrlRewriter.Result in project OpenRefine by OpenRefine.

the class ImportingUtilities method retrieveContentFromPostRequest.

public static void retrieveContentFromPostRequest(HttpServletRequest request, Properties parameters, File rawDataDir, JSONObject retrievalRecord, final Progress progress) throws Exception {
    JSONArray fileRecords = new JSONArray();
    JSONUtilities.safePut(retrievalRecord, "files", fileRecords);
    int clipboardCount = 0;
    int uploadCount = 0;
    int downloadCount = 0;
    int archiveCount = 0;
    // This tracks the total progress, which involves uploading data from the client
    // as well as downloading data from URLs.
    final SavingUpdate update = new SavingUpdate() {

        @Override
        public void savedMore() {
            progress.setProgress(null, calculateProgressPercent(totalExpectedSize, totalRetrievedSize));
        }

        @Override
        public boolean isCanceled() {
            return progress.isCanceled();
        }
    };
    DiskFileItemFactory fileItemFactory = new DiskFileItemFactory();
    ServletFileUpload upload = new ServletFileUpload(fileItemFactory);
    upload.setProgressListener(new ProgressListener() {

        boolean setContentLength = false;

        long lastBytesRead = 0;

        @Override
        public void update(long bytesRead, long contentLength, int itemCount) {
            if (!setContentLength) {
                // Only try to set the content length if we really know it.
                if (contentLength >= 0) {
                    update.totalExpectedSize += contentLength;
                    setContentLength = true;
                }
            }
            if (setContentLength) {
                update.totalRetrievedSize += (bytesRead - lastBytesRead);
                lastBytesRead = bytesRead;
                update.savedMore();
            }
        }
    });
    @SuppressWarnings("unchecked") List<FileItem> tempFiles = (List<FileItem>) upload.parseRequest(request);
    progress.setProgress("Uploading data ...", -1);
    parts: for (FileItem fileItem : tempFiles) {
        if (progress.isCanceled()) {
            break;
        }
        InputStream stream = fileItem.getInputStream();
        String name = fileItem.getFieldName().toLowerCase();
        if (fileItem.isFormField()) {
            if (name.equals("clipboard")) {
                String encoding = request.getCharacterEncoding();
                if (encoding == null) {
                    encoding = "UTF-8";
                }
                File file = allocateFile(rawDataDir, "clipboard.txt");
                JSONObject fileRecord = new JSONObject();
                JSONUtilities.safePut(fileRecord, "origin", "clipboard");
                JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding);
                JSONUtilities.safePut(fileRecord, "declaredMimeType", (String) null);
                JSONUtilities.safePut(fileRecord, "format", "text");
                JSONUtilities.safePut(fileRecord, "fileName", "(clipboard)");
                JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
                progress.setProgress("Uploading pasted clipboard text", calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
                JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null));
                JSONUtilities.append(fileRecords, fileRecord);
                clipboardCount++;
            } else if (name.equals("download")) {
                String urlString = Streams.asString(stream);
                URL url = new URL(urlString);
                JSONObject fileRecord = new JSONObject();
                JSONUtilities.safePut(fileRecord, "origin", "download");
                JSONUtilities.safePut(fileRecord, "url", urlString);
                for (UrlRewriter rewriter : ImportingManager.urlRewriters) {
                    Result result = rewriter.rewrite(urlString);
                    if (result != null) {
                        urlString = result.rewrittenUrl;
                        url = new URL(urlString);
                        JSONUtilities.safePut(fileRecord, "url", urlString);
                        JSONUtilities.safePut(fileRecord, "format", result.format);
                        if (!result.download) {
                            downloadCount++;
                            JSONUtilities.append(fileRecords, fileRecord);
                            continue parts;
                        }
                    }
                }
                if ("http".equals(url.getProtocol()) || "https".equals(url.getProtocol())) {
                    DefaultHttpClient client = new DefaultHttpClient();
                    DecompressingHttpClient httpclient = new DecompressingHttpClient(client);
                    HttpGet httpGet = new HttpGet(url.toURI());
                    httpGet.setHeader("User-Agent", RefineServlet.getUserAgent());
                    if ("https".equals(url.getProtocol())) {
                        // HTTPS only - no sending password in the clear over HTTP
                        String userinfo = url.getUserInfo();
                        if (userinfo != null) {
                            int s = userinfo.indexOf(':');
                            if (s > 0) {
                                String user = userinfo.substring(0, s);
                                String pw = userinfo.substring(s + 1, userinfo.length());
                                client.getCredentialsProvider().setCredentials(new AuthScope(url.getHost(), 443), new UsernamePasswordCredentials(user, pw));
                            }
                        }
                    }
                    HttpResponse response = httpclient.execute(httpGet);
                    try {
                        response.getStatusLine();
                        HttpEntity entity = response.getEntity();
                        if (entity == null) {
                            throw new Exception("No content found in " + url.toString());
                        }
                        InputStream stream2 = entity.getContent();
                        String encoding = null;
                        if (entity.getContentEncoding() != null) {
                            encoding = entity.getContentEncoding().getValue();
                        }
                        JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding);
                        String contentType = null;
                        if (entity.getContentType() != null) {
                            contentType = entity.getContentType().getValue();
                        }
                        JSONUtilities.safePut(fileRecord, "declaredMimeType", contentType);
                        if (saveStream(stream2, url, rawDataDir, progress, update, fileRecord, fileRecords, entity.getContentLength())) {
                            archiveCount++;
                        }
                        downloadCount++;
                        EntityUtils.consume(entity);
                    } finally {
                        httpGet.releaseConnection();
                    }
                } else {
                    // Fallback handling for non HTTP connections (only FTP?)
                    URLConnection urlConnection = url.openConnection();
                    urlConnection.setConnectTimeout(5000);
                    urlConnection.connect();
                    InputStream stream2 = urlConnection.getInputStream();
                    JSONUtilities.safePut(fileRecord, "declaredEncoding", urlConnection.getContentEncoding());
                    JSONUtilities.safePut(fileRecord, "declaredMimeType", urlConnection.getContentType());
                    try {
                        if (saveStream(stream2, url, rawDataDir, progress, update, fileRecord, fileRecords, urlConnection.getContentLength())) {
                            archiveCount++;
                        }
                        downloadCount++;
                    } finally {
                        stream2.close();
                    }
                }
            } else {
                String value = Streams.asString(stream);
                parameters.put(name, value);
            // TODO: We really want to store this on the request so it's available for everyone
            //                    request.getParameterMap().put(name, value);
            }
        } else {
            // is file content
            String fileName = fileItem.getName();
            if (fileName.length() > 0) {
                long fileSize = fileItem.getSize();
                File file = allocateFile(rawDataDir, fileName);
                JSONObject fileRecord = new JSONObject();
                JSONUtilities.safePut(fileRecord, "origin", "upload");
                JSONUtilities.safePut(fileRecord, "declaredEncoding", request.getCharacterEncoding());
                JSONUtilities.safePut(fileRecord, "declaredMimeType", fileItem.getContentType());
                JSONUtilities.safePut(fileRecord, "fileName", fileName);
                JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
                progress.setProgress("Saving file " + fileName + " locally (" + formatBytes(fileSize) + " bytes)", calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
                JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null));
                if (postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress)) {
                    archiveCount++;
                }
                uploadCount++;
            }
        }
        stream.close();
    }
    // Delete all temp files.
    for (FileItem fileItem : tempFiles) {
        fileItem.delete();
    }
    JSONUtilities.safePut(retrievalRecord, "uploadCount", uploadCount);
    JSONUtilities.safePut(retrievalRecord, "downloadCount", downloadCount);
    JSONUtilities.safePut(retrievalRecord, "clipboardCount", clipboardCount);
    JSONUtilities.safePut(retrievalRecord, "archiveCount", archiveCount);
}
Also used : HttpEntity(org.apache.http.HttpEntity) HttpGet(org.apache.http.client.methods.HttpGet) URL(java.net.URL) DefaultHttpClient(org.apache.http.impl.client.DefaultHttpClient) Result(com.google.refine.importing.UrlRewriter.Result) ServletFileUpload(org.apache.commons.fileupload.servlet.ServletFileUpload) List(java.util.List) ArrayList(java.util.ArrayList) GZIPInputStream(java.util.zip.GZIPInputStream) ZipInputStream(java.util.zip.ZipInputStream) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream) FileInputStream(java.io.FileInputStream) TarInputStream(org.apache.tools.tar.TarInputStream) InputStream(java.io.InputStream) JSONArray(org.json.JSONArray) HttpResponse(org.apache.http.HttpResponse) DiskFileItemFactory(org.apache.commons.fileupload.disk.DiskFileItemFactory) DecompressingHttpClient(org.apache.http.impl.client.DecompressingHttpClient) ServletException(javax.servlet.ServletException) FileNotFoundException(java.io.FileNotFoundException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) URLConnection(java.net.URLConnection) UsernamePasswordCredentials(org.apache.http.auth.UsernamePasswordCredentials) FileItem(org.apache.commons.fileupload.FileItem) ProgressListener(org.apache.commons.fileupload.ProgressListener) JSONObject(org.json.JSONObject) AuthScope(org.apache.http.auth.AuthScope) File(java.io.File)

Example 2 with Result

use of com.google.refine.importing.UrlRewriter.Result in project OpenRefine by OpenRefine.

the class ImportingUtilities method retrieveContentFromPostRequest.

public static void retrieveContentFromPostRequest(HttpServletRequest request, Properties parameters, File rawDataDir, ObjectNode retrievalRecord, final Progress progress) throws IOException, FileUploadException {
    ArrayNode fileRecords = ParsingUtilities.mapper.createArrayNode();
    JSONUtilities.safePut(retrievalRecord, "files", fileRecords);
    int clipboardCount = 0;
    int uploadCount = 0;
    int downloadCount = 0;
    int archiveCount = 0;
    // This tracks the total progress, which involves uploading data from the client
    // as well as downloading data from URLs.
    final SavingUpdate update = new SavingUpdate() {

        @Override
        public void savedMore() {
            progress.setProgress(null, calculateProgressPercent(totalExpectedSize, totalRetrievedSize));
        }

        @Override
        public boolean isCanceled() {
            return progress.isCanceled();
        }
    };
    DiskFileItemFactory fileItemFactory = new DiskFileItemFactory();
    ServletFileUpload upload = new ServletFileUpload(fileItemFactory);
    upload.setProgressListener(new ProgressListener() {

        boolean setContentLength = false;

        long lastBytesRead = 0;

        @Override
        public void update(long bytesRead, long contentLength, int itemCount) {
            if (!setContentLength) {
                // Only try to set the content length if we really know it.
                if (contentLength >= 0) {
                    update.totalExpectedSize += contentLength;
                    setContentLength = true;
                }
            }
            if (setContentLength) {
                update.totalRetrievedSize += (bytesRead - lastBytesRead);
                lastBytesRead = bytesRead;
                update.savedMore();
            }
        }
    });
    List<FileItem> tempFiles = (List<FileItem>) upload.parseRequest(request);
    progress.setProgress("Uploading data ...", -1);
    parts: for (FileItem fileItem : tempFiles) {
        if (progress.isCanceled()) {
            break;
        }
        InputStream stream = fileItem.getInputStream();
        String name = fileItem.getFieldName().toLowerCase();
        if (fileItem.isFormField()) {
            if (name.equals("clipboard")) {
                String encoding = request.getCharacterEncoding();
                if (encoding == null) {
                    encoding = "UTF-8";
                }
                File file = allocateFile(rawDataDir, "clipboard.txt");
                ObjectNode fileRecord = ParsingUtilities.mapper.createObjectNode();
                JSONUtilities.safePut(fileRecord, "origin", "clipboard");
                JSONUtilities.safePut(fileRecord, "declaredEncoding", encoding);
                JSONUtilities.safePut(fileRecord, "declaredMimeType", (String) null);
                JSONUtilities.safePut(fileRecord, "format", "text");
                JSONUtilities.safePut(fileRecord, "fileName", "(clipboard)");
                JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
                progress.setProgress("Uploading pasted clipboard text", calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
                JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null));
                JSONUtilities.append(fileRecords, fileRecord);
                clipboardCount++;
            } else if (name.equals("download")) {
                String urlString = Streams.asString(stream);
                URL url = new URL(urlString);
                ObjectNode fileRecord = ParsingUtilities.mapper.createObjectNode();
                JSONUtilities.safePut(fileRecord, "origin", "download");
                JSONUtilities.safePut(fileRecord, "url", urlString);
                for (UrlRewriter rewriter : ImportingManager.urlRewriters) {
                    Result result = rewriter.rewrite(urlString);
                    if (result != null) {
                        urlString = result.rewrittenUrl;
                        url = new URL(urlString);
                        JSONUtilities.safePut(fileRecord, "url", urlString);
                        JSONUtilities.safePut(fileRecord, "format", result.format);
                        if (!result.download) {
                            downloadCount++;
                            JSONUtilities.append(fileRecords, fileRecord);
                            continue parts;
                        }
                    }
                }
                if ("http".equals(url.getProtocol()) || "https".equals(url.getProtocol())) {
                    final URL lastUrl = url;
                    final HttpClientResponseHandler<String> responseHandler = new HttpClientResponseHandler<String>() {

                        @Override
                        public String handleResponse(final ClassicHttpResponse response) throws IOException {
                            final int status = response.getCode();
                            if (status >= HttpStatus.SC_SUCCESS && status < HttpStatus.SC_REDIRECTION) {
                                final HttpEntity entity = response.getEntity();
                                if (entity == null) {
                                    throw new IOException("No content found in " + lastUrl.toExternalForm());
                                }
                                try {
                                    InputStream stream2 = entity.getContent();
                                    String mimeType = null;
                                    String charset = null;
                                    ContentType contentType = ContentType.parse(entity.getContentType());
                                    if (contentType != null) {
                                        mimeType = contentType.getMimeType();
                                        Charset cs = contentType.getCharset();
                                        if (cs != null) {
                                            charset = cs.toString();
                                        }
                                    }
                                    JSONUtilities.safePut(fileRecord, "declaredMimeType", mimeType);
                                    JSONUtilities.safePut(fileRecord, "declaredEncoding", charset);
                                    if (saveStream(stream2, lastUrl, rawDataDir, progress, update, fileRecord, fileRecords, entity.getContentLength())) {
                                        // signal to increment archive count
                                        return "saved";
                                    }
                                } catch (final IOException ex) {
                                    throw new ClientProtocolException(ex);
                                }
                                return null;
                            } else {
                                // String errorBody = EntityUtils.toString(response.getEntity());
                                throw new ClientProtocolException(String.format("HTTP error %d : %s for URL %s", status, response.getReasonPhrase(), lastUrl.toExternalForm()));
                            }
                        }
                    };
                    HttpClient httpClient = new HttpClient();
                    if (httpClient.getResponse(urlString, null, responseHandler) != null) {
                        archiveCount++;
                    }
                    ;
                    downloadCount++;
                } else {
                    // Fallback handling for non HTTP connections (only FTP?)
                    URLConnection urlConnection = url.openConnection();
                    urlConnection.setConnectTimeout(5000);
                    urlConnection.connect();
                    InputStream stream2 = urlConnection.getInputStream();
                    JSONUtilities.safePut(fileRecord, "declaredEncoding", urlConnection.getContentEncoding());
                    JSONUtilities.safePut(fileRecord, "declaredMimeType", urlConnection.getContentType());
                    try {
                        if (saveStream(stream2, url, rawDataDir, progress, update, fileRecord, fileRecords, urlConnection.getContentLength())) {
                            archiveCount++;
                        }
                        downloadCount++;
                    } finally {
                        stream2.close();
                    }
                }
            } else {
                String value = Streams.asString(stream);
                parameters.put(name, value);
            // TODO: We really want to store this on the request so it's available for everyone
            // request.getParameterMap().put(name, value);
            }
        } else {
            // is file content
            String fileName = fileItem.getName();
            if (fileName.length() > 0) {
                long fileSize = fileItem.getSize();
                File file = allocateFile(rawDataDir, fileName);
                ObjectNode fileRecord = ParsingUtilities.mapper.createObjectNode();
                JSONUtilities.safePut(fileRecord, "origin", "upload");
                JSONUtilities.safePut(fileRecord, "declaredEncoding", request.getCharacterEncoding());
                JSONUtilities.safePut(fileRecord, "declaredMimeType", fileItem.getContentType());
                JSONUtilities.safePut(fileRecord, "fileName", fileName);
                JSONUtilities.safePut(fileRecord, "location", getRelativePath(file, rawDataDir));
                progress.setProgress("Saving file " + fileName + " locally (" + formatBytes(fileSize) + " bytes)", calculateProgressPercent(update.totalExpectedSize, update.totalRetrievedSize));
                JSONUtilities.safePut(fileRecord, "size", saveStreamToFile(stream, file, null));
                // TODO: This needs to be refactored to be able to test import from archives
                if (postProcessRetrievedFile(rawDataDir, file, fileRecord, fileRecords, progress)) {
                    archiveCount++;
                }
                uploadCount++;
            }
        }
        stream.close();
    }
    // Delete all temp files.
    for (FileItem fileItem : tempFiles) {
        fileItem.delete();
    }
    JSONUtilities.safePut(retrievalRecord, "uploadCount", uploadCount);
    JSONUtilities.safePut(retrievalRecord, "downloadCount", downloadCount);
    JSONUtilities.safePut(retrievalRecord, "clipboardCount", clipboardCount);
    JSONUtilities.safePut(retrievalRecord, "archiveCount", archiveCount);
}
Also used : HttpEntity(org.apache.hc.core5.http.HttpEntity) ContentType(org.apache.hc.core5.http.ContentType) URL(java.net.URL) Result(com.google.refine.importing.UrlRewriter.Result) ClientProtocolException(org.apache.hc.client5.http.ClientProtocolException) ServletFileUpload(org.apache.commons.fileupload.servlet.ServletFileUpload) List(java.util.List) ArrayList(java.util.ArrayList) ArrayNode(com.fasterxml.jackson.databind.node.ArrayNode) ClassicHttpResponse(org.apache.hc.core5.http.ClassicHttpResponse) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) GZIPInputStream(java.util.zip.GZIPInputStream) ZipInputStream(java.util.zip.ZipInputStream) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Charset(java.nio.charset.Charset) IOException(java.io.IOException) DiskFileItemFactory(org.apache.commons.fileupload.disk.DiskFileItemFactory) URLConnection(java.net.URLConnection) FileItem(org.apache.commons.fileupload.FileItem) HttpClientResponseHandler(org.apache.hc.core5.http.io.HttpClientResponseHandler) ProgressListener(org.apache.commons.fileupload.ProgressListener) HttpClient(com.google.refine.util.HttpClient) File(java.io.File)

Aggregations

Result (com.google.refine.importing.UrlRewriter.Result)2 File (java.io.File)2 FileInputStream (java.io.FileInputStream)2 IOException (java.io.IOException)2 InputStream (java.io.InputStream)2 URL (java.net.URL)2 URLConnection (java.net.URLConnection)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 GZIPInputStream (java.util.zip.GZIPInputStream)2 ZipInputStream (java.util.zip.ZipInputStream)2 FileItem (org.apache.commons.fileupload.FileItem)2 ProgressListener (org.apache.commons.fileupload.ProgressListener)2 DiskFileItemFactory (org.apache.commons.fileupload.disk.DiskFileItemFactory)2 ServletFileUpload (org.apache.commons.fileupload.servlet.ServletFileUpload)2 ArrayNode (com.fasterxml.jackson.databind.node.ArrayNode)1 ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode)1 HttpClient (com.google.refine.util.HttpClient)1 FileNotFoundException (java.io.FileNotFoundException)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1