Examples with ParsedField - nl.basjes.parse.core.ParsedField

Example 6 with ParsedField

use of nl.basjes.parse.core.ParsedField in project logparser by nielsbasjes.

the class StrfTimeStampDissector method dissect.

/**
 * Fetches the requested input field from the parsable and hands it, together with
 * the parsable and the field name, to the wrapped {@code timeStampDissector}.
 *
 * @param parsable  the record currently being dissected
 * @param inputname the name of the field that must be dissected
 * @throws DissectionFailure as declared by the dissector contract
 */
@Override
public void dissect(Parsable<?> parsable, String inputname) throws DissectionFailure {
    timeStampDissector.dissect(
        parsable.getParsableField(inputType, inputname),
        parsable,
        inputname);
}

Also used : ParsedField(nl.basjes.parse.core.ParsedField)

Example 7 with ParsedField

use of nl.basjes.parse.core.ParsedField in project logparser by nielsbasjes.

the class HttpFirstLineProtocolDissector method dissect.

// --------------------------------------------
/**
 * Splits a protocol value on its first {@code '/'} into the protocol name and
 * the protocol version and emits both as dissections.
 *
 * <p>A {@code null}, empty or {@code "-"} value produces no output at all. A value
 * without any {@code '/'} produces both dissections with a {@code null} value.
 *
 * @param parsable  the record currently being dissected
 * @param inputname the name of the field that must be dissected
 * @throws DissectionFailure as declared by the dissector contract
 */
@Override
public void dissect(final Parsable<?> parsable, final String inputname) throws DissectionFailure {
    final ParsedField protocolField = parsable.getParsableField(INPUT_TYPE, inputname);
    final String protocolValue = protocolField.getValue().getString();

    final boolean nothingToDissect =
        protocolValue == null || protocolValue.isEmpty() || "-".equals(protocolValue);
    if (nothingToDissect) {
        return;
    }

    // Everything before the first '/' is the protocol, everything after it is the version.
    final int separatorIndex = protocolValue.indexOf('/');
    if (separatorIndex < 0) {
        // No separator present. NOTE(review): presumably this is a first line that was
        // cut off by the webserver because the URI was too long - confirm with callers.
        // Both outputs are explicitly emitted as null in that case.
        parsable.addDissection(inputname, "HTTP.PROTOCOL", "", (String) null);
        parsable.addDissection(inputname, "HTTP.PROTOCOL.VERSION", "version", (String) null);
        return;
    }

    outputDissection(parsable, inputname, "HTTP.PROTOCOL", "",
        protocolValue.substring(0, separatorIndex));
    outputDissection(parsable, inputname, "HTTP.PROTOCOL.VERSION", "version",
        protocolValue.substring(separatorIndex + 1));
}

Also used : ParsedField(nl.basjes.parse.core.ParsedField)

Example 8 with ParsedField

use of nl.basjes.parse.core.ParsedField in project yauaa by nielsbasjes.

the class UserAgentDissector method dissect.

/**
 * Parses the user agent string found in the input field and emits one dissection
 * for every entry in {@code requestedFieldNames}.
 *
 * <p>When the field holds no string value ({@code null}) nothing is emitted.
 *
 * @param parsable  the record currently being dissected
 * @param inputname the name of the field that must be dissected
 * @throws DissectionFailure as declared by the dissector contract
 */
@Override
public void dissect(Parsable<?> parsable, String inputname) throws DissectionFailure {
    final String rawUserAgent = parsable.getParsableField(INPUT_TYPE, inputname).getValue().getString();

    // A null value is unexpected but does occur in practice; there is nothing to analyze then.
    if (rawUserAgent != null) {
        final UserAgent analyzed = userAgentAnalyzer.parse(rawUserAgent);
        for (String requested : requestedFieldNames) {
            parsable.addDissection(
                inputname,
                getFieldOutputType(requested),
                fieldNameToDissectionName(requested),
                analyzed.getValue(requested));
        }
    }
}

Also used : ParsedField(nl.basjes.parse.core.ParsedField) UserAgent(nl.basjes.parse.useragent.UserAgent)

Example 9 with ParsedField

use of nl.basjes.parse.core.ParsedField in project logparser by nielsbasjes.

the class HttpFirstLineDissector method dissect.

// --------------------------------------------
/**
 * Dissects an HTTP first line into method, URI and protocol.
 *
 * <p>The value is first matched against {@code firstlineSplitter} (three groups:
 * method, URI, protocol). If that does not match, {@code tooLongFirstlineSplitter}
 * (two groups: method, URI) is tried and the protocol is emitted as {@code null}.
 * A {@code null}, empty or {@code "-"} value, or a value matching neither pattern,
 * produces no output.
 *
 * @param parsable  the record currently being dissected
 * @param inputname the name of the field that must be dissected
 * @throws DissectionFailure as declared by the dissector contract
 */
@Override
public void dissect(final Parsable<?> parsable, final String inputname) throws DissectionFailure {
    final ParsedField firstLineField = parsable.getParsableField(HTTP_FIRSTLINE, inputname);
    final String firstLine = firstLineField.getValue().getString();
    if (firstLine == null || firstLine.isEmpty() || "-".equals(firstLine)) {
        return;
    }

    // The normal case: a complete first line with all three parts present.
    final Matcher completeLine = firstlineSplitter.matcher(firstLine);
    if (completeLine.find() && completeLine.groupCount() == 3) {
        outputDissection(parsable, inputname, "HTTP.METHOD", "method", completeLine, 1);
        outputDissection(parsable, inputname, "HTTP.URI", "uri", completeLine, 2);
        outputDissection(parsable, inputname, "HTTP.PROTOCOL_VERSION", "protocol", completeLine, 3);
        return;
    }

    // When the URI is too long the trailing part ("HTTP/1.1") may have been cut off by the
    // Apache HTTPD webserver. The fallback pattern still extracts the method and the URI.
    final Matcher truncatedLine = tooLongFirstlineSplitter.matcher(firstLine);
    if (truncatedLine.find() && truncatedLine.groupCount() == 2) {
        outputDissection(parsable, inputname, "HTTP.METHOD", "method", truncatedLine, 1);
        outputDissection(parsable, inputname, "HTTP.URI", "uri", truncatedLine, 2);
        parsable.addDissection(inputname, "HTTP.PROTOCOL_VERSION", "protocol", (String) null);
    }
}

Also used : Matcher(java.util.regex.Matcher) ParsedField(nl.basjes.parse.core.ParsedField)

Example 10 with ParsedField

use of nl.basjes.parse.core.ParsedField in project logparser by nielsbasjes.

the class HttpUriDissector method dissect.

/**
 * Dissects a URI field into its parts and emits the parts that were requested
 * through the {@code want*} flags.
 *
 * <p>The raw value is first repaired by {@link #cleanupUriString(String)} and then
 * parsed with {@link URI#create(String)}. A value starting with {@code '/'} is parsed
 * behind a dummy protocol and host; for such values only query, path and ref are
 * emitted, never the faked protocol, userinfo, host or port.
 *
 * <p>A {@code null} or empty value (before or after the cleanup) produces no output.
 *
 * @param parsable  the record currently being dissected
 * @param inputname the name of the field that must be dissected
 * @throws DissectionFailure when the cleaned-up value cannot be parsed as a URI
 */
@Override
public void dissect(final Parsable<?> parsable, final String inputname) throws DissectionFailure {
    final ParsedField field = parsable.getParsableField(INPUT_TYPE, inputname);
    final String rawUri = field.getValue().getString();
    if (rawUri == null || rawUri.isEmpty()) {
        // Nothing to do here
        return;
    }

    final String uriString = cleanupUriString(rawUri);
    if (uriString.isEmpty()) {
        // Same "nothing to do" rule as above; also keeps charAt(0) below from throwing
        // an unchecked StringIndexOutOfBoundsException.
        return;
    }

    boolean isUrl = true;
    URI uri;
    try {
        if (uriString.charAt(0) == '/') {
            uri = URI.create("dummy-protocol://dummy.host.name" + uriString);
            // I.e. we do not return the values we just faked.
            isUrl = false;
        } else {
            uri = URI.create(uriString);
        }
    } catch (IllegalArgumentException e) {
        throw new DissectionFailure("Failed to parse URI >>" + field.getValue().getString() + "<< because of : " + e.getMessage());
    }

    if (wantQuery) {
        addNonEmptyDissection(parsable, inputname, "HTTP.QUERYSTRING", "query", uri.getRawQuery());
    }
    if (wantPath) {
        addNonEmptyDissection(parsable, inputname, "HTTP.PATH", "path", uri.getPath());
    }
    if (wantRef) {
        addNonEmptyDissection(parsable, inputname, "HTTP.REF", "ref", uri.getFragment());
    }

    if (isUrl) {
        if (wantProtocol) {
            addNonEmptyDissection(parsable, inputname, "HTTP.PROTOCOL", "protocol", uri.getScheme());
        }
        if (wantUserinfo) {
            addNonEmptyDissection(parsable, inputname, "HTTP.USERINFO", "userinfo", uri.getUserInfo());
        }
        if (wantHost) {
            addNonEmptyDissection(parsable, inputname, "HTTP.HOST", "host", uri.getHost());
        }
        if (wantPort) {
            int value = uri.getPort();
            // URI.getPort() reports -1 when no port is defined.
            if (value != -1) {
                parsable.addDissection(inputname, "HTTP.PORT", "port", value);
            }
        }
    }
}

/**
 * Repairs a raw URI string so the standard URI parser fails less often on 'garbage' input.
 * The steps are order dependent and are applied exactly in the sequence written below.
 *
 * @param rawUri the non-empty value as found in the input
 * @return the repaired string that is handed to the URI parser
 * @throws DissectionFailure only if one of the used helpers raises it
 */
private String cleanupUriString(final String rawUri) throws DissectionFailure {
    // First we cleanup the URI so we fail less often over 'garbage' URIs.
    // See: https://stackoverflow.com/questions/11038967/brackets-in-a-request-url-are-legal-but-not-in-a-uri-java
    String uriString = new String(URLCodec.encodeUrl(BAD_URI_CHARS, rawUri.getBytes(UTF_8)), US_ASCII);

    // Now we translate any HTML encoded entities/characters into URL UTF-8 encoded characters
    uriString = makeHTMLEncodedInert(uriString);

    // Before we hand it to the standard parser we hack it around a bit so we can parse
    // nasty edge cases that are illegal yet do occur in real clickstreams.
    // Also we force the query string to start with ?& so the returned query string starts with &
    // Which leads to more consistent output after parsing.
    int firstQuestionMark = uriString.indexOf('?');
    int firstAmpersand = uriString.indexOf('&');
    if (firstAmpersand != -1 || firstQuestionMark != -1) {
        // Every '?' becomes '&' and then the first '&' becomes "?&", giving: ?&x=x&y=y&z=z
        uriString = uriString.replace('?', '&');
        uriString = uriString.replaceFirst("&", "?&");
    }

    // We find that people muck up the URL by putting % signs in the URLs that are NOT escape sequences
    // So any % that is not followed by a two 'hex' letters is fixed.
    // The replacement is deliberately applied twice, exactly as the original code did.
    // NOTE(review): presumably the second pass catches matches that overlap with the first - confirm.
    uriString = BAD_EXCAPE_PATTERN.matcher(uriString).replaceAll("%25$1");
    uriString = BAD_EXCAPE_PATTERN.matcher(uriString).replaceAll("%25$1");

    // We have URIs with fragments like this:
    // /path/?_requestid=1234#x3D;12341234&Referrer&#x3D;blablabla
    // So first we repair the broken encoded char
    uriString = ALMOST_HTML_ENCODED.matcher(uriString).replaceAll("$1&$2");
    uriString = StringEscapeUtils.unescapeHtml4(uriString);

    // And we see URIs with this:
    // /path/?Referrer=ADV1234#&f=API&subid=#&name=12341234
    uriString = EQUALS_HASH_PATTERN.matcher(uriString).replaceAll("=");
    uriString = HASH_AMP_PATTERN.matcher(uriString).replaceAll("&");

    // If we still have multiple '#' in here we replace them with something else: '~'
    while (true) {
        Matcher doubleHashMatcher = DOUBLE_HASH_PATTERN.matcher(uriString);
        if (!doubleHashMatcher.find()) {
            break;
        }
        uriString = doubleHashMatcher.replaceAll("~$1#");
    }
    return uriString;
}

/**
 * Emits a string dissection, but only when the value is neither {@code null} nor empty.
 *
 * @param parsable  the record that receives the dissection
 * @param inputname the name of the field that is being dissected
 * @param type      the output type of the dissection
 * @param name      the output name of the dissection
 * @param value     the value to emit; skipped when {@code null} or empty
 * @throws DissectionFailure if adding the dissection fails
 */
private static void addNonEmptyDissection(final Parsable<?> parsable, final String inputname,
                                          final String type, final String name,
                                          final String value) throws DissectionFailure {
    if (value != null && !value.isEmpty()) {
        parsable.addDissection(inputname, type, name, value);
    }
}

Also used : Matcher(java.util.regex.Matcher) ParsedField(nl.basjes.parse.core.ParsedField) URI(java.net.URI) DissectionFailure(nl.basjes.parse.core.exceptions.DissectionFailure)

Aggregations

ParsedField (nl.basjes.parse.core.ParsedField): 16, DissectionFailure (nl.basjes.parse.core.exceptions.DissectionFailure): 4, Matcher (java.util.regex.Matcher): 3, HttpCookie (java.net.HttpCookie): 1, InetAddress (java.net.InetAddress): 1, URI (java.net.URI): 1, UnknownHostException (java.net.UnknownHostException): 1, UserAgent (nl.basjes.parse.useragent.UserAgent): 1