Search in sources :

Example 1 with DissectionFailure

use of nl.basjes.parse.core.exceptions.DissectionFailure in project logparser by nielsbasjes.

the class TokenFormatDissector method dissect.

/**
 * Matches the input line against the compiled log format pattern and adds one
 * dissection for every output field of every token that took part in the match.
 *
 * @param parsable  the record being parsed; supplies the input field and receives the dissections
 * @param inputname the name of the input field that holds the line to dissect
 * @throws DissectionFailure if this dissector is flagged as unusable or the line
 *                           does not match the log format pattern
 */
@Override
public void dissect(final Parsable<?> parsable, final String inputname) throws DissectionFailure {
    if (!isUsable) {
        throw new DissectionFailure("Dissector in unusable state");
    }
    final ParsedField line = parsable.getParsableField(inputType, inputname);
    // Now we create a matcher for this line
    final Matcher matcher = logFormatPattern.matcher(line.getValue().getString());
    // Is it all as expected?
    if (!matcher.find()) {
        // Each part of the message goes on its own line so the (potentially long)
        // input line, log format and regex can be told apart when reading the error.
        throw new DissectionFailure("The input line does not match the specified log format." + "\n"
                + "Line     : " + line.getValue() + "\n"
                + "LogFormat: " + logFormat + "\n"
                + "RegEx    : " + logFormatRegEx);
    }
    // Regex groups are numbered from 1, the list of used tokens is indexed from 0.
    for (int i = 1; i <= matcher.groupCount(); i++) {
        final String matchedStr = matcher.group(i);
        final Token token = logFormatUsedTokens.get(i - 1);
        // A single token can produce several output fields; they all get the same matched text.
        for (TokenOutputField tokenOutputField : token.getOutputFields()) {
            final String matchedName = tokenOutputField.getName();
            final String matchedType = tokenOutputField.getType();
            parsable.addDissection(inputname, matchedType, matchedName, decodeExtractedValue(matchedName, matchedStr));
        }
    }
}
Also used : Matcher(java.util.regex.Matcher) ParsedField(nl.basjes.parse.core.ParsedField) DissectionFailure(nl.basjes.parse.core.exceptions.DissectionFailure)

Example 2 with DissectionFailure

use of nl.basjes.parse.core.exceptions.DissectionFailure in project logparser by nielsbasjes.

the class RequestCookieListDissector method dissect.

/**
 * Splits the input field into separate cookies and adds an "HTTP.COOKIE" dissection
 * for every cookie that is wanted (either all cookies or only the requested names).
 * Cookie names are trimmed and lower-cased; cookie values are trimmed and URL-decoded.
 *
 * @param parsable  the record being parsed; supplies the input field and receives the dissections
 * @param inputname the name of the input field that holds the cookie list
 * @throws DissectionFailure if a cookie value cannot be URL-decoded
 */
@Override
public void dissect(final Parsable<?> parsable, final String inputname) throws DissectionFailure {
    final ParsedField field = parsable.getParsableField(INPUT_TYPE, inputname);
    final String fieldValue = field.getValue().getString();
    if (fieldValue == null || fieldValue.isEmpty()) {
        // Nothing to do here
        return;
    }
    String[] allValues = fieldSeparatorPattern.split(fieldValue);
    for (String value : allValues) {
        int equalPos = value.indexOf('=');
        if (equalPos == -1) {
            if (!"".equals(value)) {
                // Just a name, no value
                String theName = value.trim().toLowerCase();
                if (wantAllCookies || requestedCookies.contains(theName)) {
                    parsable.addDissection(inputname, "HTTP.COOKIE", theName, "");
                }
            }
        } else {
            String theName = value.substring(0, equalPos).trim().toLowerCase();
            if (wantAllCookies || requestedCookies.contains(theName)) {
                // Everything after the first '=' is the value (it may itself contain '=').
                String theValue = value.substring(equalPos + 1).trim();
                try {
                    parsable.addDissection(inputname, "HTTP.COOKIE", theName, Utils.resilientUrlDecode(theValue));
                } catch (IllegalArgumentException e) {
                    // This usually means that there was invalid encoding in the line.
                    // Keep the original exception as the cause so its stack trace is not lost.
                    throw new DissectionFailure(e.getMessage(), e);
                }
            }
        }
    }
}
Also used : ParsedField(nl.basjes.parse.core.ParsedField) DissectionFailure(nl.basjes.parse.core.exceptions.DissectionFailure)

Example 3 with DissectionFailure

use of nl.basjes.parse.core.exceptions.DissectionFailure in project logparser by nielsbasjes.

the class TimeStampDissector method dissect.

/**
 * Parses the timestamp held by the given field and adds the requested time related
 * dissections in three groups: timezone independent values, values as they were
 * parsed (local to the timestamp) and values converted to UTC.
 *
 * @param field     the field whose value holds the timestamp text
 * @param parsable  the record that receives the dissections
 * @param inputname the name of the input field the dissections are derived from
 * @throws DissectionFailure if the timestamp text cannot be parsed
 */
protected void dissect(ParsedField field, final Parsable<?> parsable, final String inputname) throws DissectionFailure {
    final String rawTimestamp = field.getValue().getString();
    if (rawTimestamp == null || rawTimestamp.isEmpty()) {
        // An absent timestamp simply yields no dissections.
        return;
    }

    ZonedDateTime parsed;
    try {
        parsed = parse(rawTimestamp);
    } catch (DateTimeParseException dtpe) {
        // The two ruler lines make it easy to locate the character offset mentioned in the parse error.
        final String ruler =
                "\n          10        20        30        40        50        60        70        80        90        100       110       120"
                + "\n_123456789_123456789_123456789_123456789_123456789_123456789_123456789_123456789_123456789_123456789_123456789_123456789_";
        throw new DissectionFailure(dtpe.getMessage() + ruler + "\n" + rawTimestamp + "\n\n" + formatter.toString(), dtpe);
    }

    if (wantAnyTZIndependent) {
        addZoneIndependentFields(parsable, inputname, parsed);
    }
    if (wantAnyAsParsed) {
        addAsParsedFields(parsable, inputname, parsed.toLocalDateTime());
    }
    if (wantAnyUTC) {
        addUtcFields(parsable, inputname, parsed.withZoneSameInstant(ZoneOffset.UTC));
    }
}

/** Adds the values that do not depend on the timezone: the zone name itself and the epoch in milliseconds. */
private void addZoneIndependentFields(final Parsable<?> parsable, final String inputname, final ZonedDateTime moment) throws DissectionFailure {
    if (wantTimezone) {
        parsable.addDissection(inputname, "TIME.ZONE", "timezone", moment.getZone().getDisplayName(TextStyle.FULL, locale));
    }
    if (wantEpoch) {
        parsable.addDissection(inputname, "TIME.EPOCH", "epoch", moment.toInstant().toEpochMilli());
    }
}

/** Adds the requested values taken from the timestamp exactly as it was parsed (no zone conversion). */
private void addAsParsedFields(final Parsable<?> parsable, final String inputname, final LocalDateTime asParsed) throws DissectionFailure {
    if (wantDay) {
        parsable.addDissection(inputname, "TIME.DAY", "day", asParsed.getDayOfMonth());
    }
    if (wantMonthname) {
        parsable.addDissection(inputname, "TIME.MONTHNAME", "monthname", asParsed.getMonth().getDisplayName(TextStyle.FULL, locale));
    }
    if (wantMonth) {
        parsable.addDissection(inputname, "TIME.MONTH", "month", asParsed.getMonth().getValue());
    }
    if (wantWeekOfWeekYear) {
        parsable.addDissection(inputname, "TIME.WEEK", "weekofweekyear", asParsed.get(WeekFields.of(locale).weekOfWeekBasedYear()));
    }
    if (wantWeekYear) {
        parsable.addDissection(inputname, "TIME.YEAR", "weekyear", asParsed.get(WeekFields.of(locale).weekBasedYear()));
    }
    if (wantYear) {
        parsable.addDissection(inputname, "TIME.YEAR", "year", asParsed.getYear());
    }
    if (wantHour) {
        parsable.addDissection(inputname, "TIME.HOUR", "hour", asParsed.getHour());
    }
    if (wantMinute) {
        parsable.addDissection(inputname, "TIME.MINUTE", "minute", asParsed.getMinute());
    }
    if (wantSecond) {
        parsable.addDissection(inputname, "TIME.SECOND", "second", asParsed.getSecond());
    }
    // The sub-second values are all derived from the nanosecond-of-second part.
    if (wantMillisecond) {
        parsable.addDissection(inputname, "TIME.MILLISECOND", "millisecond", asParsed.getNano() / 1000000L);
    }
    if (wantMicrosecond) {
        parsable.addDissection(inputname, "TIME.MICROSECOND", "microsecond", asParsed.getNano() / 1000L);
    }
    if (wantNanosecond) {
        parsable.addDissection(inputname, "TIME.NANOSECOND", "nanosecond", asParsed.getNano());
    }
    if (wantDate) {
        parsable.addDissection(inputname, "TIME.DATE", "date", asParsed.format(ISO_DATE_FORMATTER));
    }
    if (wantTime) {
        parsable.addDissection(inputname, "TIME.TIME", "time", asParsed.format(ISO_TIME_FORMATTER));
    }
}

/** Adds the requested values taken from the timestamp after conversion to the UTC zone. */
private void addUtcFields(final Parsable<?> parsable, final String inputname, final ZonedDateTime utc) throws DissectionFailure {
    if (wantDayUTC) {
        parsable.addDissection(inputname, "TIME.DAY", "day_utc", utc.getDayOfMonth());
    }
    if (wantMonthnameUTC) {
        parsable.addDissection(inputname, "TIME.MONTHNAME", "monthname_utc", utc.getMonth().getDisplayName(TextStyle.FULL, locale));
    }
    if (wantMonthUTC) {
        parsable.addDissection(inputname, "TIME.MONTH", "month_utc", utc.getMonthValue());
    }
    // Note: the UTC week fields use the ISO week definition, not the locale based one.
    if (wantWeekOfWeekYearUTC) {
        parsable.addDissection(inputname, "TIME.WEEK", "weekofweekyear_utc", utc.get(WeekFields.ISO.weekOfWeekBasedYear()));
    }
    if (wantWeekYearUTC) {
        parsable.addDissection(inputname, "TIME.YEAR", "weekyear_utc", utc.get(WeekFields.ISO.weekBasedYear()));
    }
    if (wantYearUTC) {
        parsable.addDissection(inputname, "TIME.YEAR", "year_utc", utc.getYear());
    }
    if (wantHourUTC) {
        parsable.addDissection(inputname, "TIME.HOUR", "hour_utc", utc.getHour());
    }
    if (wantMinuteUTC) {
        parsable.addDissection(inputname, "TIME.MINUTE", "minute_utc", utc.getMinute());
    }
    if (wantSecondUTC) {
        parsable.addDissection(inputname, "TIME.SECOND", "second_utc", utc.getSecond());
    }
    if (wantMillisecondUTC) {
        parsable.addDissection(inputname, "TIME.MILLISECOND", "millisecond_utc", utc.getNano() / 1000000L);
    }
    if (wantMicrosecondUTC) {
        parsable.addDissection(inputname, "TIME.MICROSECOND", "microsecond_utc", utc.getNano() / 1000L);
    }
    if (wantNanosecondUTC) {
        parsable.addDissection(inputname, "TIME.NANOSECOND", "nanosecond_utc", utc.getNano());
    }
    if (wantDateUTC) {
        parsable.addDissection(inputname, "TIME.DATE", "date_utc", utc.format(ISO_DATE_FORMATTER));
    }
    if (wantTimeUTC) {
        parsable.addDissection(inputname, "TIME.TIME", "time_utc", utc.format(ISO_TIME_FORMATTER));
    }
}
Also used : LocalDateTime(java.time.LocalDateTime) DateTimeParseException(java.time.format.DateTimeParseException) ZonedDateTime(java.time.ZonedDateTime) DissectionFailure(nl.basjes.parse.core.exceptions.DissectionFailure)

Example 4 with DissectionFailure

use of nl.basjes.parse.core.exceptions.DissectionFailure in project logparser by nielsbasjes.

the class ApacheHttpdlogDeserializer method deserialize.

/**
 * Parses one input line and copies the requested values into the reusable row.
 * <p>
 * A line that fails dissection is counted as bad and yields {@code null}; once at
 * least {@code MINIMAL_FAIL_LINES} lines have been seen and the share of bad lines
 * exceeds {@code MINIMAL_FAIL_PERCENTAGE} percent, deserialization is aborted.
 *
 * @param writable the input record; must be a {@code Text}
 * @return the row filled with the values of this line, or {@code null} if the line could not be dissected
 * @throws SerDeException if the input is not a {@code Text}, if too many lines are bad,
 *                        or if the dissectors are invalid or missing
 */
@Override
public Object deserialize(Writable writable) throws SerDeException {
    if (!(writable instanceof Text)) {
        throw new SerDeException("The input MUST be a Text line.");
    }
    linesInput++;
    try {
        currentValue.clear();
        parser.parse(currentValue, writable.toString());
    } catch (DissectionFailure dissectionFailure) {
        linesBad++;
        // Integer arithmetic: bad/input > percentage/100 rewritten without division.
        if (linesInput >= MINIMAL_FAIL_LINES && 100 * linesBad > MINIMAL_FAIL_PERCENTAGE * linesInput) {
            // Chain the failure that tipped the balance so the reason for the last bad line is not lost.
            throw new SerDeException("To many bad lines: " + linesBad + " of " + linesInput + " are bad.", dissectionFailure);
        }
        // Just return that this line is nothing.
        return null;
    } catch (InvalidDissectorException | MissingDissectorsException e) {
        throw new SerDeException("Cannot continue; Fix the Dissectors before retrying", e);
    }
    for (ColumnToGetterMapping ctgm : columnToGetterMappings) {
        switch(ctgm.casts) {
            case STRING:
                String currentValueString = currentValue.getString(ctgm.fieldValue);
                row.set(ctgm.index, currentValueString);
                break;
            case LONG:
                Long currentValueLong = currentValue.getLong(ctgm.fieldValue);
                row.set(ctgm.index, currentValueLong);
                break;
            case DOUBLE:
                Double currentValueDouble = currentValue.getDouble(ctgm.fieldValue);
                row.set(ctgm.index, currentValueDouble);
                break;
            default:
                // Any other cast leaves the column in the row untouched.
        }
    }
    return row;
}
Also used : MissingDissectorsException(nl.basjes.parse.core.exceptions.MissingDissectorsException) Text(org.apache.hadoop.io.Text) InvalidDissectorException(nl.basjes.parse.core.exceptions.InvalidDissectorException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) DissectionFailure(nl.basjes.parse.core.exceptions.DissectionFailure)

Example 5 with DissectionFailure

use of nl.basjes.parse.core.exceptions.DissectionFailure in project logparser by nielsbasjes.

the class HttpUriDissector method dissect.

/**
 * Cleans up a (possibly malformed) URI string, parses it with {@link URI} and adds
 * the requested parts (query, path, ref, protocol, userinfo, host, port) as dissections.
 * <p>
 * For input starting with '/' a dummy scheme and host are prepended so it can be
 * parsed; in that case protocol, userinfo, host and port are not emitted.
 *
 * @param parsable  the record being parsed; supplies the input field and receives the dissections
 * @param inputname the name of the input field that holds the URI
 * @throws DissectionFailure if the cleaned up string still cannot be parsed as a URI
 */
@Override
public void dissect(final Parsable<?> parsable, final String inputname) throws DissectionFailure {
    final ParsedField field = parsable.getParsableField(INPUT_TYPE, inputname);
    String uriString = field.getValue().getString();
    if (uriString == null || uriString.isEmpty()) {
        // Nothing to do here
        return;
    }
    // First we cleanup the URI so we fail less often over 'garbage' URIs.
    // See: https://stackoverflow.com/questions/11038967/brackets-in-a-request-url-are-legal-but-not-in-a-uri-java
    uriString = new String(URLCodec.encodeUrl(BAD_URI_CHARS, uriString.getBytes(UTF_8)), US_ASCII);
    // Now we translate any HTML encoded entities/characters into URL UTF-8 encoded characters
    uriString = makeHTMLEncodedInert(uriString);
    // Before we hand it to the standard parser we hack it around a bit so we can parse
    // nasty edge cases that are illegal yet do occur in real clickstreams.
    // Also we force the query string to start with ?& so the returned query string starts with &
    // Which leads to more consistent output after parsing.
    int firstQuestionMark = uriString.indexOf('?');
    int firstAmpersand = uriString.indexOf('&');
    // Every '?' becomes '&' and then the first '&' is prefixed with '?',
    // so the query part ends up in the form:  ?&x=x&y=y&z=z
    if (firstAmpersand != -1 || firstQuestionMark != -1) {
        uriString = uriString.replace('?', '&');
        uriString = uriString.replaceFirst("&", "?&");
    }
    // We find that people muck up the URL by putting % signs in the URLs that are NOT escape sequences
    // So any % that is not followed by a two 'hex' letters is fixed.
    // The replacement is deliberately applied twice, exactly as the original code did.
    uriString = BAD_EXCAPE_PATTERN.matcher(uriString).replaceAll("%25$1");
    uriString = BAD_EXCAPE_PATTERN.matcher(uriString).replaceAll("%25$1");
    // We have URIs with fragments like this:
    // /path/?_requestid=1234#x3D;12341234&Referrer&#x3D;blablabla
    // So first we repair the broken encoded char
    uriString = ALMOST_HTML_ENCODED.matcher(uriString).replaceAll("$1&$2");
    uriString = StringEscapeUtils.unescapeHtml4(uriString);
    // And we see URIs with this:
    // /path/?Referrer=ADV1234#&f=API&subid=#&name=12341234
    uriString = EQUALS_HASH_PATTERN.matcher(uriString).replaceAll("=");
    uriString = HASH_AMP_PATTERN.matcher(uriString).replaceAll("&");
    // If we still have multiple '#' in here we replace them with something else: '~'
    // Repeated until the pattern no longer matches because one pass may leave new matches behind.
    while (true) {
        Matcher doubleHashMatcher = DOUBLE_HASH_PATTERN.matcher(uriString);
        if (!doubleHashMatcher.find()) {
            break;
        }
        uriString = doubleHashMatcher.replaceAll("~$1#");
    }
    boolean isUrl = true;
    URI uri;
    try {
        // startsWith (instead of charAt(0)) also copes with a string that ended up empty.
        if (uriString.startsWith("/")) {
            uri = URI.create("dummy-protocol://dummy.host.name" + uriString);
            // I.e. we do not return the values we just faked.
            isUrl = false;
        } else {
            uri = URI.create(uriString);
        }
    } catch (IllegalArgumentException e) {
        // Report the ORIGINAL input (not the cleaned up version) and keep the cause for its stack trace.
        throw new DissectionFailure("Failed to parse URI >>" + field.getValue().getString() + "<< because of : " + e.getMessage(), e);
    }
    // These parts are available for both full URLs and path-only input.
    if (wantQuery) {
        String value = uri.getRawQuery();
        if (value != null && !value.isEmpty()) {
            parsable.addDissection(inputname, "HTTP.QUERYSTRING", "query", value);
        }
    }
    if (wantPath) {
        String value = uri.getPath();
        if (value != null && !value.isEmpty()) {
            parsable.addDissection(inputname, "HTTP.PATH", "path", value);
        }
    }
    if (wantRef) {
        String value = uri.getFragment();
        if (value != null && !value.isEmpty()) {
            parsable.addDissection(inputname, "HTTP.REF", "ref", value);
        }
    }
    // These parts are only real when we did not have to fake the scheme and host.
    if (isUrl) {
        if (wantProtocol) {
            String value = uri.getScheme();
            if (value != null && !value.isEmpty()) {
                parsable.addDissection(inputname, "HTTP.PROTOCOL", "protocol", value);
            }
        }
        if (wantUserinfo) {
            String value = uri.getUserInfo();
            if (value != null && !value.isEmpty()) {
                parsable.addDissection(inputname, "HTTP.USERINFO", "userinfo", value);
            }
        }
        if (wantHost) {
            String value = uri.getHost();
            if (value != null && !value.isEmpty()) {
                parsable.addDissection(inputname, "HTTP.HOST", "host", value);
            }
        }
        if (wantPort) {
            int value = uri.getPort();
            // URI.getPort() returns -1 when no port is defined.
            if (value != -1) {
                parsable.addDissection(inputname, "HTTP.PORT", "port", value);
            }
        }
    }
}
Also used : Matcher(java.util.regex.Matcher) ParsedField(nl.basjes.parse.core.ParsedField) URI(java.net.URI) DissectionFailure(nl.basjes.parse.core.exceptions.DissectionFailure)

Aggregations

DissectionFailure (nl.basjes.parse.core.exceptions.DissectionFailure)6 ParsedField (nl.basjes.parse.core.ParsedField)4 Matcher (java.util.regex.Matcher)2 URI (java.net.URI)1 LocalDateTime (java.time.LocalDateTime)1 ZonedDateTime (java.time.ZonedDateTime)1 DateTimeParseException (java.time.format.DateTimeParseException)1 InvalidDissectorException (nl.basjes.parse.core.exceptions.InvalidDissectorException)1 MissingDissectorsException (nl.basjes.parse.core.exceptions.MissingDissectorsException)1 SerDeException (org.apache.hadoop.hive.serde2.SerDeException)1 Text (org.apache.hadoop.io.Text)1