Example 66 with FloatWritable

use of org.apache.hadoop.io.FloatWritable in project nifi by apache.

the class NiFiOrcUtils method convertToORCObject.

public static Object convertToORCObject(TypeInfo typeInfo, Object o) {
    if (o != null) {
        if (typeInfo instanceof UnionTypeInfo) {
            OrcUnion union = new OrcUnion();
            // Need to find which of the union types corresponds to the primitive object
            TypeInfo objectTypeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(ObjectInspectorFactory.getReflectionObjectInspector(o.getClass(), ObjectInspectorFactory.ObjectInspectorOptions.JAVA));
            List<TypeInfo> unionTypeInfos = ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos();
            int index = 0;
            while (index < unionTypeInfos.size() && !unionTypeInfos.get(index).equals(objectTypeInfo)) {
                index++;
            }
            if (index < unionTypeInfos.size()) {
                union.set((byte) index, convertToORCObject(objectTypeInfo, o));
            } else {
                throw new IllegalArgumentException("Object Type for class " + o.getClass().getName() + " not in Union declaration");
            }
            return union;
        }
        if (o instanceof Integer) {
            return new IntWritable((int) o);
        }
        if (o instanceof Boolean) {
            return new BooleanWritable((boolean) o);
        }
        if (o instanceof Long) {
            return new LongWritable((long) o);
        }
        if (o instanceof Float) {
            return new FloatWritable((float) o);
        }
        if (o instanceof Double) {
            return new DoubleWritable((double) o);
        }
        if (o instanceof String || o instanceof Utf8 || o instanceof GenericData.EnumSymbol) {
            return new Text(o.toString());
        }
        if (o instanceof ByteBuffer) {
            return new BytesWritable(((ByteBuffer) o).array());
        }
        if (o instanceof int[]) {
            int[] intArray = (int[]) o;
            return Arrays.stream(intArray).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("int"), element)).collect(Collectors.toList());
        }
        if (o instanceof long[]) {
            long[] longArray = (long[]) o;
            return Arrays.stream(longArray).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("bigint"), element)).collect(Collectors.toList());
        }
        if (o instanceof float[]) {
            float[] floatArray = (float[]) o;
            return IntStream.range(0, floatArray.length).mapToDouble(i -> floatArray[i]).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("float"), (float) element)).collect(Collectors.toList());
        }
        if (o instanceof double[]) {
            double[] doubleArray = (double[]) o;
            return Arrays.stream(doubleArray).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("double"), element)).collect(Collectors.toList());
        }
        if (o instanceof boolean[]) {
            boolean[] booleanArray = (boolean[]) o;
            return IntStream.range(0, booleanArray.length).map(i -> booleanArray[i] ? 1 : 0).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("boolean"), element == 1)).collect(Collectors.toList());
        }
        if (o instanceof GenericData.Array) {
            GenericData.Array array = ((GenericData.Array) o);
            // The type information in this case is interpreted as a List
            TypeInfo listTypeInfo = ((ListTypeInfo) typeInfo).getListElementTypeInfo();
            return array.stream().map((element) -> convertToORCObject(listTypeInfo, element)).collect(Collectors.toList());
        }
        if (o instanceof List) {
            return o;
        }
        if (o instanceof Map) {
            Map map = new HashMap();
            TypeInfo keyInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo();
            TypeInfo valueInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo();
            // Unions are not allowed as key/value types, so if we convert the key and value objects,
            // they should return Writable objects
            ((Map) o).forEach((key, value) -> {
                Object keyObject = convertToORCObject(keyInfo, key);
                Object valueObject = convertToORCObject(valueInfo, value);
                if (keyObject == null) {
                    throw new IllegalArgumentException("Maps' key cannot be null");
                }
                map.put(keyObject, valueObject);
            });
            return map;
        }
        if (o instanceof GenericData.Record) {
            GenericData.Record record = (GenericData.Record) o;
            TypeInfo recordSchema = NiFiOrcUtils.getOrcField(record.getSchema());
            List<Schema.Field> recordFields = record.getSchema().getFields();
            if (recordFields != null) {
                Object[] fieldObjects = new Object[recordFields.size()];
                for (int i = 0; i < recordFields.size(); i++) {
                    Schema.Field field = recordFields.get(i);
                    Schema fieldSchema = field.schema();
                    Object fieldObject = record.get(field.name());
                    fieldObjects[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), fieldObject);
                }
                return NiFiOrcUtils.createOrcStruct(recordSchema, fieldObjects);
            }
        }
        throw new IllegalArgumentException("Error converting object of type " + o.getClass().getName() + " to ORC type " + typeInfo.getTypeName());
    } else {
        return null;
    }
}
Also used : IntStream(java.util.stream.IntStream) TypeInfoUtils(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils) HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE(org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE) Arrays(java.util.Arrays) Text(org.apache.hadoop.io.Text) HashMap(java.util.HashMap) StringUtils(org.apache.commons.lang3.StringUtils) DoubleWritable(org.apache.hadoop.io.DoubleWritable) LongWritable(org.apache.hadoop.io.LongWritable) ByteBuffer(java.nio.ByteBuffer) GenericData(org.apache.avro.generic.GenericData) ArrayList(java.util.ArrayList) MapTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) HIVE_ORC_DEFAULT_BLOCK_SIZE(org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BLOCK_SIZE) BytesWritable(org.apache.hadoop.io.BytesWritable) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) IntWritable(org.apache.hadoop.io.IntWritable) OutputStream(java.io.OutputStream) Utf8(org.apache.avro.util.Utf8) HIVE_ORC_DEFAULT_BLOCK_PADDING(org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BLOCK_PADDING) Schema(org.apache.avro.Schema) HIVE_ORC_WRITE_FORMAT(org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_WRITE_FORMAT) TypeInfoFactory(org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory) HiveConf(org.apache.hadoop.hive.conf.HiveConf) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) UnionTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo) List(java.util.List) BooleanWritable(org.apache.hadoop.io.BooleanWritable) ObjectInspectorFactory(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory) BloomFilterIO(org.apache.hadoop.hive.ql.io.filters.BloomFilterIO) FloatWritable(org.apache.hadoop.io.FloatWritable) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField)
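
A minimal usage sketch of the conversion above (not taken from the NiFi sources; it assumes NiFiOrcUtils from the NiFi Hive bundle and the Hive serde2 typeinfo classes are on the classpath). It shows that a plain java.lang.Float passed to convertToORCObject comes back wrapped in a FloatWritable.

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.FloatWritable;

// Assumes NiFiOrcUtils (shown above) is importable or in the same package.
public class ConvertToOrcSketch {
    public static void main(String[] args) {
        // "float" resolves to the same Hive primitive type info used inside the method above
        TypeInfo floatType = TypeInfoFactory.getPrimitiveTypeInfo("float");
        Object converted = NiFiOrcUtils.convertToORCObject(floatType, 1.25f);
        System.out.println(converted instanceof FloatWritable);    // true
        System.out.println(((FloatWritable) converted).get());     // 1.25
    }
}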

Example 67 with FloatWritable

use of org.apache.hadoop.io.FloatWritable in project nutch by apache.

the class UpdateHostDbReducer method reduce.

/**
 * Aggregate the CrawlDatum and HostDatum values for a single host and update the
 * corresponding host db entry, or hand the record off to a resolver thread.
 */
public void reduce(Text key, Iterable<NutchWritable> values, Context context) throws IOException, InterruptedException {
    Map<String, Map<String, Integer>> stringCounts = new HashMap<>();
    Map<String, Float> maximums = new HashMap<>();
    // used to calc averages
    Map<String, Float> sums = new HashMap<>();
    // used to calc averages
    Map<String, Integer> counts = new HashMap<>();
    Map<String, Float> minimums = new HashMap<>();
    Map<String, TDigest> tdigests = new HashMap<String, TDigest>();
    HostDatum hostDatum = new HostDatum();
    float score = 0;
    if (stringFields != null) {
        for (int i = 0; i < stringFields.length; i++) {
            stringCounts.put(stringFields[i], new HashMap<>());
        }
    }
    // Loop through the values; hostDatum stays empty if this is a new host for the host db
    for (Writable value : values) {
        // Count crawl datum statuses and collect metadata from fields
        if (value instanceof CrawlDatum) {
            CrawlDatum buffer = (CrawlDatum) value;
            // Set the correct status field
            switch(buffer.getStatus()) {
                case CrawlDatum.STATUS_DB_UNFETCHED:
                    hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
                    break;
                case CrawlDatum.STATUS_DB_FETCHED:
                    hostDatum.setFetched(hostDatum.getFetched() + 1);
                    break;
                case CrawlDatum.STATUS_DB_GONE:
                    hostDatum.setGone(hostDatum.getGone() + 1);
                    break;
                case CrawlDatum.STATUS_DB_REDIR_TEMP:
                    hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
                    break;
                case CrawlDatum.STATUS_DB_REDIR_PERM:
                    hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
                    break;
                case CrawlDatum.STATUS_DB_NOTMODIFIED:
                    hostDatum.setNotModified(hostDatum.getNotModified() + 1);
                    break;
            }
            // Record connection failures
            if (buffer.getRetriesSinceFetch() != 0) {
                hostDatum.incConnectionFailures();
            }
            // Only gather metadata statistics for proper fetched pages
            if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
                // Deal with the string fields
                if (stringFields != null) {
                    for (int i = 0; i < stringFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
                            // Get it!
                            String metadataValue = null;
                            try {
                                metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
                            } catch (Exception e) {
                                LOG.error("Metadata field " + stringFields[i] + " is probably not a numeric value");
                            }
                            // Does the value exist?
                            if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
                                // Yes, increment it
                                stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1);
                            } else {
                                // Create it!
                                stringCounts.get(stringFields[i]).put(metadataValue, 1);
                            }
                        }
                    }
                }
                // Deal with the numeric fields
                if (numericFields != null) {
                    for (int i = 0; i < numericFields.length; i++) {
                        // Does this field exist?
                        if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
                            try {
                                // Get it!
                                Float metadataValue = Float.parseFloat(buffer.getMetaData().get(numericFieldWritables[i]).toString());
                                // Does the median value exist?
                                if (tdigests.containsKey(numericFields[i])) {
                                    tdigests.get(numericFields[i]).add(metadataValue);
                                } else {
                                    // Create it!
                                    TDigest tdigest = TDigest.createDigest(100);
                                    tdigest.add((double) metadataValue);
                                    tdigests.put(numericFields[i], tdigest);
                                }
                                // Does the minimum value exist?
                                if (minimums.containsKey(numericFields[i])) {
                                    // Write if this is lower than existing value
                                    if (metadataValue < minimums.get(numericFields[i])) {
                                        minimums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    minimums.put(numericFields[i], metadataValue);
                                }
                                // Does the maximum value exist?
                                if (maximums.containsKey(numericFields[i])) {
                                    // Write if this is higher than existing value
                                    if (metadataValue > maximums.get(numericFields[i])) {
                                        maximums.put(numericFields[i], metadataValue);
                                    }
                                } else {
                                    // Create it!
                                    maximums.put(numericFields[i], metadataValue);
                                }
                                // Sum it up!
                                if (sums.containsKey(numericFields[i])) {
                                    // Increment
                                    sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
                                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
                                } else {
                                    // Create it!
                                    sums.put(numericFields[i], metadataValue);
                                    counts.put(numericFields[i], 1);
                                }
                            } catch (Exception e) {
                                LOG.error(e.getMessage() + " when processing values for " + key.toString());
                            }
                        }
                    }
                }
            }
        }
        // Check for an existing HostDatum and copy its fields into the aggregate
        if (value instanceof HostDatum) {
            HostDatum buffer = (HostDatum) value;
            // Check homepage URL
            if (buffer.hasHomepageUrl()) {
                hostDatum.setHomepageUrl(buffer.getHomepageUrl());
            }
            // Check lastCheck timestamp
            if (!buffer.isEmpty()) {
                hostDatum.setLastCheck(buffer.getLastCheck());
            }
            // Check and set DNS failures
            if (buffer.getDnsFailures() > 0) {
                hostDatum.setDnsFailures(buffer.getDnsFailures());
            }
            // Check and set connection failures
            if (buffer.getConnectionFailures() > 0) {
                hostDatum.setConnectionFailures(buffer.getConnectionFailures());
            }
            // Check metadata
            if (!buffer.getMetaData().isEmpty()) {
                hostDatum.setMetaData(buffer.getMetaData());
            }
            // Check and set score (score from Web Graph has precedence)
            if (buffer.getScore() > 0) {
                hostDatum.setScore(buffer.getScore());
            }
        }
        // Check for the score
        if (value instanceof FloatWritable) {
            FloatWritable buffer = (FloatWritable) value;
            score = buffer.get();
        }
    }
    // Check if score was set from Web Graph
    if (score > 0) {
        hostDatum.setScore(score);
    }
    // Set metadata
    for (Map.Entry<String, Map<String, Integer>> entry : stringCounts.entrySet()) {
        for (Map.Entry<String, Integer> subEntry : entry.getValue().entrySet()) {
            hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new IntWritable(subEntry.getValue()));
        }
    }
    for (Map.Entry<String, Float> entry : maximums.entrySet()) {
        hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
    }
    for (Map.Entry<String, Float> entry : sums.entrySet()) {
        hostDatum.getMetaData().put(new Text("avg." + entry.getKey()), new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
    }
    for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
        // Emit all percentiles
        for (int i = 0; i < percentiles.length; i++) {
            hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float) entry.getValue().quantile((double) percentiles[i] / 100)));
        }
    }
    for (Map.Entry<String, Float> entry : minimums.entrySet()) {
        hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
    }
    context.getCounter("UpdateHostDb", "total_hosts").increment(1);
    // See if this record is to be checked
    if (shouldCheck(hostDatum)) {
        // Make an entry
        resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold);
        // Add the entry to the queue (blocking)
        try {
            queue.put(resolverThread);
        } catch (InterruptedException e) {
            LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
        }
        // Do not progress, the datum will be written in the resolver thread
        return;
    } else {
        context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
        LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
    }
    // Write the host datum if it wasn't written by the resolver thread
    context.write(key, hostDatum);
}
Also used : HashMap(java.util.HashMap) Writable(org.apache.hadoop.io.Writable) NutchWritable(org.apache.nutch.crawl.NutchWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) IntWritable(org.apache.hadoop.io.IntWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) TDigest(com.tdunning.math.stats.TDigest) Map(java.util.Map)
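
All of the aggregated minimums, maximums, averages and percentiles above are stored in the HostDatum metadata as FloatWritable values keyed by Text. A small hypothetical sketch of that pattern, assuming the metadata map is a Hadoop MapWritable as in Nutch's HostDatum (the key name "min.fetchMs" is illustrative only):

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;

public class HostMetadataSketch {
    public static void main(String[] args) {
        MapWritable metaData = new MapWritable();
        // Store an aggregated value under a prefixed key, as the reducer does for "min.", "max.", "avg." and "pctN."
        metaData.put(new Text("min.fetchMs"), new FloatWritable(12.5f));
        // Reading it back requires unwrapping the FloatWritable
        float min = ((FloatWritable) metaData.get(new Text("min.fetchMs"))).get();
        System.out.println(min); // 12.5
    }
}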

Example 68 with FloatWritable

use of org.apache.hadoop.io.FloatWritable in project nutch by apache.

the class AdaptiveFetchSchedule method setFetchSchedule.

@Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime, long modifiedTime, int state) {
    super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, fetchTime, modifiedTime, state);
    float interval = datum.getFetchInterval();
    long refTime = fetchTime;
    // https://issues.apache.org/jira/browse/NUTCH-1430
    interval = (interval == 0) ? defaultInterval : interval;
    if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
        // Is fetch interval preset in CrawlDatum MD? Then use preset interval
        FloatWritable customIntervalWritable = (FloatWritable) (datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
        interval = customIntervalWritable.get();
    } else {
        if (modifiedTime <= 0)
            modifiedTime = fetchTime;
        switch(state) {
            case FetchSchedule.STATUS_MODIFIED:
                interval *= (1.0f - DEC_RATE);
                modifiedTime = fetchTime;
                break;
            case FetchSchedule.STATUS_NOTMODIFIED:
                interval *= (1.0f + INC_RATE);
                break;
            case FetchSchedule.STATUS_UNKNOWN:
                break;
        }
        if (SYNC_DELTA) {
            // try to synchronize with the time of change
            long delta = (fetchTime - modifiedTime) / 1000L;
            if (delta > interval)
                interval = delta;
            refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
        }
        if (interval < MIN_INTERVAL) {
            interval = MIN_INTERVAL;
        } else if (interval > MAX_INTERVAL) {
            interval = MAX_INTERVAL;
        }
    }
    datum.setFetchInterval(interval);
    datum.setFetchTime(refTime + Math.round(interval * 1000.0));
    datum.setModifiedTime(modifiedTime);
    return datum;
}
Also used : FloatWritable(org.apache.hadoop.io.FloatWritable)
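
The fixed-interval branch above is driven by a FloatWritable stored in the CrawlDatum metadata. A minimal sketch of presetting it, assuming Nutch's CrawlDatum and the Nutch.WRITABLE_FIXED_INTERVAL_KEY constant (package names as in Nutch 1.x):

import org.apache.hadoop.io.FloatWritable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;

public class FixedIntervalSketch {
    public static void main(String[] args) {
        CrawlDatum datum = new CrawlDatum();
        // Preset a fixed re-fetch interval of one day (86400 seconds);
        // setFetchSchedule above will then use it as-is instead of the adaptive value.
        datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(86400f));
        System.out.println(datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY)); // 86400.0
    }
}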

Example 69 with FloatWritable

use of org.apache.hadoop.io.FloatWritable in project nutch by apache.

the class CrawlDbReader method query.

public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception {
    Map<String, Object> results = new HashMap<>();
    String crawlDb = crawlId + "/crawldb";
    if (type.equalsIgnoreCase("stats")) {
        boolean sort = false;
        if (args.containsKey("sort")) {
            if (args.get("sort").equalsIgnoreCase("true"))
                sort = true;
        }
        TreeMap<String, Writable> stats = processStatJobHelper(crawlDb, NutchConfiguration.create(), sort);
        LongWritable totalCnt = (LongWritable) stats.get("T");
        stats.remove("T");
        results.put("totalUrls", String.valueOf(totalCnt.get()));
        Map<String, Object> statusMap = new HashMap<>();
        for (Map.Entry<String, Writable> entry : stats.entrySet()) {
            String k = entry.getKey();
            long val = 0L;
            double fval = 0.0;
            if (entry.getValue() instanceof LongWritable) {
                val = ((LongWritable) entry.getValue()).get();
            } else if (entry.getValue() instanceof FloatWritable) {
                fval = ((FloatWritable) entry.getValue()).get();
            } else if (entry.getValue() instanceof BytesWritable) {
                continue;
            }
            if (k.equals("scn")) {
                results.put("minScore", String.valueOf(fval));
            } else if (k.equals("scx")) {
                results.put("maxScore", String.valueOf(fval));
            } else if (k.equals("sct")) {
                results.put("avgScore", String.valueOf((fval / totalCnt.get())));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2) {
                    @SuppressWarnings("unchecked") Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap.get(String.valueOf(code));
                    Map<String, String> hostValues;
                    if (individualStatusInfo.containsKey("hostValues")) {
                        hostValues = (Map<String, String>) individualStatusInfo.get("hostValues");
                    } else {
                        hostValues = new HashMap<>();
                        individualStatusInfo.put("hostValues", hostValues);
                    }
                    hostValues.put(st[2], String.valueOf(val));
                } else {
                    Map<String, Object> individualStatusInfo = new HashMap<>();
                    individualStatusInfo.put("statusValue", CrawlDatum.getStatusName((byte) code));
                    individualStatusInfo.put("count", String.valueOf(val));
                    statusMap.put(String.valueOf(code), individualStatusInfo);
                }
            } else {
                results.put(k, String.valueOf(val));
            }
        }
        results.put("status", statusMap);
        return results;
    }
    if (type.equalsIgnoreCase("dump")) {
        String output = args.get("out_dir");
        String format = "normal";
        String regex = null;
        Integer retry = null;
        String status = null;
        String expr = null;
        Float sample = null;
        if (args.containsKey("format")) {
            format = args.get("format");
        }
        if (args.containsKey("regex")) {
            regex = args.get("regex");
        }
        if (args.containsKey("retry")) {
            retry = Integer.parseInt(args.get("retry"));
        }
        if (args.containsKey("status")) {
            status = args.get("status");
        }
        if (args.containsKey("expr")) {
            expr = args.get("expr");
        }
        if (args.containsKey("sample")) {
            sample = Float.parseFloat(args.get("sample"));
        }
        processDumpJob(crawlDb, output, conf, format, regex, status, retry, expr, sample);
        File dumpFile = new File(output + "/part-00000");
        return dumpFile;
    }
    if (type.equalsIgnoreCase("topN")) {
        String output = args.get("out_dir");
        long topN = Long.parseLong(args.get("nnn"));
        float min = 0.0f;
        if (args.containsKey("min")) {
            min = Float.parseFloat(args.get("min"));
        }
        processTopNJob(crawlDb, topN, min, output, conf);
        File dumpFile = new File(output + "/part-00000");
        return dumpFile;
    }
    if (type.equalsIgnoreCase("url")) {
        String url = args.get("url");
        CrawlDatum res = get(crawlDb, url, conf);
        results.put("status", res.getStatus());
        results.put("fetchTime", new Date(res.getFetchTime()));
        results.put("modifiedTime", new Date(res.getModifiedTime()));
        results.put("retriesSinceFetch", res.getRetriesSinceFetch());
        results.put("retryInterval", res.getFetchInterval());
        results.put("score", res.getScore());
        results.put("signature", StringUtil.toHexString(res.getSignature()));
        Map<String, String> metadata = new HashMap<>();
        if (res.getMetaData() != null) {
            for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
                metadata.put(String.valueOf(e.getKey()), String.valueOf(e.getValue()));
            }
        }
        results.put("metadata", metadata);
        return results;
    }
    return results;
}
Also used : HashMap(java.util.HashMap) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) Date(java.util.Date) Map(java.util.Map) TreeMap(java.util.TreeMap) SequenceFile(org.apache.hadoop.io.SequenceFile) MapFile(org.apache.hadoop.io.MapFile) File(java.io.File)
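
A hedged usage sketch of the "stats" branch above. The minScore/maxScore/avgScore entries are derived from the FloatWritable counters "scn", "scx" and "sct" produced by the stats job; instantiating CrawlDbReader directly and the "crawl" id are assumptions for illustration, and a crawldb must exist at <crawlId>/crawldb for the call to succeed.

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDbReader;
import org.apache.nutch.util.NutchConfiguration;

public class CrawlDbStatsSketch {
    public static void main(String[] args) throws Exception {
        Map<String, String> queryArgs = new HashMap<>();
        queryArgs.put("sort", "true");
        Configuration conf = NutchConfiguration.create();
        CrawlDbReader reader = new CrawlDbReader();
        @SuppressWarnings("unchecked")
        Map<String, Object> stats = (Map<String, Object>) reader.query(queryArgs, conf, "stats", "crawl");
        // minScore and maxScore originate from FloatWritable stats values ("scn"/"scx")
        System.out.println(stats.get("minScore") + " .. " + stats.get("maxScore"));
    }
}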

Example 70 with FloatWritable

use of org.apache.hadoop.io.FloatWritable in project presto by prestodb.

the class RcFileTester method decodeRecordReaderValue.

private static Object decodeRecordReaderValue(Type type, Object actualValue) {
    if (actualValue instanceof LazyPrimitive) {
        actualValue = ((LazyPrimitive<?, ?>) actualValue).getWritableObject();
    }
    if (actualValue instanceof BooleanWritable) {
        actualValue = ((BooleanWritable) actualValue).get();
    } else if (actualValue instanceof ByteWritable) {
        actualValue = ((ByteWritable) actualValue).get();
    } else if (actualValue instanceof BytesWritable) {
        actualValue = new SqlVarbinary(((BytesWritable) actualValue).copyBytes());
    } else if (actualValue instanceof DateWritable) {
        actualValue = new SqlDate(((DateWritable) actualValue).getDays());
    } else if (actualValue instanceof DoubleWritable) {
        actualValue = ((DoubleWritable) actualValue).get();
    } else if (actualValue instanceof FloatWritable) {
        actualValue = ((FloatWritable) actualValue).get();
    } else if (actualValue instanceof IntWritable) {
        actualValue = ((IntWritable) actualValue).get();
    } else if (actualValue instanceof LongWritable) {
        actualValue = ((LongWritable) actualValue).get();
    } else if (actualValue instanceof ShortWritable) {
        actualValue = ((ShortWritable) actualValue).get();
    } else if (actualValue instanceof HiveDecimalWritable) {
        DecimalType decimalType = (DecimalType) type;
        HiveDecimalWritable writable = (HiveDecimalWritable) actualValue;
        // writable messes with the scale so rescale the values to the Presto type
        BigInteger rescaledValue = rescale(writable.getHiveDecimal().unscaledValue(), writable.getScale(), decimalType.getScale());
        actualValue = new SqlDecimal(rescaledValue, decimalType.getPrecision(), decimalType.getScale());
    } else if (actualValue instanceof Text) {
        actualValue = actualValue.toString();
    } else if (actualValue instanceof TimestampWritable) {
        TimestampWritable timestamp = (TimestampWritable) actualValue;
        if (SESSION.getSqlFunctionProperties().isLegacyTimestamp()) {
            actualValue = new SqlTimestamp((timestamp.getSeconds() * 1000) + (timestamp.getNanos() / 1000000L), UTC_KEY);
        } else {
            actualValue = new SqlTimestamp((timestamp.getSeconds() * 1000) + (timestamp.getNanos() / 1000000L));
        }
    } else if (actualValue instanceof StructObject) {
        StructObject structObject = (StructObject) actualValue;
        actualValue = decodeRecordReaderStruct(type, structObject.getFieldsAsList());
    } else if (actualValue instanceof LazyBinaryArray) {
        actualValue = decodeRecordReaderList(type, ((LazyBinaryArray) actualValue).getList());
    } else if (actualValue instanceof LazyBinaryMap) {
        actualValue = decodeRecordReaderMap(type, ((LazyBinaryMap) actualValue).getMap());
    } else if (actualValue instanceof LazyArray) {
        actualValue = decodeRecordReaderList(type, ((LazyArray) actualValue).getList());
    } else if (actualValue instanceof LazyMap) {
        actualValue = decodeRecordReaderMap(type, ((LazyMap) actualValue).getMap());
    } else if (actualValue instanceof List) {
        actualValue = decodeRecordReaderList(type, ((List<?>) actualValue));
    }
    return actualValue;
}
Also used : SqlVarbinary(com.facebook.presto.common.type.SqlVarbinary) TimestampWritable(org.apache.hadoop.hive.serde2.io.TimestampWritable) DoubleWritable(org.apache.hadoop.io.DoubleWritable) SqlTimestamp(com.facebook.presto.common.type.SqlTimestamp) LazyBinaryArray(org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryArray) ShortWritable(org.apache.hadoop.hive.serde2.io.ShortWritable) LazyPrimitive(org.apache.hadoop.hive.serde2.lazy.LazyPrimitive) StructObject(org.apache.hadoop.hive.serde2.StructObject) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) LongWritable(org.apache.hadoop.io.LongWritable) ByteWritable(org.apache.hadoop.io.ByteWritable) IntWritable(org.apache.hadoop.io.IntWritable) DateWritable(org.apache.hadoop.hive.serde2.io.DateWritable) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) LazyMap(org.apache.hadoop.hive.serde2.lazy.LazyMap) BytesWritable(org.apache.hadoop.io.BytesWritable) SqlDecimal(com.facebook.presto.common.type.SqlDecimal) Text(org.apache.hadoop.io.Text) FloatWritable(org.apache.hadoop.io.FloatWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) SqlDate(com.facebook.presto.common.type.SqlDate) DecimalType(com.facebook.presto.common.type.DecimalType) BigInteger(java.math.BigInteger) LazyArray(org.apache.hadoop.hive.serde2.lazy.LazyArray) LazyBinaryMap(org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryMap)
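
The FloatWritable branch above simply unwraps the Hadoop writable into a boxed java.lang.Float before the value is compared with the expected Presto value. A stand-alone, hypothetical illustration of that unwrapping:

import org.apache.hadoop.io.FloatWritable;

public class DecodeFloatSketch {
    public static void main(String[] args) {
        Object actualValue = new FloatWritable(1.5f);
        if (actualValue instanceof FloatWritable) {
            // get() returns the primitive float, which is auto-boxed to java.lang.Float here
            actualValue = ((FloatWritable) actualValue).get();
        }
        System.out.println(actualValue.getClass().getSimpleName() + " = " + actualValue); // Float = 1.5
    }
}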

Aggregations

FloatWritable (org.apache.hadoop.io.FloatWritable): 111 usages
IntWritable (org.apache.hadoop.io.IntWritable): 68 usages
LongWritable (org.apache.hadoop.io.LongWritable): 65 usages
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 54 usages
Text (org.apache.hadoop.io.Text): 51 usages
Test (org.junit.Test): 49 usages
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 44 usages
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 40 usages
BytesWritable (org.apache.hadoop.io.BytesWritable): 40 usages
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 37 usages
Writable (org.apache.hadoop.io.Writable): 28 usages
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 27 usages
ArrayList (java.util.ArrayList): 24 usages
Configuration (org.apache.hadoop.conf.Configuration): 18 usages
HiveCharWritable (org.apache.hadoop.hive.serde2.io.HiveCharWritable): 18 usages
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 18 usages
Path (org.apache.hadoop.fs.Path): 17 usages
HiveChar (org.apache.hadoop.hive.common.type.HiveChar): 17 usages
HiveVarchar (org.apache.hadoop.hive.common.type.HiveVarchar): 17 usages
HiveVarcharWritable (org.apache.hadoop.hive.serde2.io.HiveVarcharWritable): 17 usages