Use of org.apache.hadoop.io.FloatWritable in project nifi by apache.
The class NiFiOrcUtils, method convertToORCObject:
public static Object convertToORCObject(TypeInfo typeInfo, Object o) {
if (o != null) {
if (typeInfo instanceof UnionTypeInfo) {
OrcUnion union = new OrcUnion();
// Need to find which of the union types corresponds to the primitive object
TypeInfo objectTypeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(ObjectInspectorFactory.getReflectionObjectInspector(o.getClass(), ObjectInspectorFactory.ObjectInspectorOptions.JAVA));
List<TypeInfo> unionTypeInfos = ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos();
int index = 0;
while (index < unionTypeInfos.size() && !unionTypeInfos.get(index).equals(objectTypeInfo)) {
index++;
}
if (index < unionTypeInfos.size()) {
union.set((byte) index, convertToORCObject(objectTypeInfo, o));
} else {
throw new IllegalArgumentException("Object Type for class " + o.getClass().getName() + " not in Union declaration");
}
return union;
}
if (o instanceof Integer) {
return new IntWritable((int) o);
}
if (o instanceof Boolean) {
return new BooleanWritable((boolean) o);
}
if (o instanceof Long) {
return new LongWritable((long) o);
}
if (o instanceof Float) {
return new FloatWritable((float) o);
}
if (o instanceof Double) {
return new DoubleWritable((double) o);
}
if (o instanceof String || o instanceof Utf8 || o instanceof GenericData.EnumSymbol) {
return new Text(o.toString());
}
if (o instanceof ByteBuffer) {
return new BytesWritable(((ByteBuffer) o).array());
}
if (o instanceof int[]) {
int[] intArray = (int[]) o;
return Arrays.stream(intArray).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("int"), element)).collect(Collectors.toList());
}
if (o instanceof long[]) {
long[] longArray = (long[]) o;
return Arrays.stream(longArray).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("bigint"), element)).collect(Collectors.toList());
}
if (o instanceof float[]) {
float[] floatArray = (float[]) o;
return IntStream.range(0, floatArray.length).mapToDouble(i -> floatArray[i]).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("float"), (float) element)).collect(Collectors.toList());
}
if (o instanceof double[]) {
double[] doubleArray = (double[]) o;
return Arrays.stream(doubleArray).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("double"), element)).collect(Collectors.toList());
}
if (o instanceof boolean[]) {
boolean[] booleanArray = (boolean[]) o;
return IntStream.range(0, booleanArray.length).map(i -> booleanArray[i] ? 1 : 0).mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("boolean"), element == 1)).collect(Collectors.toList());
}
if (o instanceof GenericData.Array) {
GenericData.Array array = ((GenericData.Array) o);
// The type information in this case is interpreted as a List
TypeInfo listTypeInfo = ((ListTypeInfo) typeInfo).getListElementTypeInfo();
return array.stream().map((element) -> convertToORCObject(listTypeInfo, element)).collect(Collectors.toList());
}
if (o instanceof List) {
return o;
}
if (o instanceof Map) {
Map map = new HashMap();
TypeInfo keyInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo();
TypeInfo valueInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo();
// Unions are not allowed as key/value types, so if we convert the key and value objects,
// they should return Writable objects
((Map) o).forEach((key, value) -> {
Object keyObject = convertToORCObject(keyInfo, key);
Object valueObject = convertToORCObject(valueInfo, value);
if (keyObject == null) {
throw new IllegalArgumentException("Maps' key cannot be null");
}
map.put(keyObject, valueObject);
});
return map;
}
if (o instanceof GenericData.Record) {
GenericData.Record record = (GenericData.Record) o;
TypeInfo recordSchema = NiFiOrcUtils.getOrcField(record.getSchema());
List<Schema.Field> recordFields = record.getSchema().getFields();
if (recordFields != null) {
Object[] fieldObjects = new Object[recordFields.size()];
for (int i = 0; i < recordFields.size(); i++) {
Schema.Field field = recordFields.get(i);
Schema fieldSchema = field.schema();
Object fieldObject = record.get(field.name());
fieldObjects[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), fieldObject);
}
return NiFiOrcUtils.createOrcStruct(recordSchema, fieldObjects);
}
}
throw new IllegalArgumentException("Error converting object of type " + o.getClass().getName() + " to ORC type " + typeInfo.getTypeName());
} else {
return null;
}
}
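Below is a minimal usage sketch, not taken from the NiFi sources, showing what the method above produces for float inputs; the wrapper class ConvertToOrcSketch and the sample values are made up, and the NiFiOrcUtils import is omitted because its package depends on the NiFi Hive bundle in use.

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class ConvertToOrcSketch {
    public static void main(String[] args) {
        TypeInfo floatType = TypeInfoFactory.getPrimitiveTypeInfo("float");
        // A Java Float is wrapped as an org.apache.hadoop.io.FloatWritable
        Object single = NiFiOrcUtils.convertToORCObject(floatType, 3.14f);
        // A float[] is converted element by element into a List of FloatWritable objects
        TypeInfo listType = TypeInfoFactory.getListTypeInfo(floatType);
        Object many = NiFiOrcUtils.convertToORCObject(listType, new float[] { 1.0f, 2.0f });
        System.out.println(single.getClass() + " / " + many.getClass());
    }
}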
Use of org.apache.hadoop.io.FloatWritable in project nutch by apache.
The class UpdateHostDbReducer, method reduce:
/**
 * Aggregates the CrawlDatum, HostDatum and score values for a single host key
 * into one HostDatum record, which is either handed to a ResolverThread for
 * checking or written directly to the host db.
 */
public void reduce(Text key, Iterable<NutchWritable> values, Context context) throws IOException, InterruptedException {
Map<String, Map<String, Integer>> stringCounts = new HashMap<>();
Map<String, Float> maximums = new HashMap<>();
// used to calc averages
Map<String, Float> sums = new HashMap<>();
// used to calc averages
Map<String, Integer> counts = new HashMap<>();
Map<String, Float> minimums = new HashMap<>();
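// t-digest sketches, one per numeric field, used to estimate percentiles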
Map<String, TDigest> tdigests = new HashMap<String, TDigest>();
HostDatum hostDatum = new HostDatum();
float score = 0;
if (stringFields != null) {
for (int i = 0; i < stringFields.length; i++) {
stringCounts.put(stringFields[i], new HashMap<>());
}
}
// Loop through the values until we find a non-empty HostDatum, or use an empty one if this is a new host for the host db
for (Writable value : values) {
// Count crawl datum statuses and collect metadata from fields
if (value instanceof CrawlDatum) {
CrawlDatum buffer = (CrawlDatum) value;
// Set the correct status field
switch(buffer.getStatus()) {
case CrawlDatum.STATUS_DB_UNFETCHED:
hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
break;
case CrawlDatum.STATUS_DB_FETCHED:
hostDatum.setFetched(hostDatum.getFetched() + 1);
break;
case CrawlDatum.STATUS_DB_GONE:
hostDatum.setGone(hostDatum.getGone() + 1);
break;
case CrawlDatum.STATUS_DB_REDIR_TEMP:
hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
break;
case CrawlDatum.STATUS_DB_REDIR_PERM:
hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
break;
case CrawlDatum.STATUS_DB_NOTMODIFIED:
hostDatum.setNotModified(hostDatum.getNotModified() + 1);
break;
}
// Record connection failures
if (buffer.getRetriesSinceFetch() != 0) {
hostDatum.incConnectionFailures();
}
// Only gather metadata statistics for proper fetched pages
if (buffer.getStatus() == CrawlDatum.STATUS_DB_FETCHED || buffer.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
// Deal with the string fields
if (stringFields != null) {
for (int i = 0; i < stringFields.length; i++) {
// Does this field exist?
if (buffer.getMetaData().get(stringFieldWritables[i]) != null) {
// Get it!
String metadataValue = null;
try {
metadataValue = buffer.getMetaData().get(stringFieldWritables[i]).toString();
} catch (Exception e) {
LOG.error("Metadata field " + stringFields[i] + " is probably not a numeric value");
}
// Does the value exist?
if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
// Yes, increment it
stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1);
} else {
// Create it!
stringCounts.get(stringFields[i]).put(metadataValue, 1);
}
}
}
}
// Deal with the numeric fields
if (numericFields != null) {
for (int i = 0; i < numericFields.length; i++) {
// Does this field exist?
if (buffer.getMetaData().get(numericFieldWritables[i]) != null) {
try {
// Get it!
Float metadataValue = Float.parseFloat(buffer.getMetaData().get(numericFieldWritables[i]).toString());
// Does the t-digest sketch for this field exist?
if (tdigests.containsKey(numericFields[i])) {
tdigests.get(numericFields[i]).add(metadataValue);
} else {
// Create it!
TDigest tdigest = TDigest.createDigest(100);
tdigest.add((double) metadataValue);
tdigests.put(numericFields[i], tdigest);
}
// Does the minimum value exist?
if (minimums.containsKey(numericFields[i])) {
// Write if this is lower than existing value
if (metadataValue < minimums.get(numericFields[i])) {
minimums.put(numericFields[i], metadataValue);
}
} else {
// Create it!
minimums.put(numericFields[i], metadataValue);
}
// Does the maximum value exist?
if (maximums.containsKey(numericFields[i])) {
// Write if this is higher than existing value
if (metadataValue > maximums.get(numericFields[i])) {
maximums.put(numericFields[i], metadataValue);
}
} else {
// Create it!
maximums.put(numericFields[i], metadataValue);
}
// Sum it up!
if (sums.containsKey(numericFields[i])) {
// Increment
sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
} else {
// Create it!
sums.put(numericFields[i], metadataValue);
counts.put(numericFields[i], 1);
}
} catch (Exception e) {
LOG.error(e.getMessage() + " when processing values for " + key.toString());
}
}
}
}
}
}
// Merge fields from an existing HostDatum for this host
if (value instanceof HostDatum) {
HostDatum buffer = (HostDatum) value;
// Check homepage URL
if (buffer.hasHomepageUrl()) {
hostDatum.setHomepageUrl(buffer.getHomepageUrl());
}
// Check lastCheck timestamp
if (!buffer.isEmpty()) {
hostDatum.setLastCheck(buffer.getLastCheck());
}
// Check and set DNS failures
if (buffer.getDnsFailures() > 0) {
hostDatum.setDnsFailures(buffer.getDnsFailures());
}
// Check and set connection failures
if (buffer.getConnectionFailures() > 0) {
hostDatum.setConnectionFailures(buffer.getConnectionFailures());
}
// Check metadata
if (!buffer.getMetaData().isEmpty()) {
hostDatum.setMetaData(buffer.getMetaData());
}
// Check and set score (score from Web Graph has precedence)
if (buffer.getScore() > 0) {
hostDatum.setScore(buffer.getScore());
}
}
// Check for the score
if (value instanceof FloatWritable) {
FloatWritable buffer = (FloatWritable) value;
score = buffer.get();
}
}
// Check if score was set from Web Graph
if (score > 0) {
hostDatum.setScore(score);
}
// Set metadata
for (Map.Entry<String, Map<String, Integer>> entry : stringCounts.entrySet()) {
for (Map.Entry<String, Integer> subEntry : entry.getValue().entrySet()) {
hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new IntWritable(subEntry.getValue()));
}
}
for (Map.Entry<String, Float> entry : maximums.entrySet()) {
hostDatum.getMetaData().put(new Text("max." + entry.getKey()), new FloatWritable(entry.getValue()));
}
for (Map.Entry<String, Float> entry : sums.entrySet()) {
hostDatum.getMetaData().put(new Text("avg." + entry.getKey()), new FloatWritable(entry.getValue() / counts.get(entry.getKey())));
}
for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
// Emit all percentiles
for (int i = 0; i < percentiles.length; i++) {
hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float) entry.getValue().quantile(0.5)));
}
}
for (Map.Entry<String, Float> entry : minimums.entrySet()) {
hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
}
context.getCounter("UpdateHostDb", "total_hosts").increment(1);
// See if this record is to be checked
if (shouldCheck(hostDatum)) {
// Make an entry
resolverThread = new ResolverThread(key.toString(), hostDatum, context, purgeFailedHostsThreshold);
// Add the entry to the queue (blocking)
try {
queue.put(resolverThread);
} catch (InterruptedException e) {
LOG.error("UpdateHostDb: " + StringUtils.stringifyException(e));
}
// Do not progress, the datum will be written in the resolver thread
return;
} else {
context.getCounter("UpdateHostDb", "skipped_not_eligible").increment(1);
LOG.info("UpdateHostDb: " + key.toString() + ": skipped_not_eligible");
}
// Write the host datum if it wasn't written by the resolver thread
context.write(key, hostDatum);
}
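For clarity, here is a small standalone sketch, assuming the t-digest library (com.tdunning.math.stats.TDigest) already used above; the field name _rs_ and the sample values are invented, but the key/value shape matches what the reducer writes into the host metadata.

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import com.tdunning.math.stats.TDigest;

public class PercentileSketch {
    public static void main(String[] args) {
        // Hypothetical samples for one numeric metadata field
        TDigest digest = TDigest.createDigest(100);
        for (double v : new double[] { 120, 200, 250, 800, 1500 }) {
            digest.add(v);
        }
        for (int p : new int[] { 50, 75, 95 }) {
            // Same shape as the reducer above: pct<percentile>.<field> -> FloatWritable
            Text key = new Text("pct" + p + "._rs_");
            FloatWritable value = new FloatWritable((float) digest.quantile(p / 100.0));
            System.out.println(key + " = " + value.get());
        }
    }
}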
Use of org.apache.hadoop.io.FloatWritable in project nutch by apache.
The class AdaptiveFetchSchedule, method setFetchSchedule:
@Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime, long modifiedTime, int state) {
super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, fetchTime, modifiedTime, state);
float interval = datum.getFetchInterval();
long refTime = fetchTime;
// https://issues.apache.org/jira/browse/NUTCH-1430
interval = (interval == 0) ? defaultInterval : interval;
if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
// Is fetch interval preset in CrawlDatum MD? Then use preset interval
FloatWritable customIntervalWritable = (FloatWritable) (datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
interval = customIntervalWritable.get();
} else {
if (modifiedTime <= 0)
modifiedTime = fetchTime;
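// Adjust the interval adaptively: shrink it when the page was modified, grow it when it was not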
switch(state) {
case FetchSchedule.STATUS_MODIFIED:
interval *= (1.0f - DEC_RATE);
modifiedTime = fetchTime;
break;
case FetchSchedule.STATUS_NOTMODIFIED:
interval *= (1.0f + INC_RATE);
break;
case FetchSchedule.STATUS_UNKNOWN:
break;
}
if (SYNC_DELTA) {
// try to synchronize with the time of change
long delta = (fetchTime - modifiedTime) / 1000L;
if (delta > interval)
interval = delta;
refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
}
if (interval < MIN_INTERVAL) {
interval = MIN_INTERVAL;
} else if (interval > MAX_INTERVAL) {
interval = MAX_INTERVAL;
}
}
datum.setFetchInterval(interval);
datum.setFetchTime(refTime + Math.round(interval * 1000.0));
datum.setModifiedTime(modifiedTime);
return datum;
}
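A hedged sketch, not from the Nutch sources, of how a fixed re-fetch interval could be preset so that the FloatWritable branch above takes effect; the seven-day value is arbitrary, the surrounding class is hypothetical, and the usual org.apache.nutch.metadata.Nutch constants class is assumed.

import org.apache.hadoop.io.FloatWritable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;

public class FixedIntervalSketch {
    public static void main(String[] args) {
        CrawlDatum datum = new CrawlDatum();
        // Preset a fixed interval of 7 days (in seconds); setFetchSchedule() above
        // will then use it instead of the adaptive calculation.
        datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(7 * 24 * 3600f));
        System.out.println(datum);
    }
}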
Use of org.apache.hadoop.io.FloatWritable in project nutch by apache.
The class CrawlDbReader, method query:
public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception {
Map<String, Object> results = new HashMap<>();
String crawlDb = crawlId + "/crawldb";
if (type.equalsIgnoreCase("stats")) {
boolean sort = false;
if (args.containsKey("sort")) {
if (args.get("sort").equalsIgnoreCase("true"))
sort = true;
}
TreeMap<String, Writable> stats = processStatJobHelper(crawlDb, NutchConfiguration.create(), sort);
LongWritable totalCnt = (LongWritable) stats.get("T");
stats.remove("T");
results.put("totalUrls", String.valueOf(totalCnt.get()));
Map<String, Object> statusMap = new HashMap<>();
for (Map.Entry<String, Writable> entry : stats.entrySet()) {
String k = entry.getKey();
long val = 0L;
double fval = 0.0;
if (entry.getValue() instanceof LongWritable) {
val = ((LongWritable) entry.getValue()).get();
} else if (entry.getValue() instanceof FloatWritable) {
fval = ((FloatWritable) entry.getValue()).get();
} else if (entry.getValue() instanceof BytesWritable) {
continue;
}
if (k.equals("scn")) {
results.put("minScore", String.valueOf(fval));
} else if (k.equals("scx")) {
results.put("maxScore", String.valueOf(fval));
} else if (k.equals("sct")) {
results.put("avgScore", String.valueOf((fval / totalCnt.get())));
} else if (k.startsWith("status")) {
String[] st = k.split(" ");
int code = Integer.parseInt(st[1]);
if (st.length > 2) {
@SuppressWarnings("unchecked") Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap.get(String.valueOf(code));
Map<String, String> hostValues;
if (individualStatusInfo.containsKey("hostValues")) {
hostValues = (Map<String, String>) individualStatusInfo.get("hostValues");
} else {
hostValues = new HashMap<>();
individualStatusInfo.put("hostValues", hostValues);
}
hostValues.put(st[2], String.valueOf(val));
} else {
Map<String, Object> individualStatusInfo = new HashMap<>();
individualStatusInfo.put("statusValue", CrawlDatum.getStatusName((byte) code));
individualStatusInfo.put("count", String.valueOf(val));
statusMap.put(String.valueOf(code), individualStatusInfo);
}
} else {
results.put(k, String.valueOf(val));
}
}
results.put("status", statusMap);
return results;
}
if (type.equalsIgnoreCase("dump")) {
String output = args.get("out_dir");
String format = "normal";
String regex = null;
Integer retry = null;
String status = null;
String expr = null;
Float sample = null;
if (args.containsKey("format")) {
format = args.get("format");
}
if (args.containsKey("regex")) {
regex = args.get("regex");
}
if (args.containsKey("retry")) {
retry = Integer.parseInt(args.get("retry"));
}
if (args.containsKey("status")) {
status = args.get("status");
}
if (args.containsKey("expr")) {
expr = args.get("expr");
}
if (args.containsKey("sample")) {
sample = Float.parseFloat(args.get("sample"));
}
processDumpJob(crawlDb, output, conf, format, regex, status, retry, expr, sample);
File dumpFile = new File(output + "/part-00000");
return dumpFile;
}
if (type.equalsIgnoreCase("topN")) {
String output = args.get("out_dir");
long topN = Long.parseLong(args.get("nnn"));
float min = 0.0f;
if (args.containsKey("min")) {
min = Float.parseFloat(args.get("min"));
}
processTopNJob(crawlDb, topN, min, output, conf);
File dumpFile = new File(output + "/part-00000");
return dumpFile;
}
if (type.equalsIgnoreCase("url")) {
String url = args.get("url");
CrawlDatum res = get(crawlDb, url, conf);
results.put("status", res.getStatus());
results.put("fetchTime", new Date(res.getFetchTime()));
results.put("modifiedTime", new Date(res.getModifiedTime()));
results.put("retriesSinceFetch", res.getRetriesSinceFetch());
results.put("retryInterval", res.getFetchInterval());
results.put("score", res.getScore());
results.put("signature", StringUtil.toHexString(res.getSignature()));
Map<String, String> metadata = new HashMap<>();
if (res.getMetaData() != null) {
for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
metadata.put(String.valueOf(e.getKey()), String.valueOf(e.getValue()));
}
}
results.put("metadata", metadata);
return results;
}
return results;
}
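A brief usage sketch of the "stats" branch above, under the assumption that CrawlDbReader can be constructed with its no-argument constructor; the wrapper class, crawl id and argument values are illustrative only.

import java.util.HashMap;
import java.util.Map;
import org.apache.nutch.crawl.CrawlDbReader;
import org.apache.nutch.util.NutchConfiguration;

public class CrawlDbStatsSketch {
    public static void main(String[] args) throws Exception {
        CrawlDbReader reader = new CrawlDbReader();
        Map<String, String> params = new HashMap<>();
        params.put("sort", "true");
        // Returns the results map populated with totalUrls, minScore, maxScore, avgScore, status, ...
        Object stats = reader.query(params, NutchConfiguration.create(), "stats", "crawl-20240101");
        System.out.println(stats);
    }
}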
Use of org.apache.hadoop.io.FloatWritable in project presto by prestodb.
The class RcFileTester, method decodeRecordReaderValue:
private static Object decodeRecordReaderValue(Type type, Object actualValue) {
if (actualValue instanceof LazyPrimitive) {
actualValue = ((LazyPrimitive<?, ?>) actualValue).getWritableObject();
}
if (actualValue instanceof BooleanWritable) {
actualValue = ((BooleanWritable) actualValue).get();
} else if (actualValue instanceof ByteWritable) {
actualValue = ((ByteWritable) actualValue).get();
} else if (actualValue instanceof BytesWritable) {
actualValue = new SqlVarbinary(((BytesWritable) actualValue).copyBytes());
} else if (actualValue instanceof DateWritable) {
actualValue = new SqlDate(((DateWritable) actualValue).getDays());
} else if (actualValue instanceof DoubleWritable) {
actualValue = ((DoubleWritable) actualValue).get();
} else if (actualValue instanceof FloatWritable) {
actualValue = ((FloatWritable) actualValue).get();
} else if (actualValue instanceof IntWritable) {
actualValue = ((IntWritable) actualValue).get();
} else if (actualValue instanceof LongWritable) {
actualValue = ((LongWritable) actualValue).get();
} else if (actualValue instanceof ShortWritable) {
actualValue = ((ShortWritable) actualValue).get();
} else if (actualValue instanceof HiveDecimalWritable) {
DecimalType decimalType = (DecimalType) type;
HiveDecimalWritable writable = (HiveDecimalWritable) actualValue;
// writable messes with the scale so rescale the values to the Presto type
BigInteger rescaledValue = rescale(writable.getHiveDecimal().unscaledValue(), writable.getScale(), decimalType.getScale());
actualValue = new SqlDecimal(rescaledValue, decimalType.getPrecision(), decimalType.getScale());
} else if (actualValue instanceof Text) {
actualValue = actualValue.toString();
} else if (actualValue instanceof TimestampWritable) {
TimestampWritable timestamp = (TimestampWritable) actualValue;
if (SESSION.getSqlFunctionProperties().isLegacyTimestamp()) {
actualValue = new SqlTimestamp((timestamp.getSeconds() * 1000) + (timestamp.getNanos() / 1000000L), UTC_KEY);
} else {
actualValue = new SqlTimestamp((timestamp.getSeconds() * 1000) + (timestamp.getNanos() / 1000000L));
}
} else if (actualValue instanceof StructObject) {
StructObject structObject = (StructObject) actualValue;
actualValue = decodeRecordReaderStruct(type, structObject.getFieldsAsList());
} else if (actualValue instanceof LazyBinaryArray) {
actualValue = decodeRecordReaderList(type, ((LazyBinaryArray) actualValue).getList());
} else if (actualValue instanceof LazyBinaryMap) {
actualValue = decodeRecordReaderMap(type, ((LazyBinaryMap) actualValue).getMap());
} else if (actualValue instanceof LazyArray) {
actualValue = decodeRecordReaderList(type, ((LazyArray) actualValue).getList());
} else if (actualValue instanceof LazyMap) {
actualValue = decodeRecordReaderMap(type, ((LazyMap) actualValue).getMap());
} else if (actualValue instanceof List) {
actualValue = decodeRecordReaderList(type, ((List<?>) actualValue));
}
return actualValue;
}
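As a trivial, self-contained illustration of the FloatWritable branch above, Hadoop's writable wrapper simply round-trips the primitive float that decodeRecordReaderValue unwraps; the class name below is invented.

import org.apache.hadoop.io.FloatWritable;

public class FloatWritableRoundTrip {
    public static void main(String[] args) {
        FloatWritable writable = new FloatWritable(1.5f);
        // decodeRecordReaderValue() reduces this case to a plain Java float via get()
        float unwrapped = writable.get();
        System.out.println(unwrapped); // prints 1.5
    }
}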