Usage of com.univocity.parsers.tsv.TsvParserSettings in the Wikidata/QueryAnalysis project: class Main, method loadPreBuildQueryTypes.
/**
 * Loads all pre-build query types from the "preBuildQueryTypeFiles" directory.
 *
 * <p>Files ending in {@code .preBuildQueryType} are parsed as SPARQL, normalized,
 * and registered in {@code queryTypes} keyed by the MD5 of the normalized query dump.
 * Files ending in {@code .tsv} are parsed as a tool mapping
 * (expected row layout: queryType, toolVersion -> toolName, toolComment, plus one spare column).
 */
public static void loadPreBuildQueryTypes() {
  try (DirectoryStream<Path> directoryStream = Files.newDirectoryStream(Paths.get("preBuildQueryTypeFiles"))) {
    for (Path filePath : directoryStream) {
      if (Files.isRegularFile(filePath)) {
        if (filePath.toString().endsWith(".preBuildQueryType")) {
          // Read explicitly as UTF-8 instead of the platform default charset.
          String queryString = new String(Files.readAllBytes(filePath), java.nio.charset.StandardCharsets.UTF_8);
          OpenRDFQueryHandler queryHandler = new OpenRDFQueryHandler(QueryHandler.Validity.DEFAULT, -1L, -1, queryString, "preBuildQueryTypes", "", -1);
          if (queryHandler.getValidityStatus() != QueryHandler.Validity.VALID) {
            logger.info("The Pre-build query " + filePath + " is not valid SPARQL");
            continue;
          }
          ParsedQuery normalizedPreBuildQuery = queryHandler.getNormalizedQuery();
          // Derive the query type name from the file name (without extension).
          // Path#getFileName is used instead of lastIndexOf("/") so this also
          // works with Windows path separators.
          String fileName = filePath.getFileName().toString();
          String queryTypeName = fileName.substring(0, fileName.lastIndexOf("."));
          if (normalizedPreBuildQuery != null) {
            String queryDump = normalizedPreBuildQuery.getTupleExpr().toString();
            byte[] md5 = DigestUtils.md5(queryDump);
            // floorMod keeps the index non-negative even for negative hash codes.
            int index = Math.floorMod(queryDump.hashCode(), numberOfQueryTypeDiskMaps);
            if (queryTypes[index].containsKey(md5)) {
              String existingName = queryTypes[index].get(md5);
              if (!existingName.equals(queryTypeName)) {
                logger.info(existingName + " is duplicate of " + queryTypeName);
              }
            }
            queryTypes[index].put(md5, queryTypeName);
          } else {
            logger.info("Pre-build query " + queryTypeName + " could not be parsed.");
          }
        }
        if (filePath.toString().endsWith(".tsv")) {
          TsvParserSettings parserSettings = new TsvParserSettings();
          parserSettings.setLineSeparatorDetectionEnabled(true);
          parserSettings.setHeaderExtractionEnabled(true);
          parserSettings.setSkipEmptyLines(true);
          parserSettings.setReadInputOnSeparateThread(true);
          ObjectRowProcessor rowProcessor = new ObjectRowProcessor() {
            @Override
            public void rowProcessed(Object[] row, ParsingContext parsingContext) {
              if (row.length <= 1) {
                logger.warn("Ignoring line without tab while parsing.");
                return;
              }
              if (row.length == 5) {
                queryTypeToToolMapping.put(new Tuple2<>(row[0].toString(), row[1].toString()), new Tuple2<>(row[2].toString(), row[3].toString()));
                return;
              }
              logger.warn("Line with row length " + row.length + " found. Is the formatting of toolMapping.tsv correct?");
            }
          };
          parserSettings.setProcessor(rowProcessor);
          TsvParser parser = new TsvParser(parserSettings);
          parser.parse(filePath.toFile());
        }
      }
    }
  } catch (IOException e) {
    // Error message now names the directory this method actually reads from.
    logger.error("Could not read from directory preBuildQueryTypeFiles", e);
  }
}
Usage of com.univocity.parsers.tsv.TsvParserSettings in the Wikidata/QueryAnalysis project: class Anonymizer, method loadAllowedUserAgents.
/**
 * Loads the allowed user agent mapping from "anonymization/allowedUserAgents".
 *
 * <p>Each data row is expected to have at least two columns; column 1 is used as
 * the key and column 0 as the value in {@code allowedUserAgents}. Rows without a
 * tab are skipped with a warning.
 */
public static void loadAllowedUserAgents() {
  // try-with-resources: the original code never closed the reader (resource leak).
  // The charset is pinned to UTF-8 instead of relying on the platform default.
  try (InputStreamReader reader = new InputStreamReader(
      new FileInputStream("anonymization/allowedUserAgents"),
      java.nio.charset.StandardCharsets.UTF_8)) {
    TsvParserSettings parserSettings = new TsvParserSettings();
    parserSettings.setLineSeparatorDetectionEnabled(true);
    parserSettings.setHeaderExtractionEnabled(true);
    parserSettings.setSkipEmptyLines(true);
    parserSettings.setReadInputOnSeparateThread(true);
    ObjectRowProcessor rowProcessor = new ObjectRowProcessor() {
      @Override
      public void rowProcessed(Object[] row, ParsingContext parsingContext) {
        if (row.length <= 1) {
          logger.warn("Ignoring line without tab while parsing.");
          return;
        }
        allowedUserAgents.put(row[1].toString(), row[0].toString());
      }
    };
    parserSettings.setProcessor(rowProcessor);
    TsvParser parser = new TsvParser(parserSettings);
    parser.parse(reader);
  } catch (IOException e) {
    // Broadened from FileNotFoundException so the close() failure path is covered too.
    logger.error("Could not read the allowed user agent string mapping.", e);
  }
}
Usage of com.univocity.parsers.tsv.TsvParserSettings in the eol-globi-data project by jhpoelen: class DatasetImporterForHurlbert, method importStudy.
/**
 * Imports the Hurlbert dataset: streams RESOURCE as UTF-8 TSV with a header row,
 * and imports each record that carries a non-blank "Source" citation column.
 *
 * @throws StudyImporterException if RESOURCE cannot be read
 */
@Override
public void importStudy() throws StudyImporterException {
  // (removed a stray empty statement ";" that preceded the try block)
  try (InputStream resource = getDataset().retrieve(RESOURCE)) {
    setCurrentResource(RESOURCE);
    // Accumulators shared across all records of this import run.
    Set<String> regions = new HashSet<String>();
    Set<String> locales = new HashSet<String>();
    Set<String> habitats = new HashSet<String>();
    TsvParserSettings settings = new TsvParserSettings();
    settings.getFormat().setLineSeparator("\n");
    settings.setHeaderExtractionEnabled(true);
    TsvParser parser = new TsvParser(settings);
    parser.beginParsing(resource, CharsetConstant.UTF8);
    Record record;
    while ((record = parser.parseNextRecord()) != null) {
      // Track the current line so downstream log/error messages can point at it.
      setCurrentLine(parser.getContext().currentLine());
      String columnNameSource = "Source";
      String sourceCitation = columnValueOrNull(record, columnNameSource);
      if (StringUtils.isBlank(sourceCitation)) {
        LOG.warn(createMsg("failed to extract source from column [" + columnNameSource + "] in [" + RESOURCE + "]"));
      } else {
        importRecords(regions, locales, habitats, record, sourceCitation);
      }
    }
  } catch (IOException e) {
    throw new StudyImporterException("failed to access [" + RESOURCE + "]", e);
  }
}
Usage of com.univocity.parsers.tsv.TsvParserSettings in the eol-globi-data project by jhpoelen: class DatasetImporterForFishbase3, method handleTsvInputStream.
/**
 * Parses the supplied stream as UTF-8 tab-separated values (header row expected,
 * "\n" line separator, columns up to 32 KiB) and forwards every parsed record to
 * the listener. The stream is closed when parsing completes or fails.
 *
 * @throws StudyImporterException if reading the stream fails
 */
public static void handleTsvInputStream(RecordListener listener, InputStream is) throws StudyImporterException {
  try (InputStream managed = is) {
    TsvParserSettings tsvSettings = new TsvParserSettings();
    tsvSettings.getFormat().setLineSeparator("\n");
    tsvSettings.setMaxCharsPerColumn(4096 * 8);
    tsvSettings.setHeaderExtractionEnabled(true);
    TsvParser tsvParser = new TsvParser(tsvSettings);
    tsvParser.beginParsing(managed, CharsetConstant.UTF8);
    for (Record parsed = tsvParser.parseNextRecord(); parsed != null; parsed = tsvParser.parseNextRecord()) {
      listener.onRecord(parsed);
    }
  } catch (IOException e) {
    throw new StudyImporterException("failed to import tsv stream", e);
  }
}
Usage of com.univocity.parsers.tsv.TsvParserSettings in the Wikidata/QueryAnalysis project: class Main, method loadPropertyGroupMapping.
/**
 * Loads the mapping of property to groups from
 * "propertyClassification/propertyGroupMapping.tsv".
 *
 * <p>Each data row is expected to have exactly two columns: the property name and a
 * comma-separated list of groups. Rows without a tab, or with null cells, are skipped.
 */
private static void loadPropertyGroupMapping() {
  TsvParserSettings parserSettings = new TsvParserSettings();
  parserSettings.setLineSeparatorDetectionEnabled(true);
  parserSettings.setHeaderExtractionEnabled(true);
  parserSettings.setSkipEmptyLines(true);
  parserSettings.setReadInputOnSeparateThread(true);
  ObjectRowProcessor rowProcessor = new ObjectRowProcessor() {
    @Override
    public void rowProcessed(Object[] row, ParsingContext parsingContext) {
      if (row.length <= 1) {
        logger.warn("Ignoring line without tab while parsing.");
        return;
      }
      if (row.length == 2) {
        // Original code only null-checked row[1]; a null row[0] would have
        // thrown a NullPointerException in row[0].toString(). Skip both cases.
        if (row[0] == null || row[1] == null) {
          return;
        }
        propertyGroupMapping.put(row[0].toString(), new HashSet<String>(Arrays.asList(row[1].toString().split(","))));
        return;
      }
      logger.warn("Line with row length " + row.length + " found. Is the formatting of propertyGroupMapping.tsv correct?");
    }
  };
  parserSettings.setProcessor(rowProcessor);
  TsvParser parser = new TsvParser(parserSettings);
  File file = new File("propertyClassification/propertyGroupMapping.tsv");
  parser.parse(file);
}
Aggregations