use of org.wso2.carbon.registry.indexing.solr.IndexDocument in project carbon-apimgt by wso2.
the class PDFIndexer method getIndexedDocument.
/**
 * Builds the index document for a PDF resource.
 *
 * The PDF is parsed with the parser returned by {@code getPdfParser}, its text is
 * extracted with the stripper returned by {@code getPdfTextStripper}, and the result
 * is wrapped in an {@link IndexDocument} carrying the "path" and media-type fields.
 * When {@code fileData.mediaType} is null the media type defaults to "application/pdf".
 *
 * @param fileData the resource to index; its {@code path} and {@code mediaType} are read here
 * @return the populated index document
 * @throws SolrException with {@code ErrorCode.SERVER_ERROR} when an {@link IOException}
 *                       occurs while parsing or extracting text
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    COSDocument parsedPdf = null;
    try {
        PDFParser pdfParser = getPdfParser(fileData);
        pdfParser.parse();
        parsedPdf = pdfParser.getDocument();

        String extractedText = getPdfTextStripper().getText(new PDDocument(parsedPdf));
        IndexDocument result = new IndexDocument(fileData.path, extractedText, null);

        // Fall back to the PDF media type when the resource does not declare one.
        String mediaType = (fileData.mediaType != null) ? fileData.mediaType : "application/pdf";

        Map<String, List<String>> indexFields = new HashMap<String, List<String>>();
        indexFields.put("path", Collections.singletonList(fileData.path));
        indexFields.put(IndexingConstants.FIELD_MEDIA_TYPE, Collections.singletonList(mediaType));
        result.setFields(indexFields);
        return result;
    } catch (IOException ioe) {
        String errorMessage = "Failed to write to the index";
        log.error(errorMessage, ioe);
        throw new SolrException(ErrorCode.SERVER_ERROR, errorMessage, ioe);
    } finally {
        // Always release the parsed document; a failure to close is logged, not rethrown.
        if (parsedPdf != null) {
            try {
                parsedPdf.close();
            } catch (IOException closeFailure) {
                log.error("Failed to close pdf doc stream ", closeFailure);
            }
        }
    }
}
use of org.wso2.carbon.registry.indexing.solr.IndexDocument in project carbon-apimgt by wso2.
the class CustomAPIIndexer method getIndexedDocument.
/**
 * Builds the index document for an API artifact and exposes its custom properties as
 * searchable fields.
 *
 * The registry path is derived by stripping the governance base-path prefix from
 * {@code fileData.path}. Paths containing "/apimgt/applicationdata/apis/" are not indexed
 * and yield {@code null}. Otherwise the document produced by the superclass is returned,
 * with one extra field per resource property whose name starts with
 * {@code APIConstants.API_RELATED_CUSTOM_PROPERTIES_PREFIX}; the field name is the property
 * name prefixed with {@code OVERVIEW_PREFIX} and the values are passed through
 * {@code getLowerCaseList}.
 *
 * @param fileData the resource to index; {@code tenantId} and {@code path} are read here
 * @return the index document, or {@code null} for paths that are skipped
 * @throws SolrException     declared by the signature; not thrown directly in this method
 * @throws RegistryException declared by the signature; not thrown directly in this method
 */
public IndexDocument getIndexedDocument(AsyncIndexer.File2Index fileData) throws SolrException, RegistryException {
    Registry governanceRegistry = GovernanceUtils.getGovernanceSystemRegistry(
            IndexingManager.getInstance().getRegistry(fileData.tenantId));
    String apiPath = fileData.path.substring(RegistryConstants.GOVERNANCE_REGISTRY_BASE_PATH.length());

    if (apiPath.contains("/apimgt/applicationdata/apis/")) {
        return null;
    }

    Resource apiResource = governanceRegistry.resourceExists(apiPath) ? governanceRegistry.get(apiPath) : null;

    if (log.isDebugEnabled()) {
        log.debug("CustomAPIIndexer is currently indexing the api at path " + apiPath);
    }

    // Properties are added as fields so they can be searched the same way as attributes.
    IndexDocument indexDocument = super.getIndexedDocument(fileData);
    Map<String, List<String>> fields = indexDocument.getFields();
    if (apiResource == null) {
        return indexDocument;
    }

    for (Enumeration<?> names = apiResource.getProperties().propertyNames(); names.hasMoreElements();) {
        String propertyName = (String) names.nextElement();
        if (log.isDebugEnabled()) {
            log.debug("API at " + apiPath + " has " + propertyName + " property");
        }
        if (!propertyName.startsWith(APIConstants.API_RELATED_CUSTOM_PROPERTIES_PREFIX)) {
            continue;
        }
        String fieldName = OVERVIEW_PREFIX + propertyName;
        fields.put(fieldName, getLowerCaseList(apiResource.getPropertyValues(propertyName)));
        if (log.isDebugEnabled()) {
            log.debug(propertyName + " is added as " + fieldName + " field for indexing");
        }
    }
    indexDocument.setFields(fields);
    return indexDocument;
}
use of org.wso2.carbon.registry.indexing.solr.IndexDocument in project carbon-apimgt by wso2.
the class MSExcelIndexer method getIndexedDocument.
/**
 * Builds the index document for an Excel resource.
 *
 * Text extraction is first attempted with the extractor from {@code getExcelExtractor}
 * (legacy .xls workbooks). If that raises {@code OfficeXmlFileException}, the extractor
 * from {@code getXssfExcelExtractor} (.xlsx workbooks) is used instead. Any other
 * exception raised by the first attempt is logged and the document is indexed with
 * {@code null} text. When {@code fileData.mediaType} is null the media type defaults to
 * "application/vnd.ms-excel".
 *
 * @param fileData the resource to index; its {@code path} and {@code mediaType} are read here
 * @return the populated index document
 * @throws SolrException with {@code ErrorCode.SERVER_ERROR} when an {@link IOException}
 *                       escapes the extraction (for example from the .xlsx fallback)
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
        String sheetText = null;
        try {
            // First attempt: Excel 2003 (.xls) workbooks.
            sheetText = getExcelExtractor(fileData).getText();
        } catch (OfficeXmlFileException notLegacyFormat) {
            // The file is OOXML, so retry with the Excel 2007 (.xlsx) extractor.
            sheetText = getXssfExcelExtractor(fileData).getText();
        } catch (Exception extractionFailure) {
            String errorMessage = "Failed to extract the document";
            log.error(errorMessage, extractionFailure);
        }

        IndexDocument result = new IndexDocument(fileData.path, sheetText, null);

        String mediaType = (fileData.mediaType != null) ? fileData.mediaType : "application/vnd.ms-excel";

        Map<String, List<String>> indexFields = new HashMap<String, List<String>>();
        indexFields.put("path", Collections.singletonList(fileData.path));
        indexFields.put(IndexingConstants.FIELD_MEDIA_TYPE, Collections.singletonList(mediaType));
        result.setFields(indexFields);
        return result;
    } catch (IOException ioe) {
        String errorMessage = "Failed to write to the index";
        log.error(errorMessage, ioe);
        throw new SolrException(ErrorCode.SERVER_ERROR, errorMessage, ioe);
    }
}
use of org.wso2.carbon.registry.indexing.solr.IndexDocument in project carbon-apimgt by wso2.
the class MSWordIndexer method getIndexedDocument.
/**
 * Builds the index document for a Microsoft Word resource.
 *
 * Text extraction is first attempted with {@code WordExtractor} (Word 2003 binary format).
 * If that raises {@code OfficeXmlFileException}, {@code XWPFWordExtractor} (Word 2007
 * format) is used instead. Any other exception raised by the first attempt is logged and
 * the document is indexed with {@code null} text.
 *
 * When {@code fileData.mediaType} is null the media type defaults to "application/msword".
 *
 * @param fileData the resource to index; its {@code data}, {@code path} and
 *                 {@code mediaType} are read here
 * @return the populated index document
 * @throws SolrException with {@code ErrorCode.SERVER_ERROR} when an {@link IOException}
 *                       escapes the extraction (for example from the Word 2007 fallback)
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException {
    try {
        String wordText = null;
        try {
            // Extract MSWord 2003 document files
            POIFSFileSystem fs = new POIFSFileSystem(new ByteArrayInputStream(fileData.data));
            WordExtractor msWord2003Extractor = new WordExtractor(fs);
            wordText = msWord2003Extractor.getText();
        } catch (OfficeXmlFileException e) {
            // if 2003 extraction failed, try with MSWord 2007 document files extractor
            XWPFDocument doc = new XWPFDocument(new ByteArrayInputStream(fileData.data));
            XWPFWordExtractor msWord2007Extractor = new XWPFWordExtractor(doc);
            wordText = msWord2007Extractor.getText();
        } catch (Exception e) {
            // The reason for not throwing an exception is that since this is an indexer that runs in the
            // background throwing an exception might lead to adverse behaviors in the client side and
            // might lead to other files not being indexed
            String msg = "Failed to extract the document while indexing";
            log.error(msg, e);
        }
        IndexDocument indexDoc = new IndexDocument(fileData.path, wordText, null);
        Map<String, List<String>> fields = new HashMap<String, List<String>>();
        fields.put("path", Collections.singletonList(fileData.path));
        if (fileData.mediaType != null) {
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Collections.singletonList(fileData.mediaType));
        } else {
            // Default to the Word media type; the previous "application/pdf" fallback
            // mislabelled Word documents in the index.
            fields.put(IndexingConstants.FIELD_MEDIA_TYPE, Collections.singletonList("application/msword"));
        }
        indexDoc.setFields(fields);
        return indexDoc;
    } catch (IOException e) {
        String msg = "Failed to write to the index";
        log.error(msg, e);
        throw new SolrException(ErrorCode.SERVER_ERROR, msg, e);
    }
}
use of org.wso2.carbon.registry.indexing.solr.IndexDocument in project carbon-apimgt by wso2.
the class PlainTextIndexer method getIndexedDocument.
/**
 * Builds the index document for a plain-text resource.
 *
 * The raw bytes in {@code fileData.data} are decoded with {@code RegistryUtils.decodeBytes}
 * and used as the document content. The document carries the "path" and media-type fields;
 * when {@code fileData.mediaType} is null the media type defaults to "text/(.)".
 *
 * @param fileData the resource to index; its {@code data}, {@code path} and
 *                 {@code mediaType} are read here
 * @return the populated index document
 * @throws SolrException     declared by the signature; not thrown directly in this method
 * @throws RegistryException declared by the signature; not thrown directly in this method
 */
public IndexDocument getIndexedDocument(File2Index fileData) throws SolrException, RegistryException {
    String decodedContent = RegistryUtils.decodeBytes(fileData.data);
    IndexDocument result = new IndexDocument(fileData.path, decodedContent, null);

    String mediaType = (fileData.mediaType != null) ? fileData.mediaType : "text/(.)";

    Map<String, List<String>> indexFields = new HashMap<String, List<String>>();
    indexFields.put("path", Arrays.asList(fileData.path));
    indexFields.put(IndexingConstants.FIELD_MEDIA_TYPE, Arrays.asList(mediaType));
    result.setFields(indexFields);
    return result;
}
Aggregations