Search in sources :

Example 1 with DocumentAnalysisRequest

use of org.apache.solr.client.solrj.request.DocumentAnalysisRequest in project lucene-solr by apache.

the class DocumentAnalysisRequestHandlerTest method testCharsetOutsideDocument.

// This test should also test charset detection in UpdateRequestHandler,
// but the DocumentAnalysisRequestHandler is simpler to use/check.
//
// Scenario: the XML body carries no encoding declaration; the charset is only
// announced through the content type of the content stream. The non-ASCII
// id value must still be decoded correctly.
@Test
public void testCharsetOutsideDocument() throws Exception {
    final byte[] xmlBytes = ("<docs>\r\n" + " <doc>\r\n" + "  <field name=\"id\">Müller</field>\r\n" + " </doc>" + "</docs>").getBytes(StandardCharsets.ISO_8859_1);
    // we declare a content stream with charset:
    final ContentStream cs = new ByteStream(xmlBytes, "application/xml; charset=ISO-8859-1");
    ModifiableSolrParams params = new ModifiableSolrParams();
    SolrQueryRequest req = new SolrQueryRequestBase(h.getCore(), params) {

        @Override
        public Iterable<ContentStream> getContentStreams() {
            return Collections.singleton(cs);
        }
    };
    try {
        DocumentAnalysisRequest request = handler.resolveAnalysisRequest(req);
        assertNotNull(request);
        final List<SolrInputDocument> documents = request.getDocuments();
        assertNotNull(documents);
        assertEquals(1, documents.size());
        SolrInputDocument doc = documents.get(0);
        assertEquals("Müller", doc.getField("id").getValue());
    } finally {
        // always release the request, including when an assertion above fails
        req.close();
    }
}
Also used : ContentStream(org.apache.solr.common.util.ContentStream) SolrQueryRequest(org.apache.solr.request.SolrQueryRequest) SolrInputDocument(org.apache.solr.common.SolrInputDocument) SolrQueryRequestBase(org.apache.solr.request.SolrQueryRequestBase) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) DocumentAnalysisRequest(org.apache.solr.client.solrj.request.DocumentAnalysisRequest) Test(org.junit.Test)

Example 2 with DocumentAnalysisRequest

use of org.apache.solr.client.solrj.request.DocumentAnalysisRequest in project lucene-solr by apache.

the class DocumentAnalysisRequestHandlerTest method testCharsetInDocument.

// This test should also test charset detection in UpdateRequestHandler,
// but the DocumentAnalysisRequestHandler is simpler to use/check.
//
// Scenario: the content type of the stream names no charset; the encoding is
// only declared in the XML prolog of the body. The non-ASCII id value must
// still be decoded correctly.
@Test
public void testCharsetInDocument() throws Exception {
    final byte[] xmlBytes = ("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\r\n" + "<docs>\r\n" + " <doc>\r\n" + "  <field name=\"id\">Müller</field>\r\n" + " </doc>" + "</docs>").getBytes(StandardCharsets.ISO_8859_1);
    // we declare a content stream without charset:
    final ContentStream cs = new ByteStream(xmlBytes, "application/xml");
    ModifiableSolrParams params = new ModifiableSolrParams();
    SolrQueryRequest req = new SolrQueryRequestBase(h.getCore(), params) {

        @Override
        public Iterable<ContentStream> getContentStreams() {
            return Collections.singleton(cs);
        }
    };
    try {
        DocumentAnalysisRequest request = handler.resolveAnalysisRequest(req);
        assertNotNull(request);
        final List<SolrInputDocument> documents = request.getDocuments();
        assertNotNull(documents);
        assertEquals(1, documents.size());
        SolrInputDocument doc = documents.get(0);
        assertEquals("Müller", doc.getField("id").getValue());
    } finally {
        // always release the request, including when an assertion above fails
        req.close();
    }
}
Also used : ContentStream(org.apache.solr.common.util.ContentStream) SolrQueryRequest(org.apache.solr.request.SolrQueryRequest) SolrInputDocument(org.apache.solr.common.SolrInputDocument) SolrQueryRequestBase(org.apache.solr.request.SolrQueryRequestBase) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) DocumentAnalysisRequest(org.apache.solr.client.solrj.request.DocumentAnalysisRequest) Test(org.junit.Test)

Example 3 with DocumentAnalysisRequest

use of org.apache.solr.client.solrj.request.DocumentAnalysisRequest in project lucene-solr by apache.

the class DocumentAnalysisRequestHandler method resolveAnalysisRequest.

//================================================ Helper Methods ==================================================
/**
 * Resolves the {@link DocumentAnalysisRequest} from the given solr request.
 * <p>
 * The query is read from {@link AnalysisParams#QUERY}, falling back to {@link CommonParams#Q};
 * the show-match flag is read from {@link AnalysisParams#SHOW_MATCH} and defaults to {@code false}.
 * The single content stream of the request is then parsed as XML and every {@code doc} element
 * found in it is added to the returned request as a {@link SolrInputDocument}.
 *
 * @param req The solr request.
 *
 * @return The resolved document analysis request.
 *
 * @throws IOException        Thrown when reading/parsing the content stream of the request fails.
 * @throws XMLStreamException Thrown when reading/parsing the content stream of the request fails.
 */
DocumentAnalysisRequest resolveAnalysisRequest(SolrQueryRequest req) throws IOException, XMLStreamException {
    DocumentAnalysisRequest request = new DocumentAnalysisRequest();
    SolrParams params = req.getParams();
    // the analysis-specific query parameter wins; the common "q" parameter is the fallback
    String query = params.get(AnalysisParams.QUERY, params.get(CommonParams.Q, null));
    request.setQuery(query);
    boolean showMatch = params.getBool(AnalysisParams.SHOW_MATCH, false);
    request.setShowMatch(showMatch);
    ContentStream stream = extractSingleContentStream(req);
    InputStream is = null;
    XMLStreamReader parser = null;
    try {
        is = stream.getStream();
        // Honour a charset given on the content type; without one the reader is created
        // from the raw bytes and the XML parser works out the encoding on its own.
        final String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        parser = (charset == null) ? inputFactory.createXMLStreamReader(is) : inputFactory.createXMLStreamReader(is, charset);
        while (true) {
            int event = parser.next();
            switch (event) {
                case XMLStreamConstants.END_DOCUMENT:
                    // the parser and the stream are released in the finally block below
                    return request;
                case XMLStreamConstants.START_ELEMENT:
                    if ("doc".equals(parser.getLocalName())) {
                        log.trace("Reading doc...");
                        SolrInputDocument document = readDocument(parser, req.getSchema());
                        request.addDocument(document);
                    }
                    break;
                default:
                    // every other event type is irrelevant at this level
                    break;
            }
        }
    } finally {
        // Nested so that the input stream is closed even when closing the parser throws.
        try {
            if (parser != null) {
                parser.close();
            }
        } finally {
            IOUtils.closeQuietly(is);
        }
    }
}
Also used : ContentStream(org.apache.solr.common.util.ContentStream) SolrInputDocument(org.apache.solr.common.SolrInputDocument) XMLStreamReader(javax.xml.stream.XMLStreamReader) InputStream(java.io.InputStream) SolrParams(org.apache.solr.common.params.SolrParams) DocumentAnalysisRequest(org.apache.solr.client.solrj.request.DocumentAnalysisRequest)

Example 4 with DocumentAnalysisRequest

use of org.apache.solr.client.solrj.request.DocumentAnalysisRequest in project lucene-solr by apache.

the class DocumentAnalysisRequestHandlerTest method testResolveAnalysisRequest.

/**
 * Tests the {@link DocumentAnalysisRequestHandler#resolveAnalysisRequest(org.apache.solr.request.SolrQueryRequest)}
 * <p>
 * Feeds a single-document XML body together with the {@code analysis.query} and
 * {@code analysis.showmatch} parameters and checks that the query, the show-match flag and
 * all three document fields end up in the resolved request.
 */
@Test
public void testResolveAnalysisRequest() throws Exception {
    String docsInput = "<docs>" + "<doc>" + "<field name=\"id\">1</field>" + "<field name=\"whitetok\">The Whitetok</field>" + "<field name=\"text\">The Text</field>" + "</doc>" + "</docs>";
    final ContentStream cs = new ContentStreamBase.StringStream(docsInput);
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.add("analysis.query", "The Query String");
    params.add("analysis.showmatch", "true");
    SolrQueryRequest req = new SolrQueryRequestBase(h.getCore(), params) {

        @Override
        public Iterable<ContentStream> getContentStreams() {
            return Collections.singleton(cs);
        }
    };
    try {
        DocumentAnalysisRequest request = handler.resolveAnalysisRequest(req);
        assertNotNull(request);
        assertTrue(request.isShowMatch());
        // assertEquals already fails on a null query, so no separate null check is needed
        assertEquals("The Query String", request.getQuery());
        List<SolrInputDocument> documents = request.getDocuments();
        assertNotNull(documents);
        assertEquals(1, documents.size());
        SolrInputDocument document = documents.get(0);
        SolrInputField field = document.getField("id");
        assertNotNull(field);
        assertEquals("1", field.getFirstValue());
        field = document.getField("whitetok");
        assertNotNull(field);
        assertEquals("The Whitetok", field.getFirstValue());
        field = document.getField("text");
        assertNotNull(field);
        assertEquals("The Text", field.getFirstValue());
    } finally {
        // release the request even when one of the assertions above fails
        req.close();
    }
}
Also used : ContentStream(org.apache.solr.common.util.ContentStream) SolrQueryRequest(org.apache.solr.request.SolrQueryRequest) SolrInputDocument(org.apache.solr.common.SolrInputDocument) SolrInputField(org.apache.solr.common.SolrInputField) SolrQueryRequestBase(org.apache.solr.request.SolrQueryRequestBase) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) DocumentAnalysisRequest(org.apache.solr.client.solrj.request.DocumentAnalysisRequest) Test(org.junit.Test)

Example 5 with DocumentAnalysisRequest

use of org.apache.solr.client.solrj.request.DocumentAnalysisRequest in project lucene-solr by apache.

the class DocumentAnalysisRequestHandlerTest method testHandleAnalysisRequest.

/**
   * Tests the {@link DocumentAnalysisRequestHandler#handleAnalysisRequest(org.apache.solr.client.solrj.request.DocumentAnalysisRequest,
   * org.apache.solr.schema.IndexSchema)}
   * <p>
   * Builds one document with an "id", a "whitetok" and a "text" field, analyses it with the
   * query "JUMPING" and show-match enabled, and then checks the token lists reported for the
   * query and for the indexed value of the "whitetok" and "text" fields, stage by stage.
   */
@Test
public void testHandleAnalysisRequest() throws Exception {
    SolrInputDocument document = new SolrInputDocument();
    document.addField("id", 1);
    document.addField("whitetok", "Jumping Jack");
    document.addField("text", "The Fox Jumped Over The Dogs");
    DocumentAnalysisRequest request = new DocumentAnalysisRequest().setQuery("JUMPING").setShowMatch(true).addDocument(document);
    NamedList<Object> result = handler.handleAnalysisRequest(request, h.getCore().getLatestSchema());
    assertNotNull("result is null and it shouldn't be", result);
    // the per-document result is looked up under "1", the id value added above
    NamedList<NamedList<NamedList<Object>>> documentResult = (NamedList<NamedList<NamedList<Object>>>) result.get("1");
    assertNotNull("An analysis for document with key '1' should be returned", documentResult);
    // the id field
    NamedList<NamedList<Object>> idResult = documentResult.get("id");
    assertNotNull("an analysis for the 'id' field should be returned", idResult);
    // These locals are reused for every field checked below. They are declared here because
    // the declarations inside the disabled block that follows are commented out with it.
    NamedList<Object> queryResult;
    List<NamedList> tokenList;
    NamedList<Object> indexResult;
    NamedList<List<NamedList>> valueResult;
    // NOTE(review): the TokenInfo arguments used throughout look like
    // (text, rawText, type, start, end, position, positionHistory, payload, match) —
    // confirm against the TokenInfo constructor.
    /*** Much of this test seems invalid for a numeric "id" field
    NamedList<Object> queryResult = idResult.get("query");
    assertEquals("Only the default analyzer should be applied", 1, queryResult.size());
    String name = queryResult.getName(0);
    assertTrue("Only the default analyzer should be applied", name.matches("org.apache.solr.schema.FieldType\\$DefaultAnalyzer.*"));
    List<NamedList> tokenList = (List<NamedList>) queryResult.getVal(0);
    assertEquals("Query has only one token", 1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "word", 0, 7, 1, new int[]{1}, null, false));
    NamedList<Object> indexResult = idResult.get("index");

    assertEquals("The id field has only a single value", 1, indexResult.size());
    NamedList<List<NamedList>> valueResult = (NamedList<List<NamedList>>) indexResult.get("1");
    assertEquals("Only the default analyzer should be applied", 1, valueResult.size());
    name = queryResult.getName(0);
    assertTrue("Only the default analyzer should be applied", name.matches("org.apache.solr.schema.FieldType\\$DefaultAnalyzer.*"));
    tokenList = valueResult.getVal(0);
    assertEquals("The 'id' field value has only one token", 1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("1", null, "word", 0, 1, 1, new int[]{1}, null, false));
    ***/
    // the whitetok field
    NamedList<NamedList<Object>> whitetokResult = documentResult.get("whitetok");
    assertNotNull("an analysis for the 'whitetok' field should be returned", whitetokResult);
    // query side: the token list is looked up by the MockTokenizer class name
    queryResult = whitetokResult.get("query");
    tokenList = (List<NamedList>) queryResult.get(MockTokenizer.class.getName());
    assertNotNull("Expecting the 'MockTokenizer' to be applied on the query for the 'whitetok' field", tokenList);
    assertEquals("Query has only one token", 1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "word", 0, 7, 1, new int[] { 1 }, null, false));
    // index side: entries are keyed by the raw field value; the expected tokens keep their original case
    indexResult = whitetokResult.get("index");
    assertEquals("The 'whitetok' field has only a single value", 1, indexResult.size());
    valueResult = (NamedList<List<NamedList>>) indexResult.get("Jumping Jack");
    tokenList = valueResult.getVal(0);
    assertEquals("Expecting 2 tokens to be present", 2, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("Jumping", null, "word", 0, 7, 1, new int[] { 1 }, null, false));
    assertToken(tokenList.get(1), new TokenInfo("Jack", null, "word", 8, 12, 2, new int[] { 2 }, null, false));
    // the text field
    NamedList<NamedList<Object>> textResult = documentResult.get("text");
    assertNotNull("an analysis for the 'text' field should be returned", textResult);
    // query side: each stage is looked up by its class name, and the expected int[]
    // gains one entry per stage (1 entry after the tokenizer, 5 after the last filter)
    queryResult = textResult.get("query");
    tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.standard.StandardTokenizer");
    assertNotNull("Expecting the 'StandardTokenizer' to be applied on the query for the 'text' field", tokenList);
    assertEquals("Query has only one token", 1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, new int[] { 1 }, null, false));
    tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.standard.StandardFilter");
    assertNotNull("Expecting the 'StandardFilter' to be applied on the query for the 'text' field", tokenList);
    assertEquals("Query has only one token", 1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("JUMPING", null, "<ALPHANUM>", 0, 7, 1, new int[] { 1, 1 }, null, false));
    // from the LowerCaseFilter stage on, the expected token text is lower-cased
    tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
    assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the query for the 'text' field", tokenList);
    assertEquals("Query has only one token", 1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[] { 1, 1, 1 }, null, false));
    tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.core.StopFilter");
    assertNotNull("Expecting the 'StopFilter' to be applied on the query for the 'text' field", tokenList);
    assertEquals("Query has only one token", 1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("jumping", null, "<ALPHANUM>", 0, 7, 1, new int[] { 1, 1, 1, 1 }, null, false));
    // after stemming the query token is expected to be "jump"; the expected offsets stay 0..7
    tokenList = (List<NamedList>) queryResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
    assertNotNull("Expecting the 'PorterStemFilter' to be applied on the query for the 'text' field", tokenList);
    assertEquals("Query has only one token", 1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("jump", null, "<ALPHANUM>", 0, 7, 1, new int[] { 1, 1, 1, 1, 1 }, null, false));
    // index side: same stages, checked against the six words of the indexed value
    indexResult = textResult.get("index");
    assertEquals("The 'text' field has only a single value", 1, indexResult.size());
    valueResult = (NamedList<List<NamedList>>) indexResult.get("The Fox Jumped Over The Dogs");
    tokenList = valueResult.get("org.apache.lucene.analysis.standard.StandardTokenizer");
    assertNotNull("Expecting the 'StandardTokenizer' to be applied on the index for the 'text' field", tokenList);
    assertEquals("Expecting 6 tokens", 6, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("The", null, "<ALPHANUM>", 0, 3, 1, new int[] { 1 }, null, false));
    assertToken(tokenList.get(1), new TokenInfo("Fox", null, "<ALPHANUM>", 4, 7, 2, new int[] { 2 }, null, false));
    assertToken(tokenList.get(2), new TokenInfo("Jumped", null, "<ALPHANUM>", 8, 14, 3, new int[] { 3 }, null, false));
    assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, new int[] { 4 }, null, false));
    assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, new int[] { 5 }, null, false));
    assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, new int[] { 6 }, null, false));
    tokenList = valueResult.get("org.apache.lucene.analysis.standard.StandardFilter");
    assertNotNull("Expecting the 'StandardFilter' to be applied on the index for the 'text' field", tokenList);
    assertEquals("Expecting 6 tokens", 6, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("The", null, "<ALPHANUM>", 0, 3, 1, new int[] { 1, 1 }, null, false));
    assertToken(tokenList.get(1), new TokenInfo("Fox", null, "<ALPHANUM>", 4, 7, 2, new int[] { 2, 2 }, null, false));
    assertToken(tokenList.get(2), new TokenInfo("Jumped", null, "<ALPHANUM>", 8, 14, 3, new int[] { 3, 3 }, null, false));
    assertToken(tokenList.get(3), new TokenInfo("Over", null, "<ALPHANUM>", 15, 19, 4, new int[] { 4, 4 }, null, false));
    assertToken(tokenList.get(4), new TokenInfo("The", null, "<ALPHANUM>", 20, 23, 5, new int[] { 5, 5 }, null, false));
    assertToken(tokenList.get(5), new TokenInfo("Dogs", null, "<ALPHANUM>", 24, 28, 6, new int[] { 6, 6 }, null, false));
    tokenList = valueResult.get("org.apache.lucene.analysis.core.LowerCaseFilter");
    assertNotNull("Expecting the 'LowerCaseFilter' to be applied on the index for the 'text' field", tokenList);
    assertEquals("Expecting 6 tokens", 6, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("the", null, "<ALPHANUM>", 0, 3, 1, new int[] { 1, 1, 1 }, null, false));
    assertToken(tokenList.get(1), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[] { 2, 2, 2 }, null, false));
    assertToken(tokenList.get(2), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[] { 3, 3, 3 }, null, false));
    assertToken(tokenList.get(3), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[] { 4, 4, 4 }, null, false));
    assertToken(tokenList.get(4), new TokenInfo("the", null, "<ALPHANUM>", 20, 23, 5, new int[] { 5, 5, 5 }, null, false));
    assertToken(tokenList.get(5), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[] { 6, 6, 6 }, null, false));
    // both "the" tokens (positions 1 and 5) are expected to be gone after the StopFilter;
    // the four remaining tokens keep their original positions 2, 3, 4 and 6
    tokenList = valueResult.get("org.apache.lucene.analysis.core.StopFilter");
    assertNotNull("Expecting the 'StopFilter' to be applied on the index for the 'text' field", tokenList);
    assertEquals("Expecting 4 tokens after stop word removal", 4, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[] { 2, 2, 2, 2 }, null, false));
    assertToken(tokenList.get(1), new TokenInfo("jumped", null, "<ALPHANUM>", 8, 14, 3, new int[] { 3, 3, 3, 3 }, null, false));
    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[] { 4, 4, 4, 4 }, null, false));
    assertToken(tokenList.get(3), new TokenInfo("dogs", null, "<ALPHANUM>", 24, 28, 6, new int[] { 6, 6, 6, 6 }, null, false));
    tokenList = valueResult.get("org.apache.lucene.analysis.en.PorterStemFilter");
    assertNotNull("Expecting the 'PorterStemFilter' to be applied on the index for the 'text' field", tokenList);
    assertEquals("Expecting 4 tokens", 4, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("fox", null, "<ALPHANUM>", 4, 7, 2, new int[] { 2, 2, 2, 2, 2 }, null, false));
    // "jump" is the only expected token whose last argument is true — presumably the match
    // flag, since the stemmed query token is also "jump" (TODO confirm against TokenInfo)
    assertToken(tokenList.get(1), new TokenInfo("jump", null, "<ALPHANUM>", 8, 14, 3, new int[] { 3, 3, 3, 3, 3 }, null, true));
    assertToken(tokenList.get(2), new TokenInfo("over", null, "<ALPHANUM>", 15, 19, 4, new int[] { 4, 4, 4, 4, 4 }, null, false));
    assertToken(tokenList.get(3), new TokenInfo("dog", null, "<ALPHANUM>", 24, 28, 6, new int[] { 6, 6, 6, 6, 6 }, null, false));
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) SolrInputDocument(org.apache.solr.common.SolrInputDocument) NamedList(org.apache.solr.common.util.NamedList) NamedList(org.apache.solr.common.util.NamedList) List(java.util.List) DocumentAnalysisRequest(org.apache.solr.client.solrj.request.DocumentAnalysisRequest) Test(org.junit.Test)

Aggregations

DocumentAnalysisRequest (org.apache.solr.client.solrj.request.DocumentAnalysisRequest)5 SolrInputDocument (org.apache.solr.common.SolrInputDocument)5 ContentStream (org.apache.solr.common.util.ContentStream)4 Test (org.junit.Test)4 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)3 SolrQueryRequest (org.apache.solr.request.SolrQueryRequest)3 SolrQueryRequestBase (org.apache.solr.request.SolrQueryRequestBase)3 InputStream (java.io.InputStream)1 List (java.util.List)1 XMLStreamReader (javax.xml.stream.XMLStreamReader)1 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)1 SolrInputField (org.apache.solr.common.SolrInputField)1 SolrParams (org.apache.solr.common.params.SolrParams)1 NamedList (org.apache.solr.common.util.NamedList)1