org.apache.lucene.analysis.tokenattributes.PayloadAttribute Java Examples
The following examples show how to use
org.apache.lucene.analysis.tokenattributes.PayloadAttribute.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestConcatenatingTokenStream.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Verifies that ConcatenatingTokenStream exposes the union of the attributes
 * registered on its inputs: one source declares PayloadAttribute, the other
 * FlagsAttribute, and the concatenated stream must report both, while still
 * producing the tokens of both sources with correctly shifted offsets.
 */
public void testInconsistentAttributes() throws IOException {
  AttributeFactory attributeFactory = newAttributeFactory();

  final MockTokenizer firstSource = new MockTokenizer(attributeFactory, MockTokenizer.WHITESPACE, false);
  firstSource.setReader(new StringReader("first words "));
  firstSource.addAttribute(PayloadAttribute.class);

  final MockTokenizer secondSource = new MockTokenizer(attributeFactory, MockTokenizer.WHITESPACE, false);
  secondSource.setReader(new StringReader("second words"));
  secondSource.addAttribute(FlagsAttribute.class);

  TokenStream concatenated = new ConcatenatingTokenStream(firstSource, secondSource);
  assertTrue(concatenated.hasAttribute(FlagsAttribute.class));
  assertTrue(concatenated.hasAttribute(PayloadAttribute.class));

  assertTokenStreamContents(concatenated,
      new String[] { "first", "words", "second", "words" },
      new int[] { 0, 6, 12, 19 },
      new int[] { 5, 11, 18, 24 });
}
Example #2
Source File: DelimitedPayloadTokenFilterTest.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Consumes the next token from {@code stream} and asserts both its term text
 * and its payload bytes.
 *
 * @param expected  expected term text of the next token
 * @param stream    stream to advance (exactly one token is consumed)
 * @param expectPay expected payload bytes, or null if the token must carry no payload
 */
void assertTermEquals(String expected, TokenStream stream, byte[] expectPay) throws Exception {
  CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
  PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class);
  assertTrue(stream.incrementToken());
  assertEquals(expected, termAtt.toString());
  BytesRef payload = payloadAtt.getPayload();
  if (payload != null) {
    // FIX: fail with a clear assertion instead of an NPE when a payload is
    // present but none was expected (the original dereferenced expectPay.length
    // while building the failure message before any null check).
    assertNotNull("token has payload " + payload + " but none was expected", expectPay);
    assertTrue(payload.length + " does not equal: " + expectPay.length,
        payload.length == expectPay.length);
    for (int i = 0; i < expectPay.length; i++) {
      assertTrue(expectPay[i] + " does not equal: " + payload.bytes[i + payload.offset],
          expectPay[i] == payload.bytes[i + payload.offset]);
    }
  } else {
    assertTrue("expectPay is not null and it should be", expectPay == null);
  }
}
Example #3
Source File: DelimitedPayloadTokenFilterTest.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Checks DelimitedPayloadTokenFilter with an IntegerEncoder: a term carrying a
 * "|n" suffix gets n as an encoded int payload; bare terms get no payload.
 */
public void testIntEncoding() throws Exception {
  String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
  DelimitedPayloadTokenFilter filter =
      new DelimitedPayloadTokenFilter(whitespaceMockTokenizer(test), '|', new IntegerEncoder());
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
  filter.reset();

  // Expected (term, payload) pairs in stream order; null means "no payload".
  String[] expectedTerms = { "The", "quick", "red", "fox", "jumped", "over", "the", "lazy", "brown", "dogs" };
  byte[][] expectedPayloads = {
      null,
      PayloadHelper.encodeInt(1),
      PayloadHelper.encodeInt(2),
      PayloadHelper.encodeInt(3),
      null,
      null,
      null,
      PayloadHelper.encodeInt(5),
      PayloadHelper.encodeInt(99),
      PayloadHelper.encodeInt(83)
  };
  for (int i = 0; i < expectedTerms.length; i++) {
    assertTermEquals(expectedTerms[i], filter, termAtt, payAtt, expectedPayloads[i]);
  }

  assertFalse(filter.incrementToken());
  filter.end();
  filter.close();
}
Example #4
Source File: DelimitedPayloadTokenFilterTest.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Checks DelimitedPayloadTokenFilter with a FloatEncoder: a term carrying a
 * "|x.y" suffix gets that value as an encoded float payload; bare terms get none.
 */
public void testFloatEncoding() throws Exception {
  String test = "The quick|1.0 red|2.0 fox|3.5 jumped|0.5 over the lazy|5 brown|99.3 dogs|83.7";
  DelimitedPayloadTokenFilter filter =
      new DelimitedPayloadTokenFilter(whitespaceMockTokenizer(test), '|', new FloatEncoder());
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
  filter.reset();

  // Expected (term, payload) pairs in stream order; null means "no payload".
  String[] expectedTerms = { "The", "quick", "red", "fox", "jumped", "over", "the", "lazy", "brown", "dogs" };
  byte[][] expectedPayloads = {
      null,
      PayloadHelper.encodeFloat(1.0f),
      PayloadHelper.encodeFloat(2.0f),
      PayloadHelper.encodeFloat(3.5f),
      PayloadHelper.encodeFloat(0.5f),
      null,
      null,
      PayloadHelper.encodeFloat(5.0f),
      PayloadHelper.encodeFloat(99.3f),
      PayloadHelper.encodeFloat(83.7f)
  };
  for (int i = 0; i < expectedTerms.length; i++) {
    assertTermEquals(expectedTerms[i], filter, termAtt, payAtt, expectedPayloads[i]);
  }

  assertFalse(filter.incrementToken());
  filter.end();
  filter.close();
}
Example #5
Source File: DelimitedPayloadTokenFilterTest.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Checks DelimitedPayloadTokenFilter with the default '|' delimiter and an
 * IdentityEncoder: the text after the delimiter (a POS tag here) becomes the
 * token's payload verbatim (UTF-8 bytes); undelimited terms get no payload.
 */
public void testPayloads() throws Exception {
  String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
  DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(
      whitespaceMockTokenizer(test),
      DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
  filter.reset();

  // Expected (term, POS-tag payload) pairs in stream order; null means "no payload".
  String[] expectedTerms = { "The", "quick", "red", "fox", "jumped", "over", "the", "lazy", "brown", "dogs" };
  String[] expectedTags = { null, "JJ", "JJ", "NN", "VB", null, null, "JJ", "JJ", "NN" };
  for (int i = 0; i < expectedTerms.length; i++) {
    byte[] pay = expectedTags[i] == null ? null : expectedTags[i].getBytes(StandardCharsets.UTF_8);
    assertTermEquals(expectedTerms[i], filter, termAtt, payAtt, pay);
  }

  assertFalse(filter.incrementToken());
  filter.end();
  filter.close();
}
Example #6
Source File: TestDelimitedPayloadTokenFilterFactory.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Factory test: a DelimitedPayload filter configured with a '*' delimiter and
 * the float encoder must attach a 0.1f payload to every token of the input.
 */
public void testDelim() throws Exception {
  Reader input = new StringReader("the*0.1 quick*0.1 red*0.1");
  TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer) ts).setReader(input);
  ts = tokenFilterFactory("DelimitedPayload", "encoder", "float", "delimiter", "*").create(ts);
  ts.reset();
  while (ts.incrementToken()) {
    PayloadAttribute payloadAttr = ts.getAttribute(PayloadAttribute.class);
    assertNotNull(payloadAttr);
    byte[] bytes = payloadAttr.getPayload().bytes;
    assertNotNull(bytes);
    assertEquals(0.1f, PayloadHelper.decodeFloat(bytes), 0.0f);
  }
  ts.end();
  ts.close();
}
Example #7
Source File: TestDelimitedPayloadTokenFilterFactory.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Factory test: with only the "float" encoder configured, the default '|'
 * delimiter applies and every token of the input carries a 0.1f payload.
 */
public void testEncoder() throws Exception {
  Reader input = new StringReader("the|0.1 quick|0.1 red|0.1");
  TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  ((Tokenizer) ts).setReader(input);
  ts = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(ts);
  ts.reset();
  while (ts.incrementToken()) {
    PayloadAttribute payloadAttr = ts.getAttribute(PayloadAttribute.class);
    assertNotNull(payloadAttr);
    byte[] bytes = payloadAttr.getPayload().bytes;
    assertNotNull(bytes);
    assertEquals(0.1f, PayloadHelper.decodeFloat(bytes), 0.0f);
  }
  ts.end();
  ts.close();
}
Example #8
Source File: TestNGramFilters.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Test EdgeNGramFilterFactory on tokens with payloads: every n-gram produced
 * from "test|0.1" must still carry the original 0.1f float payload.
 */
public void testEdgeNGramFilterPayload() throws Exception {
  Reader input = new StringReader("test|0.1");
  TokenStream ts = whitespaceMockTokenizer(input);
  ts = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(ts);
  ts = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(ts);
  ts.reset();
  while (ts.incrementToken()) {
    PayloadAttribute payloadAttr = ts.getAttribute(PayloadAttribute.class);
    assertNotNull(payloadAttr);
    BytesRef payload = payloadAttr.getPayload();
    assertNotNull(payload);
    assertEquals(0.1f, PayloadHelper.decodeFloat(payload.bytes), 0.0f);
  }
  ts.end();
  ts.close();
}
Example #9
Source File: TestNGramFilters.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Test NGramFilterFactory on tokens with payloads: every n-gram produced
 * from "test|0.1" must still carry the original 0.1f float payload.
 */
public void testNGramFilterPayload() throws Exception {
  Reader input = new StringReader("test|0.1");
  TokenStream ts = whitespaceMockTokenizer(input);
  ts = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(ts);
  ts = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(ts);
  ts.reset();
  while (ts.incrementToken()) {
    PayloadAttribute payloadAttr = ts.getAttribute(PayloadAttribute.class);
    assertNotNull(payloadAttr);
    BytesRef payload = payloadAttr.getPayload();
    assertNotNull(payload);
    assertEquals(0.1f, PayloadHelper.decodeFloat(payload.bytes), 0.0f);
  }
  ts.end();
  ts.close();
}
Example #10
Source File: TestSnowball.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Runs SnowballFilter ("English") over a fixed TestTokenStream and checks that
 * the non-term attributes (offset, type, position increment, flags, payload)
 * pass through unchanged while the term text is stemmed to "accent".
 */
public void testFilterTokens() throws Exception {
  SnowballFilter stemmer = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute term = stemmer.getAttribute(CharTermAttribute.class);
  OffsetAttribute offset = stemmer.getAttribute(OffsetAttribute.class);
  TypeAttribute type = stemmer.getAttribute(TypeAttribute.class);
  PayloadAttribute payload = stemmer.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posInc = stemmer.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flags = stemmer.getAttribute(FlagsAttribute.class);

  stemmer.incrementToken();

  assertEquals("accent", term.toString());
  assertEquals(2, offset.startOffset());
  assertEquals(7, offset.endOffset());
  assertEquals("wrd", type.type());
  assertEquals(3, posInc.getPositionIncrement());
  assertEquals(77, flags.getFlags());
  assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payload.getPayload());
}
Example #11
Source File: SpellingQueryConverter.java From lucene-solr with Apache License 2.0 | 6 votes |
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException { TokenStream stream = analyzer.tokenStream("", text); // TODO: support custom attributes CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class); PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class); PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class); OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class); stream.reset(); while (stream.incrementToken()) { Token token = new Token(); token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset()); token.setFlags(flagsAttValue); //overwriting any flags already set... token.setType(typeAtt.type()); token.setPayload(payloadAtt.getPayload()); token.setPositionIncrement(posIncAtt.getPositionIncrement()); result.add(token); } stream.end(); stream.close(); }
Example #12
Source File: PayloadTokenizer.java From clue with Apache License 2.0 | 5 votes |
/**
 * Tokenizer over a comma-separated string: lower-cases the input, splits it on
 * commas, and registers term, payload, position-increment and offset attributes
 * up front. A 4-byte payload buffer is allocated for reuse across tokens.
 *
 * @param text the input to tokenize; also used to presize the term buffer
 * @throws IOException if setting the reader fails
 */
public PayloadTokenizer(String text) throws IOException {
  setReader(new StringReader(text));
  this.tokens = text.toLowerCase().split(",");
  termAttr = addAttribute(CharTermAttribute.class);
  termAttr.resizeBuffer(text.length()); // maximum size necessary is the size of the input
  payloadAttr = addAttribute(PayloadAttribute.class);
  // 4-byte payload buffer, presumably filled per-token in incrementToken() — TODO confirm
  payload = new BytesRef(new byte[4]);
  positionAttr = addAttribute(PositionIncrementAttribute.class);
  offsetAttr = addAttribute(OffsetAttribute.class);
}
Example #13
Source File: TestAnalyzers.java From lucene-solr with Apache License 2.0 | 5 votes |
void verifyPayload(TokenStream ts) throws IOException { PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class); ts.reset(); for(byte b=1;;b++) { boolean hasNext = ts.incrementToken(); if (!hasNext) break; // System.out.println("id="+System.identityHashCode(nextToken) + " " + t); // System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]); assertEquals(b, payloadAtt.getPayload().bytes[0]); } }
Example #14
Source File: MockPayloadAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Filter over {@code input} for the given field: resets the position and token
 * counters and caches the position-increment, payload and term attribute
 * instances it will read/write during filtering.
 */
public MockPayloadFilter(TokenStream input, String fieldName) {
  super(input);
  this.fieldName = fieldName;
  pos = 0; // running position counter
  i = 0;   // running token index
  posIncrAttr = input.addAttribute(PositionIncrementAttribute.class);
  payloadAttr = input.addAttribute(PayloadAttribute.class);
  termAttr = input.addAttribute(CharTermAttribute.class);
}
Example #15
Source File: TestPayloads.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Token stream whose term text and payload are the same random bytes drawn
 * from a shared {@code ByteArrayPool}.
 */
PoolingPayloadTokenStream(ByteArrayPool pool) {
  this.pool = pool;
  payload = pool.get();          // borrow a byte[] from the pool
  generateRandomData(payload);   // fill it with random content
  // the term text mirrors the payload bytes, decoded with the utf8 charset
  term = new String(payload, 0, payload.length, utf8);
  first = true;                  // presumably marks "token not yet emitted" — confirm in incrementToken()
  payloadAtt = addAttribute(PayloadAttribute.class);
  termAtt = addAttribute(CharTermAttribute.class);
}
Example #16
Source File: FieldInvertState.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Sets attributeSource to a new instance.
 * Attribute accessors are re-cached only when the source actually changed.
 * Note the asymmetry: term and payload attributes are *looked up* with
 * getAttribute (they stay null/absent if the source never registered them),
 * while term-frequency, position-increment and offset attributes are
 * force-registered with addAttribute.
 */
void setAttributeSource(AttributeSource attributeSource) {
  if (this.attributeSource != attributeSource) {
    this.attributeSource = attributeSource;
    termAttribute = attributeSource.getAttribute(TermToBytesRefAttribute.class);
    termFreqAttribute = attributeSource.addAttribute(TermFrequencyAttribute.class);
    posIncrAttribute = attributeSource.addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = attributeSource.addAttribute(OffsetAttribute.class);
    payloadAttribute = attributeSource.getAttribute(PayloadAttribute.class);
  }
}
Example #17
Source File: TestPayloads.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Filter over {@code in} that records the field name and per-field payload
 * configuration, and caches the payload and term attributes it will update.
 *
 * @param in          wrapped stream
 * @param fieldName   field currently being analyzed
 * @param fieldToData per-field payload configuration map
 */
public PayloadFilter(TokenStream in, String fieldName, Map<String,PayloadData> fieldToData) {
  super(in);
  this.fieldToData = fieldToData;
  this.fieldName = fieldName;
  payloadAtt = addAttribute(PayloadAttribute.class);
  termAttribute = addAttribute(CharTermAttribute.class);
}
Example #18
Source File: DelimitedPayloadTokenFilterTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Consumes the next token and asserts its term text and payload, using
 * attribute instances supplied by the caller (avoids re-lookup per call).
 *
 * @param expected  expected term text of the next token
 * @param stream    stream to advance (exactly one token is consumed)
 * @param termAtt   the stream's term attribute
 * @param payAtt    the stream's payload attribute
 * @param expectPay expected payload bytes, or null if the token must carry no payload
 */
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, PayloadAttribute payAtt, byte[] expectPay) throws Exception {
  assertTrue(stream.incrementToken());
  assertEquals(expected, termAtt.toString());
  BytesRef payload = payAtt.getPayload();
  if (payload != null) {
    // FIX: fail with a clear assertion instead of an NPE when a payload is
    // present but none was expected (the original dereferenced expectPay.length
    // while building the failure message before any null check).
    assertNotNull("token has payload " + payload + " but none was expected", expectPay);
    assertTrue(payload.length + " does not equal: " + expectPay.length,
        payload.length == expectPay.length);
    for (int i = 0; i < expectPay.length; i++) {
      assertTrue(expectPay[i] + " does not equal: " + payload.bytes[i + payload.offset],
          expectPay[i] == payload.bytes[i + payload.offset]);
    }
  } else {
    assertTrue("expectPay is not null and it should be", expectPay == null);
  }
}
Example #19
Source File: TestPayloadSpanUtil.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Test filter set-up: seeds the term sets used during filtering — "xx"/"one"
 * go into {@code entities}, "nopayload"/"np" into {@code nopayload}
 * (presumably terms that must stay payload-free; confirm in incrementToken) —
 * and caches the attributes the filter mutates.
 */
public PayloadFilter(TokenStream input) {
  super(input);
  pos = 0; // running position counter
  entities.add("xx");
  entities.add("one");
  nopayload.add("nopayload");
  nopayload.add("np");
  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  payloadAtt = addAttribute(PayloadAttribute.class);
}
Example #20
Source File: SimpleQueryConverter.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Whitespace-tokenizes the query and converts every emitted token into a
 * spellchecker Token, copying term text, offsets, flags, payload, position
 * increment and type. The analyzer and stream are closed via
 * try-with-resources; IOException is wrapped as unchecked because the
 * interface does not declare it.
 */
@Override
public Collection<Token> convert(String origQuery) {
  Collection<Token> result = new HashSet<>();
  try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
       TokenStream ts = analyzer.tokenStream("", origQuery)) {
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      Token tok = new Token();
      tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      tok.setFlags(flagsAtt.getFlags());
      tok.setPayload(payloadAtt.getPayload());
      tok.setPositionIncrement(posIncAtt.getPositionIncrement());
      tok.setType(typeAtt.type());
      result.add(tok);
    }
    ts.end();
    return result;
  } catch (IOException e) {
    // analysis of an in-memory string should not fail; surface as unchecked
    throw new RuntimeException(e);
  }
}
Example #21
Source File: TestPayloadSpans.java From lucene-solr with Apache License 2.0 | 5 votes |
public PayloadFilter(TokenStream input) { super(input); pos = 0; entities.add("xx"); entities.add("one"); nopayload.add("nopayload"); nopayload.add("np"); termAtt = addAttribute(CharTermAttribute.class); posIncrAtt = addAttribute(PositionIncrementAttribute.class); payloadAtt = addAttribute(PayloadAttribute.class); }
Example #22
Source File: SimplePreAnalyzedParser.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override public String toFormattedString(Field f) throws IOException { StringBuilder sb = new StringBuilder(); sb.append(VERSION + " "); if (f.fieldType().stored()) { String s = f.stringValue(); if (s != null) { // encode the equals sign s = s.replaceAll("=", "\\="); sb.append('='); sb.append(s); sb.append('='); } } TokenStream ts = f.tokenStreamValue(); if (ts != null) { StringBuilder tok = new StringBuilder(); boolean next = false; while (ts.incrementToken()) { if (next) { sb.append(' '); } else { next = true; } tok.setLength(0); Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator(); String cTerm = null; String tTerm = null; while (it.hasNext()) { Class<? extends Attribute> cl = it.next(); Attribute att = ts.getAttribute(cl); if (att == null) { continue; } if (cl.isAssignableFrom(CharTermAttribute.class)) { CharTermAttribute catt = (CharTermAttribute)att; cTerm = escape(catt.buffer(), catt.length()); } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) { TermToBytesRefAttribute tatt = (TermToBytesRefAttribute)att; char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray(); tTerm = escape(tTermChars, tTermChars.length); } else { if (tok.length() > 0) tok.append(','); if (cl.isAssignableFrom(FlagsAttribute.class)) { tok.append("f=").append(Integer.toHexString(((FlagsAttribute) att).getFlags())); } else if (cl.isAssignableFrom(OffsetAttribute.class)) { tok.append("s=").append(((OffsetAttribute) att).startOffset()).append(",e=").append(((OffsetAttribute) att).endOffset()); } else if (cl.isAssignableFrom(PayloadAttribute.class)) { BytesRef p = ((PayloadAttribute)att).getPayload(); if (p != null && p.length > 0) { tok.append("p=").append(bytesToHex(p.bytes, p.offset, p.length)); } else if (tok.length() > 0) { tok.setLength(tok.length() - 1); // remove the last comma } } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) { tok.append("i=").append(((PositionIncrementAttribute) att).getPositionIncrement()); } else if 
(cl.isAssignableFrom(TypeAttribute.class)) { tok.append("y=").append(escape(((TypeAttribute) att).type())); } else { tok.append(cl.getName()).append('=').append(escape(att.toString())); } } } String term = null; if (cTerm != null) { term = cTerm; } else { term = tTerm; } if (term != null && term.length() > 0) { if (tok.length() > 0) { tok.insert(0, term + ","); } else { tok.insert(0, term); } } sb.append(tok); } } return sb.toString(); }
Example #23
Source File: SimplePreAnalyzedParser.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Builds a captured AttributeSource.State for one parsed token: copies the
 * term text, applies each parsed key=value attribute
 * (i=position increment, s=start offset, e=end offset, y=type,
 * f=flags in hex, p=payload in hex), then sets the offsets and captures the
 * state, clearing the source before and after for reuse.
 *
 * @param a        attribute source to populate (cleared before and after)
 * @param state    parsed token (text plus attribute key/value map)
 * @param tokenEnd default end offset; start defaults to end minus token length
 * @return the captured state representing this token
 */
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
  a.clearAttributes();
  CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
  char[] tokChars = state.token.toString().toCharArray();
  termAtt.copyBuffer(tokChars, 0, tokChars.length);
  int tokenStart = tokenEnd - state.token.length();
  for (Entry<String, String> e : state.attr.entrySet()) {
    String k = e.getKey();
    if (k.equals("i")) {
      // position increment
      int incr = Integer.parseInt(e.getValue());
      PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class);
      posIncr.setPositionIncrement(incr);
    } else if (k.equals("s")) {
      // explicit start offset overrides the computed default
      tokenStart = Integer.parseInt(e.getValue());
    } else if (k.equals("e")) {
      // explicit end offset overrides the tokenEnd argument
      tokenEnd = Integer.parseInt(e.getValue());
    } else if (k.equals("y")) {
      TypeAttribute type = a.addAttribute(TypeAttribute.class);
      type.setType(e.getValue());
    } else if (k.equals("f")) {
      FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
      int f = Integer.parseInt(e.getValue(), 16); // flags are serialized as hex
      flags.setFlags(f);
    } else if (k.equals("p")) {
      PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
      byte[] data = hexToBytes(e.getValue()); // payload is serialized as hex
      if (data != null && data.length > 0) {
        p.setPayload(new BytesRef(data));
      }
    } else {
      // unknown attribute
    }
  }
  // handle offset attr
  OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
  offset.setOffset(tokenStart, tokenEnd);
  State resState = a.captureState();
  a.clearAttributes();
  return resState;
}
Example #24
Source File: JsonPreAnalyzedParser.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Serializes a pre-analyzed field to the JSON format: a map holding the format
 * version, optional stored string and Base64 binary values, and a list of
 * per-token maps whose keys encode flags (hex), offsets, payload (Base64),
 * position increment, type and the term text.
 *
 * @param f field whose stored value and/or token stream is serialized
 * @return JSON string representation
 */
@Override
public String toFormattedString(Field f) throws IOException {
  Map<String,Object> map = new LinkedHashMap<>();
  map.put(VERSION_KEY, VERSION);
  if (f.fieldType().stored()) {
    String stringValue = f.stringValue();
    if (stringValue != null) {
      map.put(STRING_KEY, stringValue);
    }
    BytesRef binaryValue = f.binaryValue();
    if (binaryValue != null) {
      // binary stored value is Base64-encoded to stay valid JSON text
      map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    List<Map<String,Object>> tokens = new LinkedList<>();
    while (ts.incrementToken()) {
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null; // term text taken from CharTermAttribute
      String tTerm = null; // term text recovered from TermToBytesRefAttribute
      Map<String,Object> tok = new TreeMap<>();
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute)att;
          cTerm = new String(catt.buffer(), 0, catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute)att;
          tTerm = tatt.getBytesRef().utf8ToString();
        } else {
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute)att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.put(OFFSET_START_KEY, ((OffsetAttribute)att).startOffset());
            tok.put(OFFSET_END_KEY, ((OffsetAttribute)att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute)att).getPayload();
            if (p != null && p.length > 0) {
              // payload bytes are Base64-encoded; empty payloads are omitted
              tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.put(POSINCR_KEY, ((PositionIncrementAttribute)att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.put(TYPE_KEY, ((TypeAttribute)att).type());
          } else {
            // unknown attribute: stored under its class name
            tok.put(cl.getName(), att.toString());
          }
        }
      }
      // prefer the char-term form of the term text over the bytes-derived one
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        tok.put(TOKEN_KEY, term);
      }
      tokens.add(tok);
    }
    map.put(TOKENS_KEY, tokens);
  }
  return JSONUtil.toJSON(map, -1);
}
Example #25
Source File: Token.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Reports this token's attributes to the reflector: the inherited attributes
 * first, then the flags and payload values this class adds.
 */
@Override
public void reflectWith(AttributeReflector reflector) {
  super.reflectWith(reflector);
  reflector.reflect(FlagsAttribute.class, "flags", flags);
  reflector.reflect(PayloadAttribute.class, "payload", payload);
}
Example #26
Source File: Token.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Copies this token's state into {@code target}: inherited state first, then
 * flags, then a deep copy of the payload so source and target do not share
 * the underlying byte array.
 */
@Override
public void copyTo(AttributeImpl target) {
  super.copyTo(target);
  ((FlagsAttribute) target).setFlags(flags);
  ((PayloadAttribute) target).setPayload((payload == null) ? null : BytesRef.deepCopyOf(payload));
}
Example #27
Source File: MtasPreAnalyzedParser.java From mtas with Apache License 2.0 | 4 votes |
/**
 * Reads the whole serialized MtasUpdateRequestProcessorResult from
 * {@code reader}, then replays each result item as a captured attribute state
 * (term, flags, position increment, payload, offsets) on {@code parent}.
 * Returns early with null stored values when the result holds no items.
 * NOTE(review): the trailing IOException from the result reader is logged at
 * debug level and swallowed — the partially filled result is still returned.
 */
@Override
public ParseResult parse(Reader reader, AttributeSource parent)
    throws IOException {
  ParseResult res = new ParseResult();
  // get MtasUpdateRequestProcessorResult
  StringBuilder sb = new StringBuilder();
  char[] buf = new char[128];
  int cnt;
  // slurp the reader's full contents into sb
  while ((cnt = reader.read(buf)) > 0) {
    sb.append(buf, 0, cnt);
  }
  Iterator<MtasUpdateRequestProcessorResultItem> iterator;
  try (
      MtasUpdateRequestProcessorResultReader result = new MtasUpdateRequestProcessorResultReader(
          sb.toString());) {
    iterator = result.getIterator();
    if (iterator != null && iterator.hasNext()) {
      res.str = result.getStoredStringValue();
      res.bin = result.getStoredBinValue();
    } else {
      // no items: return an empty result (close() here is redundant with
      // try-with-resources but harmless)
      res.str = null;
      res.bin = null;
      result.close();
      return res;
    }
    parent.clearAttributes();
    while (iterator.hasNext()) {
      MtasUpdateRequestProcessorResultItem item = iterator.next();
      // each attribute is only registered/filled when the item provides it
      if (item.tokenTerm != null) {
        CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
        catt.append(item.tokenTerm);
      }
      if (item.tokenFlags != null) {
        FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
        flags.setFlags(item.tokenFlags);
      }
      if (item.tokenPosIncr != null) {
        PositionIncrementAttribute patt = parent
            .addAttribute(PositionIncrementAttribute.class);
        patt.setPositionIncrement(item.tokenPosIncr);
      }
      if (item.tokenPayload != null) {
        PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
        p.setPayload(new BytesRef(item.tokenPayload));
      }
      if (item.tokenOffsetStart != null && item.tokenOffsetEnd != null) {
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        offset.setOffset(item.tokenOffsetStart, item.tokenOffsetEnd);
      }
      // capture state and add to result
      State state = parent.captureState();
      res.states.add(state.clone());
      // reset for reuse
      parent.clearAttributes();
    }
  } catch (IOException e) {
    // ignore
    log.debug(e);
  }
  return res;
}
Example #28
Source File: OpenNLPTokenizer.java From jate with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Serializes the MWE metadata and stores it as this token's payload.
 */
public void addPayloadAttribute(PayloadAttribute attribute, MWEMetadata ctx) {
  attribute.setPayload(new BytesRef(MWEMetadata.serialize(ctx)));
}
Example #29
Source File: MWEFilter.java From jate with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Attaches the serialized form of {@code ctx} to the token as its payload.
 */
public void addPayloadAttribute(PayloadAttribute attribute, MWEMetadata ctx) {
  String serialized = MWEMetadata.serialize(ctx);
  attribute.setPayload(new BytesRef(serialized));
}
Example #30
Source File: BaseTermVectorsFormatTestCase.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Random token stream of {@code len} tokens drawn from the given sample terms.
 * Precomputes, per token: term text/bytes, position increments (first is >= 1),
 * monotonically derived positions, random start/end offsets, and payloads
 * (occasionally one shared payload for all tokens). Also builds lookup maps
 * from position and start offset to token indices, plus term frequencies,
 * then registers the attributes the stream will populate.
 */
public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
  terms = new String[len];
  termBytes = new BytesRef[len];
  positionsIncrements = new int[len];
  positions = new int[len];
  startOffsets = new int[len];
  endOffsets = new int[len];
  payloads = new BytesRef[len];
  for (int i = 0; i < len; ++i) {
    final int o = random().nextInt(sampleTerms.length); // pick a random sample term
    terms[i] = sampleTerms[o];
    termBytes[i] = sampleTermBytes[o];
    // first increment must be >= 1 so the first position is valid
    positionsIncrements[i] = TestUtil.nextInt(random(), i == 0 ? 1 : 0, 10);
    if (i == 0) {
      startOffsets[i] = TestUtil.nextInt(random(), 0, 1 << 16);
    } else {
      // offsets never go backwards; rarely take a large jump
      startOffsets[i] = startOffsets[i-1] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
    }
    endOffsets[i] = startOffsets[i] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
  }
  // derive absolute positions from the increments
  for (int i = 0; i < len; ++i) {
    if (i == 0) {
      positions[i] = positionsIncrements[i] - 1;
    } else {
      positions[i] = positions[i - 1] + positionsIncrements[i];
    }
  }
  if (rarely()) {
    // occasionally every token shares one payload instance
    Arrays.fill(payloads, randomPayload());
  } else {
    for (int i = 0; i < len; ++i) {
      payloads[i] = randomPayload();
    }
  }
  // index token positions and start offsets for later lookups
  positionToTerms = new HashMap<>(len);
  startOffsetToTerms = new HashMap<>(len);
  for (int i = 0; i < len; ++i) {
    if (!positionToTerms.containsKey(positions[i])) {
      positionToTerms.put(positions[i], new HashSet<Integer>(1));
    }
    positionToTerms.get(positions[i]).add(i);
    if (!startOffsetToTerms.containsKey(startOffsets[i])) {
      startOffsetToTerms.put(startOffsets[i], new HashSet<Integer>(1));
    }
    startOffsetToTerms.get(startOffsets[i]).add(i);
  }
  // term -> occurrence count
  freqs = new HashMap<>();
  for (String term : terms) {
    if (freqs.containsKey(term)) {
      freqs.put(term, freqs.get(term) + 1);
    } else {
      freqs.put(term, 1);
    }
  }
  addAttributeImpl(new PermissiveOffsetAttributeImpl());
  termAtt = addAttribute(CharTermAttribute.class);
  piAtt = addAttribute(PositionIncrementAttribute.class);
  oAtt = addAttribute(OffsetAttribute.class);
  pAtt = addAttribute(PayloadAttribute.class);
}