org.dmg.pmml.TextIndex Java Examples

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

6 votes

/**
 * Returns the tokens of the value held by this processor.
 *
 * The token list is looked up in the term token cache that {@link CacheUtil} provides for the current TextIndex element.
 * On a cache miss the string form of the value is tokenized and the result is stored in the cache.
 *
 * @return The cached or freshly computed token list.
 */
@Override
public List<String> process(){
	TextIndex index = getTextIndex();
	FieldValue term = getValue();

	Cache<FieldValue, List<String>> cache = CacheUtil.getValue(index, TextUtil.termTokenCaches, TextUtil.termTokenCacheLoader);

	List<String> cachedTokens = cache.getIfPresent(term);
	if(cachedTokens != null){
		return cachedTokens;
	}

	// Cache miss: the string is tokenized as-is (no normalization step here)
	List<String> freshTokens = TextUtil.tokenize(index, term.asString());

	cache.put(term, freshTokens);

	return freshTokens;
}

Source File: UnsupportedMarkupInspector.java From jpmml-evaluator with GNU Affero General Public License v3.0

6 votes

/**
 * Reports the TextIndex attribute values that this inspector flags as unsupported,
 * then continues with the superclass visit.
 *
 * @param textIndex The TextIndex element being visited.
 *
 * @return The action returned by the superclass visit.
 */
@Override
public VisitorAction visit(TextIndex textIndex){

	// Tokenization that is switched off gets reported
	if(!textIndex.isTokenize()){
		report(new UnsupportedAttributeException(textIndex, PMMLAttributes.TEXTINDEX_TOKENIZE, false));
	}

	// Of the local term weights, only the augmented normalized term frequency gets reported
	TextIndex.LocalTermWeights weights = textIndex.getLocalTermWeights();
	switch(weights){
		case AUGMENTED_NORMALIZED_TERM_FREQUENCY:
			report(new UnsupportedAttributeException(textIndex, weights));
			break;
	}

	return super.visit(textIndex);
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

6 votes

/**
 * Returns the tokens of the value held by this processor.
 *
 * The token list is looked up in the text token cache that {@link CacheUtil} provides for the current TextIndex element.
 * On a cache miss the string form of the value is first normalized, then tokenized, and the result is stored in the cache.
 *
 * @return The cached or freshly computed token list.
 */
@Override
public List<String> process(){
	TextIndex index = getTextIndex();
	FieldValue text = getValue();

	Cache<FieldValue, List<String>> cache = CacheUtil.getValue(index, TextUtil.textTokenCaches, TextUtil.textTokenCacheLoader);

	List<String> cachedTokens = cache.getIfPresent(text);
	if(cachedTokens != null){
		return cachedTokens;
	}

	// Cache miss: normalization is applied before tokenization
	String normalizedText = TextUtil.normalize(index, text.asString());

	List<String> freshTokens = TextUtil.tokenize(index, normalizedText);

	cache.put(text, freshTokens);

	return freshTokens;
}

Source File: ExpressionUtilTest.java From jpmml-evaluator with GNU Affero General Public License v3.0

5 votes

/**
 * Checks a TextIndex that carries two TextIndexNormalization steps:
 * the first step uses the default field names, the second step reads from the "re" field and writes to the "feature" field.
 * With binary local term weights and case-insensitive matching, the term "ui_good" is expected to evaluate to 1 for the sample sentence.
 */
@Test
public void evaluateTextIndexNormalization(){
	FieldName name = FieldName.create("x");

	// First step: three regular expression rows
	TextIndexNormalization firstStep = new TextIndexNormalization();

	List<List<String>> firstStepRows = Arrays.asList(
		Arrays.asList("interfaces?", "interface", "true"),
		Arrays.asList("is|are|seem(ed|s?)|were", "be", "true"),
		Arrays.asList("user friendl(y|iness)", "user_friendly", "true")
	);

	firstStep.setInlineTable(createInlineTable(firstStepRows, firstStep));

	// Second step: a single regular expression row, with custom input and output field names
	TextIndexNormalization secondStep = new TextIndexNormalization()
		.setInField("re")
		.setOutField("feature");

	List<List<String>> secondStepRows = Arrays.asList(
		Arrays.asList("interface be (user_friendly|well designed|excellent)", "ui_good", "true")
	);

	secondStep.setInlineTable(createInlineTable(secondStepRows, secondStep));

	TextIndex textIndex = new TextIndex(name, new Constant("ui_good"))
		.setLocalTermWeights(TextIndex.LocalTermWeights.BINARY)
		.setCaseSensitive(false)
		.addTextIndexNormalizations(firstStep, secondStep);

	assertEquals(1, evaluate(textIndex, name, "Testing the app for a few days convinced me the interfaces are excellent!"));
}

Source File: ExpressionUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

5 votes

/**
 * Evaluates a TextIndex expression.
 *
 * The text field and the term expression are evaluated first; if either of them is missing, then the result is a missing value.
 * Otherwise both values are tokenized, the term frequency is computed, and the result is created according to the local term weights.
 *
 * @param textIndex The TextIndex element.
 * @param context The evaluation context that supplies the field values.
 *
 * @return An integer value for binary and term frequency weights, a double value for logarithmic weights, or a missing value.
 *
 * @throws MissingAttributeException If the text field is not set.
 * @throws UnsupportedAttributeException If the local term weights are not one of binary, term frequency or logarithmic.
 */
public static FieldValue evaluateTextIndex(TextIndex textIndex, EvaluationContext context){
	FieldName textField = textIndex.getTextField();
	if(textField == null){
		throw new MissingAttributeException(textIndex, PMMLAttributes.TEXTINDEX_TEXTFIELD);
	}

	FieldValue text = context.evaluate(textField);
	FieldValue term = ExpressionUtil.evaluateExpressionContainer(textIndex, context);

	// See http://mantis.dmg.org/view.php?id=171
	if(FieldValueUtil.isMissing(text)){
		return FieldValues.MISSING_VALUE;
	}

	if(FieldValueUtil.isMissing(term)){
		return FieldValues.MISSING_VALUE;
	}

	List<String> textTokens = new TextUtil.TextProcessor(textIndex, text).process();
	List<String> termTokens = new TextUtil.TermProcessor(textIndex, term).process();

	int frequency = TextUtil.termFrequency(textIndex, textTokens, termTokens);

	TextIndex.LocalTermWeights weights = textIndex.getLocalTermWeights();
	switch(weights){
		case LOGARITHMIC:
			return FieldValueUtil.create(TypeInfos.CONTINUOUS_DOUBLE, Math.log10(1d + frequency));
		case BINARY:
		case TERM_FREQUENCY:
			return FieldValueUtil.create(TypeInfos.CONTINUOUS_INTEGER, frequency);
		default:
			throw new UnsupportedAttributeException(textIndex, weights);
	}
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

5 votes

/**
 * Applies all TextIndexNormalization steps of a TextIndex element to a string, in declaration order.
 * The output of each step is the input of the next one.
 *
 * @param textIndex The TextIndex element.
 * @param string The string to normalize.
 *
 * @return The normalized string, or the input string if there are no normalization steps.
 */
public static String normalize(TextIndex textIndex, String string){

	if(!textIndex.hasTextIndexNormalizations()){
		return string;
	}

	String result = string;

	for(TextIndexNormalization step : textIndex.getTextIndexNormalizations()){
		result = TextUtil.normalize(textIndex, step, result);
	}

	return result;
}

Source File: CountVectorizerModelConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0

5 votes

/**
 * Encodes the vocabulary of a CountVectorizerModel as a list of term features.
 *
 * A TextIndex-based DefineFunction with "document" and "term" parameters is registered with the encoder.
 * For every non-empty stop word set of the input document feature, a recursive regular expression normalization
 * that replaces stop word matches with a single space is added to the TextIndex element.
 *
 * @param encoder The encoder that supplies the input feature and receives the function definition.
 *
 * @return One TermFeature per vocabulary term, in vocabulary order.
 *
 * @throws IllegalArgumentException If the word separator pattern is neither "\\s+" nor "\\W+" (checked only when there is a non-empty stop word set),
 * or if a vocabulary term contains punctuation.
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
	CountVectorizerModel model = getTransformer();

	DocumentFeature document = (DocumentFeature)encoder.getOnlyFeature(model.getInputCol());

	ParameterField documentField = new ParameterField(FieldName.create("document"));
	ParameterField termField = new ParameterField(FieldName.create("term"));

	TextIndex textIndex = new TextIndex(documentField.getName(), new FieldRef(termField.getName()))
		.setTokenize(Boolean.TRUE)
		.setWordSeparatorCharacterRE(document.getWordSeparatorRE())
		.setLocalTermWeights(model.getBinary() ? TextIndex.LocalTermWeights.BINARY : null);

	for(DocumentFeature.StopWordSet stopWords : document.getStopWordSets()){

		if(!stopWords.isEmpty()){
			String separatorRE = document.getWordSeparatorRE();

			// The stop word pattern depends on how the document is split into words
			String stopWordRE;

			switch(separatorRE){
				case "\\s+":
					stopWordRE = "(^|\\s+)\\p{Punct}*(" + JOINER.join(stopWords) + ")\\p{Punct}*(\\s+|$)";
					break;
				case "\\W+":
					stopWordRE = "(\\W+)(" + JOINER.join(stopWords) + ")(\\W+)";
					break;
				default:
					throw new IllegalArgumentException("Expected \"\\s+\" or \"\\W+\" as splitter regex pattern, got \"" + separatorRE + "\"");
			}

			// Single-row table: every match is replaced with a single space
			Map<String, List<String>> columns = new LinkedHashMap<>();
			columns.put("string", Collections.singletonList(stopWordRE));
			columns.put("stem", Collections.singletonList(" "));
			columns.put("regex", Collections.singletonList("true"));

			TextIndexNormalization stopWordRemoval = new TextIndexNormalization(null, PMMLUtil.createInlineTable(columns))
				.setCaseSensitive(stopWords.isCaseSensitive())
				.setRecursive(Boolean.TRUE); // Handles consecutive matches. See http://stackoverflow.com/a/25085385

			textIndex.addTextIndexNormalizations(stopWordRemoval);
		}
	}

	String functionName = "tf" + "@" + String.valueOf(CountVectorizerModelConverter.SEQUENCE.getAndIncrement());

	DefineFunction defineFunction = new DefineFunction(functionName, OpType.CONTINUOUS, DataType.INTEGER, null, textIndex)
		.addParameterFields(documentField, termField);

	encoder.addDefineFunction(defineFunction);

	List<Feature> features = new ArrayList<>();

	for(String term : model.vocabulary()){

		if(TermUtil.hasPunctuation(term)){
			throw new IllegalArgumentException("Punctuated vocabulary terms (" + term + ") are not supported");
		}

		features.add(new TermFeature(encoder, defineFunction, document, term));
	}

	return features;
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Computes the frequency of a term within a text, using the matching options of a TextIndex element.
 *
 * The case sensitivity, the maximum Levenshtein distance, the hit counting mode and the local term weights
 * are read from the TextIndex element and translated into arguments for the low-level term frequency computation.
 *
 * @param textIndex The TextIndex element.
 * @param textTokens The tokens of the text.
 * @param termTokens The tokens of the term.
 *
 * @return The term frequency; zero if either token list is empty.
 *
 * @throws InvalidAttributeException If the maximum Levenshtein distance is negative.
 * @throws UnsupportedAttributeException If the hit counting mode or the local term weights are not handled here.
 */
public static int termFrequency(TextIndex textIndex, List<String> textTokens, List<String> termTokens){

	// Nothing to match
	if(textTokens.isEmpty() || termTokens.isEmpty()){
		return 0;
	}

	boolean matchCase = textIndex.isCaseSensitive();

	int maxDistance = textIndex.getMaxLevenshteinDistance();
	if(maxDistance < 0){
		throw new InvalidAttributeException(textIndex, PMMLAttributes.TEXTINDEX_MAXLEVENSHTEINDISTANCE, maxDistance);
	}

	TextIndex.CountHits hitsMode = textIndex.getCountHits();

	boolean bestHitsOnly;

	switch(hitsMode){
		case ALL_HITS:
			bestHitsOnly = false;
			break;
		case BEST_HITS:
			bestHitsOnly = true;
			break;
		default:
			throw new UnsupportedAttributeException(textIndex, hitsMode);
	}

	TextIndex.LocalTermWeights weights = textIndex.getLocalTermWeights();

	// Binary weights cap the frequency at one; the other handled weights leave it effectively uncapped
	int frequencyCap;

	switch(weights){
		case TERM_FREQUENCY:
		case LOGARITHMIC:
			frequencyCap = Integer.MAX_VALUE;
			break;
		case BINARY:
			frequencyCap = 1;
			break;
		default:
			throw new UnsupportedAttributeException(textIndex, weights);
	}

	try {
		return termFrequency(textTokens, termTokens, matchCase, maxDistance, bestHitsOnly, frequencyCap);
	} catch(PMMLException pe){
		// Attach the TextIndex element to exceptions raised by the low-level computation
		throw pe.ensureContext(textIndex);
	}
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Creates a term processor for the given TextIndex element and field value.
 * Both arguments are passed to the superclass constructor unchanged.
 *
 * @param textIndex The TextIndex element.
 * @param value The field value to process.
 */
TermProcessor(TextIndex textIndex, FieldValue value){
	super(textIndex, value);
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Creates a text processor for the given TextIndex element and field value.
 * Both arguments are passed to the superclass constructor unchanged.
 *
 * @param textIndex The TextIndex element.
 * @param value The field value to process.
 */
TextProcessor(TextIndex textIndex, FieldValue value){
	super(textIndex, value);
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Stores the TextIndex element in the {@code textIndex} field of this object.
 *
 * @param textIndex The TextIndex element to store.
 */
private void setTextIndex(TextIndex textIndex){
	this.textIndex = textIndex;
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Returns the TextIndex element held in the {@code textIndex} field of this object.
 *
 * @return The stored TextIndex element.
 */
public TextIndex getTextIndex(){
	return this.textIndex;
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Creates a string processor for the given TextIndex element and field value.
 *
 * @param textIndex The TextIndex element. Must not be null.
 * @param value The field value. Must not be null.
 *
 * @throws NullPointerException If either argument is null.
 */
public StringProcessor(TextIndex textIndex, FieldValue value){
	Objects.requireNonNull(textIndex);
	setTextIndex(textIndex);

	Objects.requireNonNull(value);
	setValue(value);
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Applies a single TextIndexNormalization step to a string.
 *
 * The tokenize, word separator, case sensitivity and maximum Levenshtein distance settings are taken from the
 * TextIndexNormalization element when it defines them, and from the parent TextIndex element otherwise.
 * If the step has no inline table, then the string is returned unchanged.
 *
 * @param textIndex The parent TextIndex element.
 * @param textIndexNormalization The normalization step.
 * @param string The string to normalize.
 *
 * @return The normalized string.
 *
 * @throws InvalidAttributeException If the effective maximum Levenshtein distance is negative.
 */
public static String normalize(TextIndex textIndex, TextIndexNormalization textIndexNormalization, String string){
	Boolean tokenize = textIndexNormalization.isTokenize();
	if(tokenize == null){
		tokenize = textIndex.isTokenize();
	}

	TextTokenizer tokenizer = null;

	if(tokenize){
		String separatorRE = textIndexNormalization.getWordSeparatorCharacterRE();

		// The element that supplied the pattern is handed to the pattern compiler together with the pattern
		PMMLObject patternOwner;

		if(separatorRE != null){
			patternOwner = textIndexNormalization;
		} else

		{
			patternOwner = textIndex;

			separatorRE = textIndex.getWordSeparatorCharacterRE();
		}

		tokenizer = new TextTokenizer(RegExUtil.compile(separatorRE, patternOwner));
	}

	Boolean caseSensitive = textIndexNormalization.isCaseSensitive();
	if(caseSensitive == null){
		caseSensitive = textIndex.isCaseSensitive();
	}

	Integer maxDistance = textIndexNormalization.getMaxLevenshteinDistance();
	if(maxDistance != null){

		if(maxDistance < 0){
			throw new InvalidAttributeException(textIndexNormalization, PMMLAttributes.TEXTINDEXNORMALIZATION_MAXLEVENSHTEINDISTANCE, maxDistance);
		}
	} else

	{
		maxDistance = textIndex.getMaxLevenshteinDistance();

		if(maxDistance < 0){
			throw new InvalidAttributeException(textIndex, PMMLAttributes.TEXTINDEX_MAXLEVENSHTEINDISTANCE, maxDistance);
		}
	}

	InlineTable inlineTable = InlineTableUtil.getInlineTable(textIndexNormalization);
	if(inlineTable == null){
		return string;
	}

	String inField = textIndexNormalization.getInField();
	String outField = textIndexNormalization.getOutField();
	String regexField = textIndexNormalization.getRegexField();

	String current = string;

	while(true){
		String next;

		try {
			next = normalize(inlineTable, inField, outField, regexField, current, tokenizer, caseSensitive, maxDistance);
		} catch(PMMLException pe){
			throw pe.ensureContext(textIndexNormalization);
		}

		// "If the recursive flag is set to true, then the normalization table is reapplied until none of its rows causes a change to the input text."
		if(!textIndexNormalization.isRecursive() || (next).equals(current)){
			return next;
		}

		current = next;
	}
}

Source File: ExpressionUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Dispatches the evaluation of an expression to the evaluation method that matches its element type.
 * The element types are tested one after another, in a fixed order; the first match wins.
 *
 * @param expression The expression to evaluate.
 * @param context The evaluation context.
 *
 * @return The result of the matching evaluation method.
 *
 * @throws UnsupportedElementException If the expression is none of the tested element types.
 */
static FieldValue evaluateExpression(Expression expression, EvaluationContext context){

	if(expression instanceof Constant){
		return evaluateConstant((Constant)expression);
	}

	if(expression instanceof FieldRef){
		return evaluateFieldRef((FieldRef)expression, context);
	}

	if(expression instanceof NormContinuous){
		return evaluateNormContinuous((NormContinuous)expression, context);
	}

	if(expression instanceof NormDiscrete){
		return evaluateNormDiscrete((NormDiscrete)expression, context);
	}

	if(expression instanceof Discretize){
		return evaluateDiscretize((Discretize)expression, context);
	}

	if(expression instanceof MapValues){
		return evaluateMapValues((MapValues)expression, context);
	}

	if(expression instanceof TextIndex){
		return evaluateTextIndex((TextIndex)expression, context);
	}

	if(expression instanceof Apply){
		return evaluateApply((Apply)expression, context);
	}

	if(expression instanceof Aggregate){
		return evaluateAggregate((Aggregate)expression, context);
	}

	if(expression instanceof JavaExpression){
		return evaluateJavaExpression((JavaExpression)expression, context);
	}

	// No tested element type matched
	throw new UnsupportedElementException(expression);
}

Source File: FieldReferenceFinder.java From jpmml-model with BSD 3-Clause "New" or "Revised" License

4 votes

/**
 * Passes the text field of the TextIndex element to {@code process},
 * then continues with the superclass visit.
 *
 * @param textIndex The TextIndex element being visited.
 *
 * @return The action returned by the superclass visit.
 */
@Override
public VisitorAction visit(TextIndex textIndex){
	process(textIndex.getTextField());

	return super.visit(textIndex);
}

Source File: CountVectorizer.java From jpmml-sklearn with GNU Affero General Public License v3.0

4 votes

/**
 * Encodes this vectorizer as a TextIndex-based DefineFunction that takes "document" and "term" parameters.
 *
 * Only the "word" analyzer is accepted; a preprocessor or an accent stripping setting is rejected.
 * The function name is made unique by appending a sequence number to {@code functionName()}.
 *
 * @return The function definition. It is not registered anywhere by this method.
 *
 * @throws IllegalArgumentException If the analyzer is not "word", or if a preprocessor or an accent stripping setting is present.
 */
public DefineFunction encodeDefineFunction(){
	String analyzer = getAnalyzer();
	List<String> stopWords = getStopWords();
	Object[] nGramRange = getNGramRange();
	Boolean binary = getBinary();
	Object preprocessor = getPreprocessor();
	String stripAccents = getStripAccents();
	Splitter tokenizer = getTokenizer();

	switch(analyzer){
		case "word":
			break;
		default:
			throw new IllegalArgumentException(analyzer);
	}

	if(preprocessor != null){
		// Previously thrown without any message, which made the failure impossible to diagnose
		throw new IllegalArgumentException("Custom preprocessor is not supported");
	} // End if

	if(stripAccents != null){
		throw new IllegalArgumentException(stripAccents);
	}

	ParameterField documentField = new ParameterField(FieldName.create("document"));

	ParameterField termField = new ParameterField(FieldName.create("term"));

	TextIndex textIndex = new TextIndex(documentField.getName(), new FieldRef(termField.getName()))
		.setTokenize(Boolean.TRUE)
		.setWordSeparatorCharacterRE(tokenizer.getSeparatorRE())
		.setLocalTermWeights(binary ? TextIndex.LocalTermWeights.BINARY : null);

	// The stop word removal step is added only when stop words exist and the n-gram range differs from {1, 1}.
	// NOTE(review): presumably unigram stop words never make it into the vocabulary, so no removal is needed there - confirm.
	if((stopWords != null && !stopWords.isEmpty()) && !Arrays.equals(nGramRange, new Integer[]{1, 1})){
		// Single-row table: every stop word match is replaced with a single space
		Map<String, List<String>> data = new LinkedHashMap<>();
		data.put("string", Collections.singletonList("(^|\\s+)\\p{Punct}*(" + JOINER.join(stopWords) + ")\\p{Punct}*(\\s+|$)"));
		data.put("stem", Collections.singletonList(" "));
		data.put("regex", Collections.singletonList("true"));

		TextIndexNormalization textIndexNormalization = new TextIndexNormalization(null, PMMLUtil.createInlineTable(data))
			.setRecursive(Boolean.TRUE); // Handles consecutive matches. See http://stackoverflow.com/a/25085385

		textIndex.addTextIndexNormalizations(textIndexNormalization);
	}

	String name = functionName() + "@" + String.valueOf(CountVectorizer.SEQUENCE.getAndIncrement());

	DefineFunction defineFunction = new DefineFunction(name, OpType.CONTINUOUS, DataType.DOUBLE, null, textIndex)
		.addParameterFields(documentField, termField);

	return defineFunction;
}

org.dmg.pmml.TextIndex Java Examples