org.dmg.pmml.TextIndexNormalization Java Examples

Source File: CountVectorizerModelConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0

5 votes

/**
 * Encodes the count vectorizer as a PMML "term frequency" function and
 * returns one {@link TermFeature} per vocabulary term.
 *
 * <p>
 * A {@link TextIndex} expression over the parameters <code>document</code> and <code>term</code>
 * is registered with the encoder as a {@link DefineFunction}.
 * Every non-empty stop word set of the input document feature contributes
 * a regex-based {@link TextIndexNormalization} that replaces matches with a single space.
 * </p>
 *
 * @param encoder The encoder that supplies the input feature and receives the function definition.
 *
 * @return The term features, in vocabulary order.
 *
 * @throws IllegalArgumentException If a non-empty stop word set is combined with a word separator
 * other than <code>\s+</code> or <code>\W+</code>, or if a vocabulary term contains punctuation.
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
	CountVectorizerModel model = getTransformer();

	DocumentFeature document = (DocumentFeature)encoder.getOnlyFeature(model.getInputCol());

	ParameterField documentParam = new ParameterField(FieldName.create("document"));
	ParameterField termParam = new ParameterField(FieldName.create("term"));

	TextIndex textIndex = new TextIndex(documentParam.getName(), new FieldRef(termParam.getName()));
	textIndex.setTokenize(Boolean.TRUE);
	textIndex.setWordSeparatorCharacterRE(document.getWordSeparatorRE());

	// Binary mode is stated explicitly; otherwise the attribute is left unset
	TextIndex.LocalTermWeights localTermWeights = null;
	if(model.getBinary()){
		localTermWeights = TextIndex.LocalTermWeights.BINARY;
	}
	textIndex.setLocalTermWeights(localTermWeights);

	for(DocumentFeature.StopWordSet stopWords : document.getStopWordSets()){

		if(!stopWords.isEmpty()){
			String separatorRE = document.getWordSeparatorRE();

			String stopWordRE;

			// The stop word pattern must be anchored with the same delimiters that the tokenizer splits on
			switch(separatorRE){
				case "\\s+":
					stopWordRE = "(^|\\s+)\\p{Punct}*(" + JOINER.join(stopWords) + ")\\p{Punct}*(\\s+|$)";
					break;
				case "\\W+":
					stopWordRE = "(\\W+)(" + JOINER.join(stopWords) + ")(\\W+)";
					break;
				default:
					throw new IllegalArgumentException("Expected \"\\s+\" or \"\\W+\" as splitter regex pattern, got \"" + separatorRE + "\"");
			}

			// Single-row table: the stop word regex is replaced with a single space
			Map<String, List<String>> columns = new LinkedHashMap<>();
			columns.put("string", Collections.singletonList(stopWordRE));
			columns.put("stem", Collections.singletonList(" "));
			columns.put("regex", Collections.singletonList("true"));

			TextIndexNormalization stopWordRemoval = new TextIndexNormalization(null, PMMLUtil.createInlineTable(columns));
			stopWordRemoval.setCaseSensitive(stopWords.isCaseSensitive());
			// Handles consecutive matches. See http://stackoverflow.com/a/25085385
			stopWordRemoval.setRecursive(Boolean.TRUE);

			textIndex.addTextIndexNormalizations(stopWordRemoval);
		}
	}

	// The sequence number sets this function's name apart from the names of earlier ones
	String functionName = "tf@" + CountVectorizerModelConverter.SEQUENCE.getAndIncrement();

	DefineFunction defineFunction = new DefineFunction(functionName, OpType.CONTINUOUS, DataType.INTEGER, null, textIndex);
	defineFunction.addParameterFields(documentParam, termParam);

	encoder.addDefineFunction(defineFunction);

	List<Feature> features = new ArrayList<>();

	for(String term : model.vocabulary()){

		if(TermUtil.hasPunctuation(term)){
			throw new IllegalArgumentException("Punctuated vocabulary terms (" + term + ") are not supported");
		}

		features.add(new TermFeature(encoder, defineFunction, document, term));
	}

	return features;
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

5 votes

/**
 * Applies all normalization steps of the text index to the string.
 *
 * <p>
 * The steps are applied in list order; the output of one step is the input of the next.
 * </p>
 *
 * @param textIndex The text index whose normalization steps are applied.
 * @param string The text to normalize.
 *
 * @return The normalized text, or the original text if the text index declares no normalization steps.
 */
static
public String normalize(TextIndex textIndex, String string){

	if(!textIndex.hasTextIndexNormalizations()){
		return string;
	}

	String result = string;

	for(TextIndexNormalization step : textIndex.getTextIndexNormalizations()){
		result = TextUtil.normalize(textIndex, step, result);
	}

	return result;
}

Source File: ExpressionUtilTest.java From jpmml-evaluator with GNU Affero General Public License v3.0

5 votes

/**
 * Checks that two chained normalization steps reduce a free-form sentence
 * to the term <code>ui_good</code>, so that a binary text index yields <code>1</code>.
 */
@Test
public void evaluateTextIndexNormalization(){
	FieldName name = FieldName.create("x");

	// Step one keeps the default column names; each row is (pattern, replacement, regex flag)
	TextIndexNormalization stepOne = new TextIndexNormalization();

	List<List<String>> stepOneRows = Arrays.asList(
		Arrays.asList("interfaces?", "interface", "true"),
		Arrays.asList("is|are|seem(ed|s?)|were", "be", "true"),
		Arrays.asList("user friendl(y|iness)", "user_friendly", "true")
	);

	stepOne.setInlineTable(createInlineTable(stepOneRows, stepOne));

	// Step two renames its columns; this must happen before its table is built from them
	TextIndexNormalization stepTwo = new TextIndexNormalization();
	stepTwo.setInField("re");
	stepTwo.setOutField("feature");

	List<List<String>> stepTwoRows = Arrays.asList(
		Arrays.asList("interface be (user_friendly|well designed|excellent)", "ui_good", "true")
	);

	stepTwo.setInlineTable(createInlineTable(stepTwoRows, stepTwo));

	TextIndex textIndex = new TextIndex(name, new Constant("ui_good"));
	textIndex.setLocalTermWeights(TextIndex.LocalTermWeights.BINARY);
	textIndex.setCaseSensitive(false);
	textIndex.addTextIndexNormalizations(stepOne, stepTwo);

	String text = "Testing the app for a few days convinced me the interfaces are excellent!";

	assertEquals(1, evaluate(textIndex, name, text));
}

Source File: CountVectorizer.java From jpmml-sklearn with GNU Affero General Public License v3.0

4 votes

/**
 * Builds the PMML "term frequency" function of this vectorizer.
 *
 * <p>
 * The function wraps a {@link TextIndex} expression over the parameters <code>document</code> and <code>term</code>.
 * If stop words are defined and the n-gram range differs from <code>(1, 1)</code>,
 * then a regex-based {@link TextIndexNormalization} is added that replaces stop words with a single space.
 * </p>
 *
 * @return A new function definition. The name ends with a sequence number,
 * so that each invocation produces a different name.
 *
 * @throws IllegalArgumentException If the analyzer is not <code>word</code>,
 * or if a preprocessor or an accent stripping mode is set.
 */
public DefineFunction encodeDefineFunction(){
	String analyzer = getAnalyzer();
	List<String> stopWords = getStopWords();
	Object[] nGramRange = getNGramRange();
	Boolean binary = getBinary();
	Object preprocessor = getPreprocessor();
	String stripAccents = getStripAccents();
	Splitter tokenizer = getTokenizer();

	// Reject unsupported configurations with messages that name the offending attribute
	switch(analyzer){
		case "word":
			break;
		default:
			throw new IllegalArgumentException("Analyzer \"" + analyzer + "\" is not supported, expected \"word\"");
	}

	if(preprocessor != null){
		throw new IllegalArgumentException("Custom preprocessors are not supported");
	} // End if

	if(stripAccents != null){
		throw new IllegalArgumentException("Accent stripping (\"" + stripAccents + "\") is not supported");
	}

	ParameterField documentField = new ParameterField(FieldName.create("document"));

	ParameterField termField = new ParameterField(FieldName.create("term"));

	TextIndex textIndex = new TextIndex(documentField.getName(), new FieldRef(termField.getName()))
		.setTokenize(Boolean.TRUE)
		.setWordSeparatorCharacterRE(tokenizer.getSeparatorRE())
		.setLocalTermWeights(binary ? TextIndex.LocalTermWeights.BINARY : null);

	boolean hasStopWords = (stopWords != null && !stopWords.isEmpty());

	// The stop word normalization is only added for n-gram ranges other than (1, 1)
	if(hasStopWords && !Arrays.equals(nGramRange, new Integer[]{1, 1})){
		// Single-row table: the stop word regex is replaced with a single space
		Map<String, List<String>> data = new LinkedHashMap<>();
		data.put("string", Collections.singletonList("(^|\\s+)\\p{Punct}*(" + JOINER.join(stopWords) + ")\\p{Punct}*(\\s+|$)"));
		data.put("stem", Collections.singletonList(" "));
		data.put("regex", Collections.singletonList("true"));

		TextIndexNormalization textIndexNormalization = new TextIndexNormalization(null, PMMLUtil.createInlineTable(data))
			.setRecursive(Boolean.TRUE); // Handles consecutive matches. See http://stackoverflow.com/a/25085385

		textIndex.addTextIndexNormalizations(textIndexNormalization);
	}

	String name = functionName() + "@" + CountVectorizer.SEQUENCE.getAndIncrement();

	DefineFunction defineFunction = new DefineFunction(name, OpType.CONTINUOUS, DataType.DOUBLE, null, textIndex)
		.addParameterFields(documentField, termField);

	return defineFunction;
}

Source File: TextUtil.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Applies a single normalization step to the string.
 *
 * <p>
 * The tokenization flag, word separator, case sensitivity and maximum Levenshtein distance
 * are taken from the normalization step; where the step leaves one of them unset,
 * the value of the enclosing text index is used instead.
 * </p>
 *
 * @param textIndex The enclosing text index, which supplies the fallback settings.
 * @param textIndexNormalization The normalization step to apply.
 * @param string The text to normalize.
 *
 * @return The normalized text, or the original text if the step has no inline table.
 *
 * @throws InvalidAttributeException If the effective maximum Levenshtein distance is negative.
 * @throws PMMLException If the table-based normalization fails;
 * the normalization step is attached to the exception as context.
 */
static
public String normalize(TextIndex textIndex, TextIndexNormalization textIndexNormalization, String string){
	Boolean tokenize = textIndexNormalization.isTokenize();
	if(tokenize == null){
		tokenize = textIndex.isTokenize();
	}

	TextTokenizer tokenizer = null;

	if(tokenize){
		// Remember which element declared the separator, so that a regex compilation problem can be attributed to it
		PMMLObject separatorOwner = textIndexNormalization;

		String separatorRE = textIndexNormalization.getWordSeparatorCharacterRE();
		if(separatorRE == null){
			separatorOwner = textIndex;

			separatorRE = textIndex.getWordSeparatorCharacterRE();
		}

		tokenizer = new TextTokenizer(RegExUtil.compile(separatorRE, separatorOwner));
	}

	Boolean caseSensitive = textIndexNormalization.isCaseSensitive();
	if(caseSensitive == null){
		caseSensitive = textIndex.isCaseSensitive();
	}

	// A negative distance is reported against the element that declared it
	Integer maxLevenshteinDistance = textIndexNormalization.getMaxLevenshteinDistance();
	if(maxLevenshteinDistance != null){

		if(maxLevenshteinDistance < 0){
			throw new InvalidAttributeException(textIndexNormalization, PMMLAttributes.TEXTINDEXNORMALIZATION_MAXLEVENSHTEINDISTANCE, maxLevenshteinDistance);
		}
	} else

	{
		maxLevenshteinDistance = textIndex.getMaxLevenshteinDistance();

		if(maxLevenshteinDistance < 0){
			throw new InvalidAttributeException(textIndex, PMMLAttributes.TEXTINDEX_MAXLEVENSHTEINDISTANCE, maxLevenshteinDistance);
		}
	}

	InlineTable inlineTable = InlineTableUtil.getInlineTable(textIndexNormalization);
	if(inlineTable == null){
		return string;
	}

	String inField = textIndexNormalization.getInField();
	String outField = textIndexNormalization.getOutField();
	String regexField = textIndexNormalization.getRegexField();

	for(;;){
		String normalizedString;

		try {
			normalizedString = normalize(inlineTable, inField, outField, regexField, string, tokenizer, caseSensitive, maxLevenshteinDistance);
		} catch(PMMLException pe){
			throw pe.ensureContext(textIndexNormalization);
		}

		// "If the recursive flag is set to true, then the normalization table is reapplied until none of its rows causes a change to the input text."
		if(!textIndexNormalization.isRecursive() || (normalizedString).equals(string)){
			return normalizedString;
		}

		// The last pass changed the text; feed the result back in
		string = normalizedString;
	}
}

Source File: ExpressionUtilTest.java From jpmml-evaluator with GNU Affero General Public License v3.0

4 votes

/**
 * Creates an inline table whose column names are the input, output and regex field names
 * of the normalization step, in that order.
 *
 * @param rows The table rows; each row lists its cells in the column order stated above.
 * @param textIndexNormalization The normalization step that supplies the column names.
 *
 * @return The inline table.
 */
static
InlineTable createInlineTable(List<List<String>> rows, TextIndexNormalization textIndexNormalization){
	String inField = textIndexNormalization.getInField();
	String outField = textIndexNormalization.getOutField();
	String regexField = textIndexNormalization.getRegexField();

	return createInlineTable(rows, Arrays.asList(inField, outField, regexField));
}

org.dmg.pmml.TextIndexNormalization Java Examples