edu.stanford.nlp.ling.HasWord Java Exaples

Source File: CoreNLPAnnotator.java From Stargraph with MIT License

6 votes

@Override
protected List<Word> doRun(Language language, String sentence) {
    MaxentTagger tagger = taggers.computeIfAbsent(language, lang -> {
        if (lang == EN) {
            return new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
        }
        throw new UnsupportedLanguageException(lang);
    });

    PartOfSpeechSet partOfSpeechSet = PartOfSpeechSet.getPOSSet(language);
    List<Word> words = new ArrayList<>();

    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(sentence));
    sentences.forEach(s -> {
        tagger.tagSentence(s).forEach(taggedWord ->
                words.add(new Word(partOfSpeechSet.valueOf(taggedWord.tag()), taggedWord.value())));
    });

    return words;
}

Source File: DocumentFrequencyCounter.java From wiseowl with MIT License

6 votes

/**
 * Get an IDF map for the given document string.
 *
 * @param document
 * @return
 */
private static Counter<String> getIDFMapForDocument(String document) {
  // Clean up -- remove some Gigaword patterns that slow things down
  // / don't help anything
  document = headingSeparator.matcher(document).replaceAll("");

  DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
  preprocessor.setTokenizerFactory(tokenizerFactory);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (List<HasWord> sentence : preprocessor) {
    if (sentence.size() > MAX_SENTENCE_LENGTH)
      continue;

    List<TaggedWord> tagged = tagger.tagSentence(sentence);

    for (TaggedWord w : tagged) {
      if (w.tag().startsWith("n"))
        idfMap.incrementCount(w.word());
    }
  }

  return idfMap;
}

Source File: CorenlpPipeline.java From datashare with GNU Affero General Public License v3.0

6 votes

/**
 * Part-of-Speech Classification (Maximum entropy) only
 *
 * @param input    the string to annotator
 * @param hash     the input hash code
 * @param language the input language
 */
private Annotations processPosClassifier(String input, String hash, Language language) throws InterruptedException {
    Annotations annotations = new Annotations(hash, getType(), language);
    LOGGER.info("POS-tagging for " + language.toString());

    // Split input into sentences
    final CoreNlpAnnotator<MaxentTagger> nlpAnnotator;
    nlpAnnotator = CoreNlpPosModels.getInstance().get(language);
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(input));
    for (List<HasWord> sentence : sentences) {
        // NlpTag with parts-of-speech
        List<TaggedWord> taggedSentence = nlpAnnotator.annotator.tagSentence(sentence);
        // Feed annotatopn
        for (TaggedWord word : taggedSentence) {
            int begin = word.beginPosition();
            int end = word.endPosition();
            String pos = word.tag(); // like line 157 we don't use POS tagging
            annotations.add(POS, begin, end);
        }
    }
    return annotations;
}

Source File: StanfordPosTagger.java From OpenEphyra with GNU General Public License v2.0

6 votes

/**
 * Splits the sentence into individual tokens.
 * 
 * @param sentence Input sentence
 * @return Array of tokens
 */
public static String[] tokenize(String sentence) {
	List t = MaxentTagger.tokenizeText(new StringReader(sentence));
	
	List<String> tokens = new ArrayList<String>();
	
	for (int j = 0; j < t.size(); j++) {
		Sentence s1 = (Sentence) t.get(j);
		
		for (int i = 0; i < s1.length(); i++) {
			HasWord w = s1.getHasWord(i);
			tokens.add(w.word());
		}
	}
	
	return (String[]) tokens.toArray(new String[tokens.size()]);
}

Source File: StanfordPosTagger.java From OpenEphyra with GNU General Public License v2.0

6 votes

/**
 * Tags the tokens with part of speech
 * 
 * @param tokens Array of token strings
 * @return Part of speech tags
 */
public static String[] tagPos(String[] tokens) {
	Sentence untagged = createSentence(tokens);
	Sentence tagged = MaxentTagger.tagSentence(untagged);
	
	String[] pos = new String[tagged.size()];
	for (int i = 0; i < tagged.size(); i++) {
		HasWord w = (HasWord) tagged.get(i);
		String[] s = w.toString().split("/");
		if (s.length > 1)
			pos[i] = s[s.length - 1];
		else
			pos[i] = "";
	}
	
	return pos;
}

Source File: TokenizerDemo.java From blog-codes with Apache License 2.0

5 votes

public static void main(String[] args) throws IOException {
	for (String arg : args) {
		// option #1: By sentence.
		DocumentPreprocessor dp = new DocumentPreprocessor(arg);
		for (List<HasWord> sentence : dp) {
			System.out.println(sentence);
		}
		// option #2: By token
		PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(arg), new CoreLabelTokenFactory(), "");
		while (ptbt.hasNext()) {
			CoreLabel label = ptbt.next();
			System.out.println(label);
		}
	}
}

Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

private static void usingStanfordMaxentPOS() {
        try {
            MaxentTagger tagger = new MaxentTagger(getModelDir() + "//wsj-0-18-bidirectional-distsim.tagger");
//            MaxentTagger tagger = new MaxentTagger(getModelDir() + "//gate-EN-twitter.model");
//            System.out.println(tagger.tagString("AFAIK she H8 cth!"));
//            System.out.println(tagger.tagString("BTW had a GR8 tym at the party BBIAM."));
            List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader("sentences.txt")));
            for (List<HasWord> sentence : sentences) {
                List<TaggedWord> taggedSentence = tagger.tagSentence(sentence);
                // Simple display
                System.out.println("---" + taggedSentence);
                // Simple conversion to String
//                System.out.println(Sentence.listToString(taggedSentence, false));
                // Display of words and tags
//                for (TaggedWord taggedWord : taggedSentence) {
//                    System.out.print(taggedWord.word() + "/" + taggedWord.tag() + " ");
//                }
//                System.out.println();
                // List of specifc tags
//                System.out.print("NN Tagged: ");
//                for (TaggedWord taggedWord : taggedSentence) {
//                    if (taggedWord.tag().startsWith("NN")) {
//                        System.out.print(taggedWord.word() + " ");
//                    }
//                }
//                System.out.println();
            }
        } catch (FileNotFoundException ex) {
            ex.printStackTrace();
        }
    }

Source File: CoreNLP.java From Criteria2Query with Apache License 2.0

5 votes

public List<String> splitParagraph(String paragraph){
	Reader reader = new StringReader(paragraph);
	DocumentPreprocessor dp = new DocumentPreprocessor(reader);
	List<String> sentenceList = new ArrayList<String>();
	for (List<HasWord> sentence : dp) {
		String sentenceString = SentenceUtils.listToString(sentence);
		sentenceList.add(sentenceString);
	}
	return sentenceList;
}

Source File: StanfordPOSTagger.java From jatecs with GNU General Public License v3.0

5 votes

public Vector<ArrayList<TaggedWord>> tag(String input) {
    Vector<ArrayList<TaggedWord>> returnVector = new Vector<ArrayList<TaggedWord>>();
    List<List<HasWord>> sentences = MaxentTagger
            .tokenizeText(new BufferedReader(new StringReader(input)));
    for (List<? extends HasWord> sentence : sentences) {
        returnVector.add(tagger.tagSentence(sentence));
    }
    return returnVector;
}

Source File: ParseTree.java From NLIDB with Apache License 2.0

5 votes

/**
 * Construct a parse tree using the stanford NLP parser. Only one sentence.
 * Here we are omitting the information of dependency labels (tags).
 * @param text input text.
 */
public ParseTree(String text, NLParser parser) {
	// pre-processing the input text
	DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
	List<HasWord> sentence = null;
	for (List<HasWord> sentenceHasWord : tokenizer) {
		sentence = sentenceHasWord;
		break;
	}
	// part-of-speech tagging
	List<TaggedWord> tagged = parser.tagger.tagSentence(sentence);
	// dependency syntax parsing
	GrammaticalStructure gs = parser.parser.predict(tagged);
	
	// Reading the parsed sentence into ParseTree
	int N = sentence.size()+1;
	Node[] nodes = new Node[N];
	root = new Node(0, "ROOT", "ROOT");
	nodes[0] = root;
	for (int i = 0; i < N-1; i++) {
		nodes[i+1] = new Node(i+1, 
				sentence.get(i).word(), tagged.get(i).tag());
	}
	for (TypedDependency typedDep : gs.allTypedDependencies()) {
		int from = typedDep.gov().index();
		int to   = typedDep.dep().index();
		// String label = typedDep.reln().getShortName(); // omitting the label
		nodes[to].parent = nodes[from];
		nodes[from].children.add(nodes[to]);
	}
}

Source File: ParserDemo.java From NLIDB with Apache License 2.0

5 votes

public static void main(String[] args) {
	String modelPath = DependencyParser.DEFAULT_MODEL;
	String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

	for (int argIndex = 0; argIndex < args.length;) {
		switch (args[argIndex]) {
		case "-tagger":
			taggerPath = args[argIndex + 1];
			argIndex += 2;
			break;
		case "-com.dukenlidb.nlidb.model":
			modelPath = args[argIndex + 1];
			argIndex += 2;
			break;
		default:
			throw new RuntimeException("Unknown argument " + args[argIndex]);
		}
	}

	String text = "Return authors who have more papers than Bob in VLDB after 2000";

	MaxentTagger tagger = new MaxentTagger(taggerPath);
	DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

	DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
	for (List<HasWord> sentence : tokenizer) {
		List<TaggedWord> tagged = tagger.tagSentence(sentence);
		GrammaticalStructure gs = parser.predict(tagged);

		// Print typed dependencies
		log.info(gs);
	}
	
}

Source File: StanfordPosTagger.java From OpenEphyra with GNU General Public License v2.0

5 votes

/**
 * Combines the tokens into a <code>Sentence</code> 
 * 
 * @param tokens
 * @return <code>Sentence</code> made of the tokens
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
	ArrayList<HasWord> wordList = new ArrayList<HasWord>();
	
	for (String s : tokens) {
		HasWord w = new Word(s);
		wordList.add(w);
	}
	
	Sentence sentence = new Sentence();
	sentence.setWords(wordList);
	
	return sentence;
}

Source File: StanfordPOSTagger.java From ADW with GNU General Public License v3.0

5 votes

public List<TaggedWord> tag(List<? extends HasWord> sentence)
{
	if(sentence == null || sentence.size() == 0)
		return new ArrayList<TaggedWord>();
	
	return tagger.tagSentence(sentence);
}

Source File: TaggerDemo.java From blog-codes with Apache License 2.0

4 votes

public static void main(String[] args) throws Exception { 
	InputStream input = TaggerDemo.class.getResourceAsStream("/"+MaxentTagger.DEFAULT_JAR_PATH);

	MaxentTagger tagger = new MaxentTagger(input);
	
	List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader("Karma of humans is AI"));

	for (List<HasWord> sentence : sentences) {

		List<TaggedWord> tSentence = tagger.tagSentence(sentence);

		System.out.println(SentenceUtils.listToString(tSentence, false));

	}

}

Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

4 votes

private static void usingTheStanfordTokenizer() {

        // Using PTBTokenizer
        System.out.println("----PTBTokenizer Example");

        // First example
//        PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph),
//                new CoreLabelTokenFactory(),null);
//        while (ptb.hasNext()) {
//            System.out.println(ptb.next());
//        }
        // CoreLabel example
        CoreLabelTokenFactory ctf = new CoreLabelTokenFactory();
        PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph),
                ctf, "invertible=true");
//        PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph),
//                new WordTokenFactory(), null);
        while (ptb.hasNext()) {
            CoreLabel cl = (CoreLabel) ptb.next();
            System.out.println(cl.originalText() + " ("
                    + cl.beginPosition() + "-" + cl.endPosition() + ")");
        }

        // Using a DocumentPreprocessor
        System.out.println("----DocumentPreprocessor Example");
        Reader reader = new StringReader(paragraph);
        DocumentPreprocessor documentPreprocessor
                = new DocumentPreprocessor(reader);

        Iterator<List<HasWord>> it = documentPreprocessor.iterator();
        while (it.hasNext()) {
            List<HasWord> sentence = it.next();
            for (HasWord token : sentence) {
                System.out.println(token);
            }
        }

//        for (List<HasWord> sentence : documentPreprocessor) {
////            List<HasWord> sentence = it.next();
//            for (HasWord token : sentence) {
//                System.out.println(token);
//            }
//        }
        // Using a pipeline
        System.out.println("----pipeline Example");
        Properties properties = new Properties();
        properties.put("annotators", "tokenize, ssplit");

        StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
        Annotation annotation = new Annotation(paragraph);

        pipeline.annotate(annotation);
        pipeline.prettyPrint(annotation, System.out);

    }

Source File: StanfordPOSTagger.java From ADW with GNU General Public License v3.0

4 votes

public List<TaggedWord> tag(String sentence)
{
	List<HasWord> tokens = Sentence.toWordList(sentence.split("\\s+"));
	return tag(tokens);
}

edu.stanford.nlp.ling.HasWord Java Examples