edu.stanford.nlp.ling.Word Java Examples

The following examples show how to use edu.stanford.nlp.ling.Word. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: StanfordPTBTokenizer.java    From mateplus with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Splits a sentence into PTB tokens and returns their word forms, preceded
 * by the CoNLL-09 root marker.
 *
 * @param sentence
 *            the raw sentence text
 * @return an array whose element 0 is {@code is2.io.CONLLReader09.ROOT} and
 *         whose remaining elements are the token strings in reading order
 */
@Override
public String[] tokenize(String sentence) {
	PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(sentence));

	// Seed the output with the root marker so the real tokens start at index 1.
	List<String> forms = new ArrayList<String>();
	forms.add(is2.io.CONLLReader09.ROOT);
	while (ptb.hasNext()) {
		forms.add(ptb.next().word());
	}
	return forms.toArray(new String[forms.size()]);
}
 
Example #2
Source File: StanfordPTBTokenizer.java    From mateplus with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Splits a sentence into PTB tokens and returns them together with their
 * character positions, preceded by a root entry at position (0, 0).
 * <p>
 * Each token's begin and end position is shifted by the current value of
 * {@code startpos}. After tokenizing, {@code startpos} is advanced by the
 * sentence length plus one, so this method changes instance state on every
 * call. (Presumably the extra one accounts for a separator character
 * between consecutive sentences -- confirm against the caller.)
 *
 * @param sentence
 *            the raw sentence text
 * @return an array whose element 0 is the root entry and whose remaining
 *         elements are the positioned tokens in reading order
 */
public StringInText[] tokenizeplus(String sentence) {
	PTBTokenizer<Word> ptb = PTBTokenizer.newPTBTokenizer(new StringReader(sentence));

	List<StringInText> spans = new ArrayList<StringInText>();
	while (ptb.hasNext()) {
		Word token = ptb.next();
		spans.add(new StringInText(token.word(),
				token.beginPosition() + startpos,
				token.endPosition() + startpos));
	}

	// Slot 0 is reserved for the root entry; the tokens follow from slot 1.
	StringInText[] result = new StringInText[spans.size() + 1];
	result[0] = new StringInText(is2.io.CONLLReader09.ROOT, 0, 0);
	for (int k = 0; k < spans.size(); k++) {
		result[k + 1] = spans.get(k);
	}

	// Move the running offset past this sentence.
	startpos += (1 + sentence.length());

	return result;
}
 
Example #3
Source File: StanfordParser.java    From OpenEphyra with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Tokenizes and parses a sentence, then reports the parser's PCFG score,
 * which callers can use as a confidence measure.
 * 
 * @param sentence
 *            the sentence to parse
 * @return the PCFG score reported by the parser after parsing the sentence
 * @throws RuntimeException
 *             if <code>tlp</code> or <code>parser</code> is still
 *             <code>null</code>
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence)
{
    boolean initialized = tlp != null && parser != null;
    if (!initialized)
    {
        throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");

    // tokenizing, parsing and reading the score all happen while holding
    // the parser's monitor
    synchronized (parser)
    {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
            new StringReader(sentence));
        List<Word> tokens = tokenizer.tokenize();
        log.debug("Tokenization: " + tokens);

        parser.parse(new Sentence(tokens));
        return parser.getPCFGScore();
    }
}
 
Example #4
Source File: SentenceProcessor.java    From ADW with GNU General Public License v3.0 6 votes vote down vote up
/**
 * POS-tags a sentence and lemmatizes every tagged word.
 *
 * @param sentence
 *            the sentence text
 * @param isTokenized
 *            if true the sentence string is handed to the tagger directly;
 *            otherwise it is first tokenized with StanfordTokenizer
 * @return one WordLemmaTag per tagged word, in the order the tagger
 *         returned them
 */
public List<WordLemmaTag> processSentence(String sentence, boolean isTokenized)
{
	final StanfordLemmatizer lemmatizer = StanfordLemmatizer.getInstance();
	final StanfordPOSTagger tagger = StanfordPOSTagger.getInstance();

	// the tagged sentence
	final List<TaggedWord> tagged;
	if (!isTokenized)
	{
		final List<Word> words = StanfordTokenizer.getInstance().tokenize(sentence);
		tagged = tagger.tag(words);
	}
	else
	{
		tagged = tagger.tag(sentence);
	}

	// lemmatize each tagged word in turn
	final List<WordLemmaTag> lemmatized = new ArrayList<WordLemmaTag>();
	for (final TaggedWord taggedWord : tagged)
	{
		lemmatized.add(lemmatizer.lemmatize(taggedWord));
	}
	return lemmatized;
}
 
Example #5
Source File: StanfordTokenizer.java    From ADW with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Tokenizes text into Word objects using a freshly built PTBTokenizer.
 * <p>
 * If that tokenizer throws, the exception message is printed to standard
 * error and the text is instead passed through <code>pennTokenizer</code>,
 * whose output is split on whitespace.
 *
 * @param string
 *            the text to tokenize
 * @return the tokens of the text
 */
public List<Word> tokenize(String string)
{
	// the tokenizer is stored in a field and replaced on every call
	final String options = "untokenizable=noneDelete,ptb3Escaping=true";
	this.tokenizer = new PTBTokenizer<Word>(new StringReader(string), new WordTokenFactory(), options);

	try
	{
		return this.tokenizer.tokenize();
	}
	catch (Exception e)
	{
		System.err.println(e.getMessage());
	}

	// only reached when the PTB tokenizer failed: fall back to pennTokenizer
	final List<Word> fallback = new ArrayList<Word>();
	for (String piece : pennTokenizer.tokenize(string).split("\\s+"))
	{
		fallback.add(new Word(piece));
	}
	return fallback;
}
 
Example #6
Source File: StanfordParser.java    From OpenEphyra with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Tokenizes and parses a sentence and returns the best parse as a string.
 * 
 * @param sentence
 *            the sentence to parse
 * @return the string form of the best parse tree, with every occurrence of
 *         a space followed by a bracketed run of non-whitespace characters
 *         removed
 * @throws RuntimeException
 *             if <code>tlp</code> or <code>parser</code> is still
 *             <code>null</code>
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence)
{
    boolean initialized = tlp != null && parser != null;
    if (!initialized)
    {
        throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");

    // tokenizing, parsing and fetching the best parse all happen while
    // holding the parser's monitor
    Tree bestParse;
    synchronized (parser)
    {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
            new StringReader(sentence));
        List<Word> tokens = tokenizer.tokenize();
        log.debug("Tokenization: " + tokens);

        parser.parse(new Sentence(tokens));
        bestParse = parser.getBestParse();
    }

    // NOTE: labelling the tree with character extents (updateTreeLabels,
    // createMapping, mapOffsets) is currently disabled here.

    String rendered = bestParse.toString();
    return rendered.replaceAll(" \\[[\\S]+\\]", "");
}
 
Example #7
Source File: StanfordPosTagger.java    From OpenEphyra with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Wraps each token string in a <code>Word</code> and collects the words
 * into a new <code>Sentence</code>.
 * 
 * @param tokens
 *            the token strings, in sentence order
 * @return a <code>Sentence</code> whose words are the given tokens
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
	ArrayList<HasWord> words = new ArrayList<HasWord>(tokens.length);
	for (int i = 0; i < tokens.length; i++) {
		words.add(new Word(tokens[i]));
	}

	Sentence result = new Sentence();
	result.setWords(words);
	return result;
}
 
Example #8
Source File: StanfordTokenizer.java    From ADW with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Tokenizes text and returns the word string of every token.
 *
 * @param string
 *            the text to tokenize
 * @return the token strings, in the order produced by tokenize(String)
 */
public List<String> tokenizeString(String string)
{
	final List<Word> words = tokenize(string);
	final List<String> forms = new ArrayList<String>(words.size());
	for (final Word word : words)
	{
		forms.add(word.word());
	}
	return forms;
}
 
Example #9
Source File: POSTagger.java    From EasySRL with Apache License 2.0 4 votes vote down vote up
/**
 * Tags a sentence: every input's {@code word} field is wrapped in a
 * {@code Word}, the resulting list is passed to the tagger, and each tagged
 * result becomes a new {@code InputWord} carrying its word and tag, with
 * {@code null} as the third constructor argument.
 *
 * @param input the words of the sentence, in order
 * @return one {@code InputWord} per element the tagger returned
 */
@Override
public List<InputWord> tag(final List<InputWord> input) {
	final List<Word> words = input.stream()
			.map(inputWord -> new Word(inputWord.word))
			.collect(Collectors.toList());

	return tagger.tagSentence(words).stream()
			.map(tagged -> new InputWord(tagged.word(), tagged.tag(), null))
			.collect(Collectors.toList());
}