edu.stanford.nlp.ling.Sentence Java Examples
The following examples show how to use edu.stanford.nlp.ling.Sentence. They are drawn from open-source projects; the source file, project, and license are listed above each example.
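Most of the examples below lean on two static helpers from this class: Sentence.toWordList(...), which wraps plain String tokens as HasWord objects, and Sentence.listToString(...), which joins a token list back into a single string. The minimal sketch below is my own illustration rather than code from any of the projects that follow (newer CoreNLP releases expose the same helpers under the name SentenceUtils):

import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;

public class SentenceHelpersDemo {
    public static void main(String[] args) {
        // Wrap plain String tokens as HasWord objects
        List<HasWord> tokens = Sentence.toWordList("The", "cow", "jumped", "over", "the", "moon", ".");

        // Join the tokens back into a single space-separated string
        System.out.println(Sentence.listToString(tokens)); // The cow jumped over the moon .
    }
}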
Example #1
Source File: StanfordParser.java From OpenEphyra with GNU General Public License v2.0
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 *
 * @param sentence a sentence
 * @return PCFG score
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
                new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }

    return score;
}
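A call site for this helper might look like the sketch below. It assumes the OpenEphyra StanfordParser class shown above is on the classpath and has already been initialized with its model (otherwise the RuntimeException above is thrown); the initialization step is specific to OpenEphyra and omitted here.

// Hypothetical call site; StanfordParser must already be initialized.
double score = StanfordParser.getPCFGScore("The cow jumped over the moon.");
System.out.printf("PCFG score: %f%n", score);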
Example #2
Source File: StanfordPosTagger.java From OpenEphyra with GNU General Public License v2.0
/**
 * Splits the sentence into individual tokens.
 *
 * @param sentence Input sentence
 * @return Array of tokens
 */
public static String[] tokenize(String sentence) {
    List t = MaxentTagger.tokenizeText(new StringReader(sentence));
    List<String> tokens = new ArrayList<String>();
    for (int j = 0; j < t.size(); j++) {
        Sentence s1 = (Sentence) t.get(j);
        for (int i = 0; i < s1.length(); i++) {
            HasWord w = s1.getHasWord(i);
            tokens.add(w.word());
        }
    }
    return (String[]) tokens.toArray(new String[tokens.size()]);
}
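Usage is a single static call; a hypothetical snippet, again assuming the OpenEphyra class above is on the classpath:

String[] tokens = StanfordPosTagger.tokenize("The cow jumped over the moon.");
System.out.println(java.util.Arrays.toString(tokens));
// roughly: [The, cow, jumped, over, the, moon, .]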
Example #3
Source File: StanfordPosTagger.java From OpenEphyra with GNU General Public License v2.0
/**
 * Tags the tokens with part of speech.
 *
 * @param tokens Array of token strings
 * @return Part of speech tags
 */
public static String[] tagPos(String[] tokens) {
    Sentence untagged = createSentence(tokens);
    Sentence tagged = MaxentTagger.tagSentence(untagged);

    String[] pos = new String[tagged.size()];
    for (int i = 0; i < tagged.size(); i++) {
        HasWord w = (HasWord) tagged.get(i);
        String[] s = w.toString().split("/");
        if (s.length > 1)
            pos[i] = s[s.length - 1];
        else
            pos[i] = "";
    }

    return pos;
}
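Chained with tokenize(...) from Example #2, a hypothetical end-to-end call could look as follows; the POS model must have been loaded through the class's own initialization first, which is omitted here.

// Hypothetical call site; the tagger model must already be loaded.
String[] tokens = StanfordPosTagger.tokenize("The cow jumped over the moon.");
String[] tags = StanfordPosTagger.tagPos(tokens);
for (int i = 0; i < tokens.length; i++) {
    System.out.println(tokens[i] + "\t" + tags[i]);
}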
Example #4
Source File: CoverageChecker.java From phrasal with GNU General Public License v3.0
static public void countNgrams(String line, Counter<String> ngramCounts, Set<String> limitSet, int order) {
    String[] toks = line.split("\\s");
    for (int i = 0; i < toks.length; i++) {
        for (int j = 0; j < order && j + i < toks.length; j++) {
            String[] ngramArr = Arrays.copyOfRange(toks, i, i + j + 1);
            String ngram = Sentence.listToString(Arrays.asList(ngramArr));
            if (limitSet == null || limitSet.contains(ngram)) {
                ngramCounts.incrementCount(ngram);
            }
        }
    }
}
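Because the Counter is passed in by the caller, counts accumulate across lines. A short sketch using ClassicCounter, the standard Counter implementation in edu.stanford.nlp.stats:

Counter<String> ngramCounts = new ClassicCounter<String>();
CoverageChecker.countNgrams("the quick brown fox", ngramCounts, null, 3);
CoverageChecker.countNgrams("the lazy brown dog", ngramCounts, null, 3);
System.out.println(ngramCounts.getCount("the"));       // 2.0
System.out.println(ngramCounts.getCount("brown fox")); // 1.0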
Example #5
Source File: StanfordParser.java From OpenEphyra with GNU General Public License v2.0
/**
 * Parses a sentence and returns a string representation of the parse tree.
 *
 * @param sentence a sentence
 * @return Tree whose Label is a MapLabel containing correct begin and end
 *         character offsets in keys BEGIN_KEY and END_KEY
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree tree = null;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
                new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        tree = parser.getBestParse();
    }

    // label tree with character extents
    // log.debug("Setting character extents");
    // updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1));
    // log.debug("Creating offset mapping");
    // List<RangeMap> mapping = createMapping(sentence);
    // log.debug(mapping.toString());
    // log.debug("Applying offset mapping");
    // mapOffsets(tree, mapping);

    return tree.toString().replaceAll(" \\[[\\S]+\\]", "");
}
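As in Example #1, the caller passes a raw sentence string and gets back a Penn Treebank style bracketing; the sketch below assumes the parser has already been initialized.

// Hypothetical call site; StanfordParser must already be initialized.
String tree = StanfordParser.parse("The cow jumped over the moon.");
System.out.println(tree); // e.g. (ROOT (S (NP (DT The) (NN cow)) ...))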
Example #6
Source File: StanfordPosTagger.java From OpenEphyra with GNU General Public License v2.0
/**
 * Combines the tokens into a <code>Sentence</code>.
 *
 * @param tokens
 * @return <code>Sentence</code> made of the tokens
 */
@SuppressWarnings("unchecked")
private static Sentence createSentence(String[] tokens) {
    ArrayList<HasWord> wordList = new ArrayList<HasWord>();
    for (String s : tokens) {
        HasWord w = new Word(s);
        wordList.add(w);
    }

    Sentence sentence = new Sentence();
    sentence.setWords(wordList);

    return sentence;
}
Example #7
Source File: POSTagger.java From JHazm with MIT License
public List<TaggedWord> batchTag(List<String> sentence) {
    String[] sen = new String[sentence.size()];
    for (int i = 0; i < sentence.size(); i++)
        sen[i] = sentence.get(i).replace(" ", "_");

    List newSent = Sentence.toWordList(sen);
    List taggedSentence = this.tagger.tagSentence(newSent);

    List<TaggedWord> taggedSen = new ArrayList<>();
    for (int i = 0; i < taggedSentence.size(); i++) {
        TaggedWord tw = (TaggedWord) taggedSentence.get(i);
        tw.setWord(sentence.get(i));
        taggedSen.add(tw);
    }

    return taggedSen;
}
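A hypothetical call, given an already constructed JHazm POSTagger (model loading is specific to JHazm and omitted here). Note that tokens containing spaces are temporarily re-joined with underscores before tagging and restored afterwards.

// 'tagger' is an already constructed JHazm POSTagger instance (construction omitted).
List<String> words = java.util.Arrays.asList("من", "به", "مدرسه", "رفتم");
List<TaggedWord> tagged = tagger.batchTag(words);
for (TaggedWord tw : tagged) {
    System.out.println(tw.word() + "/" + tw.tag());
}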
Example #8
Source File: Chapter7.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordLexicalizedParser() {
    String parserModel = "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parserModel);

    // This option shows parsing a list of correctly tokenized words
    System.out.println("---First option");
    String[] senetenceArray = {"The", "cow", "jumped", "over", "the", "moon", "."};
    List<CoreLabel> words = Sentence.toCoreLabelList(senetenceArray);
    Tree parseTree = lexicalizedParser.apply(words);
    parseTree.pennPrint();
    System.out.println();

    // This option shows loading and using an explicit tokenizer
    System.out.println("---Second option");
    String sentence = "The cow jumped over the moon.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tokenizer =
            tokenizerFactory.getTokenizer(new StringReader(sentence));
    List<CoreLabel> wordList = tokenizer.tokenize();
    parseTree = lexicalizedParser.apply(wordList);

    TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack(); // PennTreebankLanguagePack for English
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);

    for (TypedDependency dependency : tdl) {
        System.out.println("Governor Word: [" + dependency.gov()
                + "] Relation: [" + dependency.reln().getLongName()
                + "] Dependent Word: [" + dependency.dep() + "]");
    }
    System.out.println();

    // You can also use a TreePrint object to print trees and dependencies
    // System.out.println("---Using TreePrint");
    // TreePrint treePrint = new TreePrint("penn,typedDependenciesCollapsed");
    // treePrint.printTree(parseTree);
    // System.out.println("TreePrint Formats");
    // for (String format : TreePrint.outputTreeFormats) {
    //     System.out.println(format);
    // }
    // System.out.println();
}
Example #9
Source File: CRFPostprocessor.java From phrasal with GNU General Public License v3.0
/**
 * Evaluate the postprocessor given an input file specified in the flags.
 *
 * @param preProcessor
 * @param pwOut
 */
protected void evaluate(Preprocessor preProcessor, PrintWriter pwOut) {
    System.err.println("Starting evaluation...");
    DocumentReaderAndWriter<CoreLabel> docReader =
            new ProcessorTools.PostprocessorDocumentReaderAndWriter(preProcessor);
    ObjectBank<List<CoreLabel>> lines =
            classifier.makeObjectBankFromFile(flags.testFile, docReader);

    Counter<String> labelTotal = new ClassicCounter<String>();
    Counter<String> labelCorrect = new ClassicCounter<String>();
    int total = 0;
    int correct = 0;
    PrintWriter pw = new PrintWriter(IOTools.getWriterFromFile("apply.out"));
    for (List<CoreLabel> line : lines) {
        line = classifier.classify(line);
        pw.println(Sentence.listToString(ProcessorTools.toPostProcessedSequence(line)));
        total += line.size();
        for (CoreLabel label : line) {
            String hypothesis = label.get(CoreAnnotations.AnswerAnnotation.class);
            String reference = label.get(CoreAnnotations.GoldAnswerAnnotation.class);
            labelTotal.incrementCount(reference);
            if (hypothesis.equals(reference)) {
                correct++;
                labelCorrect.incrementCount(reference);
            }
        }
    }
    pw.close();

    double accuracy = ((double) correct) / ((double) total);
    accuracy *= 100.0;

    pwOut.println("EVALUATION RESULTS");
    pwOut.printf("#datums:\t%d%n", total);
    pwOut.printf("#correct:\t%d%n", correct);
    pwOut.printf("accuracy:\t%.2f%n", accuracy);
    pwOut.println("==================");

    // Output the per label accuracies
    pwOut.println("PER LABEL ACCURACIES");
    for (String refLabel : labelTotal.keySet()) {
        double nTotal = labelTotal.getCount(refLabel);
        double nCorrect = labelCorrect.getCount(refLabel);
        double acc = (nCorrect / nTotal) * 100.0;
        pwOut.printf(" %s\t%.2f%n", refLabel, acc);
    }
}
Example #10
Source File: CRFPostprocessor.java From phrasal with GNU General Public License v3.0
/**
 * Decode raw text input.
 *
 * @param postProcessor
 * @param reader
 * @param outstream
 * @param nThreads
 * @return
 */
protected static double decode(final CRFPostprocessor postProcessor,
        BufferedReader reader, PrintWriter outstream, int nThreads) {
    long numChars = 0;
    int lineNumber = 0;
    long startTime = System.nanoTime();
    try {
        // Setup the threadpool
        MulticoreWrapper<String,String> wrapper =
                new MulticoreWrapper<String,String>(nThreads,
                        new ThreadsafeProcessor<String,String>() {
            @Override
            public String process(String input) {
                List<CoreLabel> labeledSeq = ProcessorTools.toCharacterSequence(input);
                labeledSeq = postProcessor.classifier.classify(labeledSeq);
                List<CoreLabel> tokenSeq = ProcessorTools.toPostProcessedSequence(labeledSeq);
                return Sentence.listToString(tokenSeq);
            }
            @Override
            public ThreadsafeProcessor<String, String> newInstance() {
                return this;
            }
        });

        // Read the input
        for (String line; (line = reader.readLine()) != null; ++lineNumber) {
            numChars += line.length();
            wrapper.put(line.trim());
            while (wrapper.peek())
                outstream.println(wrapper.poll());
        }
        wrapper.join();
        while (wrapper.peek())
            outstream.println(wrapper.poll());

    } catch (IOException e) {
        // Note: the original format string had a trailing %s with no matching
        // argument, which would itself throw; it is replaced with %n here.
        System.err.printf("%s: Error at input line %d%n",
                CRFPostprocessor.class.getName(), lineNumber);
        e.printStackTrace();
    }

    // Calculate throughput
    double elapsedTime = ((double) System.nanoTime() - startTime) / 1e9;
    double charsPerSecond = (double) numChars / elapsedTime;
    return charsPerSecond;
}
Example #11
Source File: RuleQuery.java From phrasal with GNU General Public License v3.0
@Override
public String toString() {
    return String.format("%s (%s) %.5f", Sentence.listToString(tgt),
            Sentence.listToString(align), score);
}
Example #12
Source File: TranslationQuery.java From phrasal with GNU General Public License v3.0
@Override
public String toString() {
    return String.format("%s (%s) %.5f", Sentence.listToString(tgt),
            Sentence.listToString(align), score);
}
Example #13
Source File: MakePTMPhrasalInput.java From phrasal with GNU General Public License v3.0
/**
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        System.err.print(usage());
        System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, argDefs());
    String[] positionalArgs = options.getProperty("").split("\\s+");
    String srcLang = positionalArgs[0];
    String tgtLang = positionalArgs[1];
    String sqlFile = positionalArgs[2];

    Preprocessor srcPreproc = ProcessorFactory.getPreprocessor(srcLang);
    Preprocessor tgtPreproc = ProcessorFactory.getPreprocessor(tgtLang);

    System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s%n", "doc_id", "seg_id", "username",
            "mt_tok", "user_tok", "s2mt_tok", "src_tok");

    // Read the exported SQL table with opencsv's CSVReader
    CSVReader reader = new CSVReader(new FileReader(sqlFile));
    // Skip header
    boolean seenHeader = false;
    for (String[] fields; (fields = reader.readNext()) != null;) {
        if (!seenHeader) {
            seenHeader = true;
            continue;
        }
        // String segId = String.format("%s:%s", fields[0], fields[1]).replace(".src.json", ".tgt");
        String tgtLine = fields[3].trim();
        String alignStr = extend(fields[5]).trim();
        String srcLine = fields[6].trim();
        SymmetricalWordAlignment s2t = new SymmetricalWordAlignment(srcLine, tgtLine, alignStr);
        SymmetricalWordAlignment s2sPrime = srcPreproc.processAndAlign(srcLine);
        SymmetricalWordAlignment t2tPrime = tgtPreproc.processAndAlign(tgtLine);
        String userTextTok = tgtPreproc.process(fields[3]).toString();

        // Want sprime --> tprime
        List<String> alignmentList = new LinkedList<>();
        for (int i = 0, size = s2sPrime.eSize(); i < size; ++i) {
            Set<Integer> alignments = s2sPrime.e2f(i);
            for (int j : alignments) {
                Set<Integer> alignments2 = s2t.f2e(j);
                for (int k : alignments2) {
                    Set<Integer> alignments3 = t2tPrime.f2e(k);
                    for (int q : alignments3) {
                        alignmentList.add(String.format("%d-%d", i, q));
                    }
                }
            }
        }
        System.out.printf("%s\t%s\t%s\t%s\t%s\t%s\t%s%n", fields[0], fields[1], fields[2],
                t2tPrime.e().toString(), userTextTok,
                Sentence.listToString(alignmentList), s2sPrime.e().toString());
    }
    reader.close();
}
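The only Sentence call here is Sentence.listToString(alignmentList), which joins the "i-q" alignment pairs with single spaces for the output column. A tiny self-contained illustration of that behavior:

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.Sentence;

public class AlignmentJoinDemo {
    public static void main(String[] args) {
        List<String> alignmentList = Arrays.asList("0-0", "1-2", "2-1");
        System.out.println(Sentence.listToString(alignmentList)); // 0-0 1-2 2-1
    }
}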
Example #14
Source File: StanfordPOSTagger.java From ADW with GNU General Public License v3.0
public List<TaggedWord> tag(String sentence) {
    List<HasWord> tokens = Sentence.toWordList(sentence.split("\\s+"));
    return tag(tokens);
}
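A hypothetical call on an already obtained ADW StanfordPOSTagger instance (construction omitted); because the method splits on whitespace, the input is expected to be pre-tokenized.

// 'posTagger' is an already obtained ADW StanfordPOSTagger (construction omitted).
List<TaggedWord> tagged = posTagger.tag("The cow jumped over the moon .");
for (TaggedWord tw : tagged) {
    System.out.println(tw.word() + "/" + tw.tag());
}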
Example #15
Source File: AbstractWordClassMap.java From phrasal with GNU General Public License v3.0
/**
 * Map the input word to a word class.
 *
 * @param word
 * @return
 */
public IString get(IString word) {
    List<IString> classList = getList(word);
    return numMappings == 1 ? classList.get(0)
            : new IString(Sentence.listToString(classList, true, DELIMITER));
}
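When a word maps to several classes (numMappings > 1), the three-argument overload Sentence.listToString(list, justValue, separator) concatenates them with the class's DELIMITER. A small sketch of that overload in isolation, using a hypothetical "-" separator and made-up class names:

List<IString> classes = Arrays.asList(new IString("C12"), new IString("C7"));
System.out.println(Sentence.listToString(classes, true, "-")); // expected: C12-C7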