Java Code Examples for org.grobid.core.layout.LayoutToken#getText()
The following examples show how to use
org.grobid.core.layout.LayoutToken#getText().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SimilarityScorer.java From entity-fishing with Apache License 2.0 | 5 votes |
/** * Normalise LayoutTokens sequence as an array of words correspond to word embeddings */ private List<String> toStringEmbeddings(List<LayoutToken> tokens, String lang) { List<String> toks = new ArrayList<String>(); for(LayoutToken token : tokens) { String word = token.getText(); if (word == null || word.trim().length() == 0) continue; if (ProcessText.delimiters.indexOf(word) != -1) continue; // unicode normalization word = UnicodeUtil.normaliseText(word); // remove possible remaining punctuations word = word.replaceAll("\\p{P}", ""); // flatten numerical chars word = word.replaceAll("\\d", "0"); // lower case everything (to be evaluated!) word = word.toLowerCase(); word = word.replace("\t", ""); if (word.trim().length() == 0) continue; try { if (!Stopwords.getInstance().isStopword(word, lang)) toks.add(word); } catch(Exception e) { LOGGER.warn("Problem getting Stopwords instance", e); toks.add(word); } } return toks; }
Example 2
Source File: NerdRestProcessFile.java From entity-fishing with Apache License 2.0 | 5 votes |
/**
 * Tells whether the text carried by the given tokens should be lower-cased.
 *
 * Returns true when the concatenated text is entirely upper case (per
 * {@code isAllUpperCase}), or when every counted token is either a single upper-case
 * character or a word whose first character is upper case and whose remainder is all
 * lower case. Tokens contained in {@code TextUtilities.fullPunctuations} are not counted.
 * If no token is counted at all, the result is also true.
 *
 * @param layoutTokens the tokens to inspect
 * @return true if the text should be lower-cased, false otherwise
 */
private boolean needToLowerCase(List<LayoutToken> layoutTokens) {
    // NOTE(review): how LayoutTokensUtil.toText treats tokens with null text is not
    // visible from here - confirm it tolerates them.
    if (isAllUpperCase(LayoutTokensUtil.toText(layoutTokens))) {
        return true;
    }

    int count = 0; // tokens that look like "X" or "Xxxx"
    int total = 0; // tokens that were taken into account
    for (LayoutToken token : layoutTokens) {
        final String tokenText = token.getText();
        // a token without text carries no casing information; skipping it also avoids
        // the NullPointerException the checks below would otherwise raise
        if (tokenText == null || TextUtilities.fullPunctuations.contains(tokenText)) {
            continue;
        }

        total++;
        if (tokenText.length() == 1) {
            if (TextUtilities.isAllUpperCase(tokenText)) {
                count++;
            }
        } else if (tokenText.length() > 1) {
            if (Character.isUpperCase(tokenText.charAt(0))
                    && TextUtilities.isAllLowerCase(tokenText.substring(1))) {
                count++;
            }
        }
    }

    return count == total;
}
Example 3
Source File: NERParserCommon.java From grobid-ner with Apache License 2.0 | 5 votes |
public static String toFeatureVectorLayout(List<LayoutToken> tokens, LexiconPositionsIndexes positionsIndexes) { StringBuffer ress = new StringBuffer(); int posit = 0; // keep track of the position index in the list of positions for (LayoutToken token : tokens) { if ((token.getText() == null) || (token.getText().length() == 0) || token.getText().equals(" ") || token.getText().equals("\t") || token.getText().equals("\n") || token.getText().equals("\r") || token.getText().equals("\u00A0")) { continue; } // check if the token is a known NE // do we have a NE at position posit? boolean isLocationToken = LexiconPositionsIndexes .isTokenInLexicon(positionsIndexes.getLocalLocationPositions(), posit); boolean isPersonTitleToken = LexiconPositionsIndexes .isTokenInLexicon(positionsIndexes.getLocalPersonTitlePositions(), posit); boolean isOrganisationToken = LexiconPositionsIndexes .isTokenInLexicon(positionsIndexes.getLocalOrganisationPositions(), posit); boolean isOrgFormToken = LexiconPositionsIndexes .isTokenInLexicon(positionsIndexes.getLocalOrgFormPositions(), posit); ress.append(FeaturesVectorNER .addFeaturesNER(token.getText(), isLocationToken, isPersonTitleToken, isOrganisationToken, isOrgFormToken) .printVector()); ress.append("\n"); posit++; } ress.append("\n"); return ress.toString(); }
Example 4
Source File: ProcessText.java From entity-fishing with Apache License 2.0 | 4 votes |
public Map<Mention, Mention> acronymCandidates(List<LayoutToken> tokens) { Map<Mention, Mention> acronyms = null; // detect possible acronym boolean openParenthesis = false; int posParenthesis = -1; int i = 0; LayoutToken acronym = null; for (LayoutToken token : tokens) { if (token.getText() == null) { i++; continue; } if (token.getText().equals("(")) { openParenthesis = true; posParenthesis = i; acronym = null; } else if (token.getText().equals(")")) { openParenthesis = false; } else if (openParenthesis) { if (isAllUpperCaseOrDigitOrDot(token.getText())) { acronym = token; } else { acronym = null; } } if ((acronym != null) && (!openParenthesis)) { // check if this possible acronym matches an immediately preceeding term int j = posParenthesis; int k = acronym.getText().length(); boolean stop = false; while ((k > 0) && (!stop)) { k--; char c = acronym.getText().toLowerCase().charAt(k); while ((j > 0) && (!stop)) { j--; if (tokens.get(j) != null) { String tok = tokens.get(j).getText(); if (tok.trim().length() == 0 || delimiters.contains(tok)) continue; boolean numericMatch = false; if ((tok.length() > 1) && StringUtils.isNumeric(tok)) { //System.out.println("acronym: " + acronym.getText()); //System.out.println("tok: " + tok); // when the token is all digit, it often appears in full as such in the // acronym (e.g. 
GDF15) String acronymCurrentPrefix = acronym.getText().substring(0, k + 1); //System.out.println("acronymCurrentPrefix: " + acronymCurrentPrefix); if (acronymCurrentPrefix.endsWith(tok)) { // there is a full number match k = k - tok.length() + 1; numericMatch = true; //System.out.println("numericMatch is: " + numericMatch); } } if ((tok.toLowerCase().charAt(0) == c) || numericMatch) { if (k == 0) { if (acronyms == null) acronyms = new HashMap<>(); List<LayoutToken> baseTokens = new ArrayList<>(); StringBuilder builder = new StringBuilder(); for (int l = j; l < posParenthesis; l++) { builder.append(tokens.get(l)); baseTokens.add(tokens.get(l)); } Mention entityAcronym = new Mention(); entityAcronym.setRawName(acronym.getText()); entityAcronym.setNormalisedName(builder.toString().trim()); entityAcronym.setOffsetStart(acronym.getOffset()); entityAcronym.setOffsetEnd(acronym.getOffset() + acronym.getText().length()); entityAcronym.setType(null); entityAcronym.setIsAcronym(true); entityAcronym.setLayoutTokens(Arrays.asList(acronym)); Mention entityBase = new Mention(builder.toString().trim()); entityBase.setOffsetStart(tokens.get(j).getOffset()); entityBase.setOffsetEnd(tokens.get(j).getOffset() + entityBase.getRawName().length()); entityBase.setLayoutTokens(baseTokens); acronyms.put(entityAcronym, entityBase); stop = true; } else break; } else { stop = true; } } } } acronym = null; posParenthesis = -1; } i++; } return acronyms; }