Java Code Examples for it.unimi.dsi.fastutil.objects.Object2IntMap#getInt()
The following examples show how to use it.unimi.dsi.fastutil.objects.Object2IntMap#getInt(). They are taken from open source projects; the source file, project, and license are noted above each example.
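For reference, fastutil's getInt(Object) never returns null: for an absent key it returns the map's default return value, which is 0 unless changed with defaultReturnValue(int). A minimal, self-contained sketch (class and variable names here are illustrative, not from any of the projects below):

import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

public class GetIntDemo {
    public static void main(String[] args) {
        Object2IntMap<String> counts = new Object2IntOpenHashMap<>();
        counts.put("apple", 3);

        // Present key: returns the stored primitive int, with no boxing.
        int apples = counts.getInt("apple");   // 3

        // Absent key: returns the default return value, 0 unless changed.
        int pears = counts.getInt("pear");     // 0

        // A sentinel default makes "missing" distinguishable from a real 0,
        // the pattern used by several of the examples below.
        counts.defaultReturnValue(-1);
        int plums = counts.getInt("plum");     // -1

        System.out.println(apples + " " + pears + " " + plums);
    }
}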
Example 1
Source File: BestAnchors.java From tagme with Apache License 2.0
String findBest(int wid, final Object2IntMap<String> anchors) throws IOException {
    Query q = new TermQuery(new Term(WikipediaIndexer.FIELD_WID, "" + wid));
    TopDocs td = articles.search(q, 1);
    if (td.totalHits == 0)
        return null; //throw new IOException("Unable to find title for WID:"+wid);

    String title = articles.doc(td.scoreDocs[0].doc).get(WikipediaIndexer.FIELD_TITLE);
    title = title.replaceAll("\\\\\"", "\""); // unescape \" to "

    Set<String> titleTerms = terms(title).keySet();

    // sort anchors by descending frequency
    List<String> bests = new ArrayList<String>(anchors.size());
    bests.addAll(anchors.keySet());
    Collections.sort(bests, new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
            return anchors.getInt(o2) - anchors.getInt(o1);
        }
    });

    for (String a : bests) {
        if (anchors.getInt(a) < MIN_ANCHORS) continue;
        Set<String> anchorTerms = terms(a).keySet();
        for (String aw : anchorTerms)
            if (!titleTerms.contains(aw)) return a;
    }
    return null;
}
Example 2
Source File: RedirectMap.java From tagme with Apache License 2.0
@Override
protected Int2IntMap parseSet() throws IOException {
    final Object2IntMap<String> titles = new TitlesToWIDMap(lang).getDataset();
    final Int2IntOpenHashMap map = new Int2IntOpenHashMap(3000000);

    SQLWikiParser parser = new SQLWikiParser(log, "Titles NF") {
        @Override
        public boolean compute(ArrayList<String> values) throws IOException {
            int ns = Integer.parseInt(values.get(SQLWikiParser.REDIRECT_NS));
            if (ns == SQLWikiParser.NS_ARTICLE) {
                int idFrom = Integer.parseInt(values.get(SQLWikiParser.REDIRECT_ID_FROM));
                int idTo = titles.getInt(cleanPageName(values.get(SQLWikiParser.REDIRECT_TITLE_TO)));
                if (idTo >= 0) map.put(idFrom, idTo);
                else this.updateItem(0);
                return true;
            } else return false;
        }
    };

    File input = WikipediaFiles.REDIRECTS.getSourceFile(lang);
    InputStreamReader in = new InputStreamReader(new FileInputStream(input), Charset.forName("UTF-8"));
    parser.compute(in);
    in.close();

    map.defaultReturnValue(-1);
    map.trim();
    return map;
}
Example 3
Source File: TweetCentroid.java From AffectiveTweets with GNU General Public License v3.0
/**
 * Adds a new document to the word representation.
 * @param docVector a document vector
 */
public void addDoc(Object2IntMap<String> docVector) {
    this.numDoc++;
    for (String vecWord : docVector.keySet()) {
        int vecWordFreq = docVector.getInt(vecWord);
        // if the word was seen before we add the current frequency
        this.wordSpace.put(vecWord, vecWordFreq + this.wordSpace.getInt(vecWord));
    }
}
Example 4
Source File: NoDictionarySingleColumnGroupKeyGenerator.java From incubator-pinot with Apache License 2.0
@SuppressWarnings("unchecked") private int getKeyForValue(String value) { Object2IntMap<String> map = (Object2IntMap<String>) _groupKeyMap; int groupId = map.getInt(value); if (groupId == INVALID_ID) { if (_numGroups < _globalGroupIdUpperBound) { groupId = _numGroups; map.put(value, _numGroups++); } } return groupId; }
Example 5
Source File: LanguageDetector.java From jstarcraft-nlp with Apache License 2.0
/**
 * Detect the languages of a text.
 *
 * @param text
 * @param options
 * @return
 */
public SortedSet<DetectionLanguage> detectLanguages(String text, Object2BooleanMap<String> options) {
    SortedSet<DetectionLanguage> locales = new TreeSet<>();

    // minimum length limit
    int size = text.length();
    if (size < minimum) {
        return locales;
    }
    // maximum length limit
    if (size > maximum) {
        text = text.substring(0, maximum);
        size = maximum;
    }

    // whitelist and blacklist
    Set<String> writes = options.size() == 0 ? Collections.EMPTY_SET : new HashSet<>();
    Set<String> blacks = options.size() == 0 ? Collections.EMPTY_SET : new HashSet<>();
    for (Object2BooleanMap.Entry<String> option : options.object2BooleanEntrySet()) {
        if (option.getBooleanValue()) {
            writes.add(option.getKey());
        } else {
            blacks.add(option.getKey());
        }
    }

    /*
     * Get the script whose characters occur the most in `value`.
     */
    int count = -1;
    String script = null;
    for (DetectionPattern regulation : patterns.values()) {
        Pattern pattern = regulation.getPattern();
        Matcher matcher = pattern.matcher(text);
        int match = 0;
        while (matcher.find()) {
            match++;
        }
        if (match > count) {
            count = match;
            script = regulation.getName();
        }
    }
    if (script == null || count <= 0) {
        return locales;
    }

    /* One language exists for the most-used script. */
    Set<DetectionTrie> dictionaries = tires.get(script);
    if (dictionaries == null) {
        /*
         * If no matches occurred, such as a digit-only string, or because the language is ignored, exit with `und`.
         */
        if (!checkLanguage(script, writes, blacks)) {
            return locales;
        }
        locales.add(new DetectionLanguage(Locale.forLanguageTag(script), 1D));
        return locales;
    }

    /*
     * Get all distances for a given script, and normalize the distance values.
     */
    // pad with spaces on both sides for the N-gram processing
    text = StringUtility.SPACE + REPLACE.matcher(text).replaceAll(StringUtility.SPACE).toLowerCase() + StringUtility.SPACE;
    CharacterNgram ngram = new CharacterNgram(3, text);
    Object2IntMap<CharSequence> tuples = new Object2IntOpenHashMap<>();
    for (CharSequence character : ngram) {
        count = tuples.getInt(character);
        tuples.put(character, count + 1);
    }

    for (DetectionTrie dictionary : dictionaries) {
        String language = dictionary.getName();
        if (checkLanguage(language, writes, blacks)) {
            double score = getScore(tuples, dictionary.getTrie());
            DetectionLanguage locale = new DetectionLanguage(Locale.forLanguageTag(language), score);
            locales.add(locale);
        }
    }
    if (!locales.isEmpty()) {
        normalizeScores(text, locales);
    }
    return locales;
}
Example 6
Source File: WikipediaEdges.java From tagme with Apache License 2.0
@Override
protected void parseFile(File file) throws IOException {
    final Int2IntMap redirects = DatasetLoader.get(new RedirectMap(lang));
    final IntSet disambiguations = DatasetLoader.get(new DisambiguationWIDs(lang));
    final IntSet listpages = DatasetLoader.get(new ListPageWIDs(lang));
    final IntSet ignores = DatasetLoader.get(new IgnoreWIDs(lang));
    final IntSet valids = new AllWIDs(lang).getDataset(); //DatasetLoader.get(new AllWIDs(lang));
    valids.removeAll(redirects.keySet());
    //valids.removeAll(disambiguations);
    //valids.removeAll(listpages);
    valids.removeAll(ignores);

    final Object2IntMap<String> titles = DatasetLoader.get(new TitlesToWIDMap(lang));

    File tmp = Dataset.createTmpFile();
    final BufferedWriter out = new BufferedWriter(new FileWriter(tmp));

    SQLWikiParser parser = new SQLWikiParser(log) {
        @Override
        public boolean compute(ArrayList<String> values) throws IOException {
            int idFrom = Integer.parseInt(values.get(SQLWikiParser.PAGELINKS_ID_FROM));
            if (redirects.containsKey(idFrom)) idFrom = redirects.get(idFrom);
            int ns = Integer.parseInt(values.get(SQLWikiParser.PAGELINKS_NS));
            if (ns == SQLWikiParser.NS_ARTICLE &&
                    !redirects.containsKey(idFrom) &&
                    !ignores.contains(idFrom) &&
                    // this is necessary because some pages that are lists end up, in English,
                    // among the disambiguation pages (because of the category All_set_index_articles)
                    (listpages.contains(idFrom) || !disambiguations.contains(idFrom)) &&
                    //!listpages.contains(idFrom) && !disambiguations.contains(idFrom) &&
                    valids.contains(idFrom)) {
                String titleTo = Dataset.cleanPageName(values.get(SQLWikiParser.PAGELINKS_TITLE_TO));
                int idTo = titles.getInt(titleTo);
                if (redirects.containsKey(idTo)) idTo = redirects.get(idTo);
                if (idTo >= 0 && !ignores.contains(idTo) &&
                        (listpages.contains(idFrom) || !disambiguations.contains(idFrom)) &&
                        valids.contains(idTo)) {
                    out.append(Integer.toString(idFrom));
                    out.append(SEP_CHAR);
                    out.append(Integer.toString(idTo));
                    out.append('\n');
                    return true;
                }
            }
            return false;
        }
    };

    File input = WikipediaFiles.PAGE_LINKS.getSourceFile(lang);
    parser.compute(input);
    out.close();

    log.info("Now sorting edges...");
    ExternalSort sorter = new ExternalSort();
    sorter.setUniq(true);
    sorter.setNumeric(true);
    sorter.setColumns(new int[]{0, 1});
    sorter.setInFile(tmp.getAbsolutePath());
    sorter.setOutFile(file.getAbsolutePath());
    sorter.run();
    tmp.delete();
    log.info("Sorted. Done.");
}
Example 7
Source File: PTCM.java From AffectiveTweets with GNU General Public License v3.0
@Override
protected Instances process(Instances instances) throws Exception {
    Instances result;

    // The first batch creates the labelled data
    if (!this.isFirstBatchDone()) {
        result = getOutputFormat();

        for (String word : this.wordInfo.keySet()) {
            // get the word vector
            WordRep wordRep = this.wordInfo.get(word);

            // We just consider valid words
            if (wordRep.numDoc >= this.minInstDocs) {
                // a list of lists of tweet vectors
                ObjectList<ObjectList<Object2IntMap<String>>> partitions = wordRep.partition(this.getPartNumber());

                // traverse the partitions
                for (ObjectList<Object2IntMap<String>> tweetPartition : partitions) {
                    // create one instance per partition
                    double[] values = new double[result.numAttributes()];

                    // average the vectors of the tweets in the partition:
                    // traverse each feature space in the partition
                    for (Object2IntMap<String> wordSpace : tweetPartition) {
                        for (String innerWord : wordSpace.keySet()) {
                            // only include valid words
                            if (this.m_Dictionary.containsKey(innerWord)) {
                                int attIndex = this.m_Dictionary.getInt(innerWord);
                                // we normalize the value by the number of documents
                                values[attIndex] += ((double) wordSpace.getInt(innerWord)) / tweetPartition.size();
                            }
                        }
                    }

                    String wordPol = this.lex.getNomDict().get(word).get(this.polarityAttName);
                    if (wordPol.equals(this.polarityAttNegValName))
                        values[result.numAttributes() - 1] = 0;
                    else if (wordPol.equals(this.polarityAttPosValName))
                        values[result.numAttributes() - 1] = 1;
                    else
                        values[result.numAttributes() - 1] = Utils.missingValue();

                    Instance inst = new SparseInstance(1, values);
                    inst.setDataset(result);
                    result.add(inst);
                }
            }
        }
    }
    // Second batch maps tweets into the original feature space
    else {
        result = this.mapTargetInstance(instances);
    }

    return result;
}
Example 8
Source File: DistantSupervisionSyntheticFilter.java From AffectiveTweets with GNU General Public License v3.0
/**
 * Maps tweets from the second batch into instances that are compatible with the ones generated
 * @param inp input Instances
 * @return converted Instances
 */
public Instances mapTargetInstance(Instances inp) {
    // Creates instances with the same format
    Instances result = getOutputFormat();

    Attribute contentAtt = inp.attribute(this.m_textIndex.getIndex());

    for (Instance inst : inp) {
        String content = inst.stringValue(contentAtt);

        // tokenizes the content
        List<String> tokens = affective.core.Utils.tokenize(content, this.toLowerCase, this.standarizeUrlsUsers,
                this.reduceRepeatedLetters, this.m_tokenizer, this.m_stemmer, this.m_stopwordsHandler);

        // Identifies the distinct terms
        AbstractObjectSet<String> terms = new ObjectOpenHashSet<String>();
        terms.addAll(tokens);

        Object2IntMap<String> docVec = this.calculateDocVec(tokens);

        double[] values = new double[result.numAttributes()];
        values[result.classIndex()] = inst.classValue();

        for (String att : docVec.keySet()) {
            if (this.m_Dictionary.containsKey(att)) {
                int attIndex = this.m_Dictionary.getInt(att);
                // we normalise the value by the number of documents
                values[attIndex] = docVec.getInt(att);
            }
        }

        Instance outInst = new SparseInstance(1, values);
        outInst.setDataset(result); // attach the new instance to the output format
        result.add(outInst);
    }

    return result;
}
Example 9
Source File: TweetToSparseFeatureVector.java From AffectiveTweets with GNU General Public License v3.0
@Override
protected Instances process(Instances instances) throws Exception {
    Instances result = getOutputFormat();

    // if we are in the testing data, we calculate the word vectors again
    if (this.isFirstBatchDone()) {
        this.tweetsToVectors(instances);
    }

    int i = 0;
    for (Object2IntMap<String> vec : this.procTweets) {
        double[] values = new double[result.numAttributes()];

        // copy previous attribute values
        for (int n = 0; n < instances.numAttributes(); n++)
            values[n] = instances.instance(i).value(n);

        // add words using the frequency as attribute value
        for (String innerAtt : vec.keySet()) {
            // we only add the value of valid attributes
            if (result.attribute(innerAtt) != null) {
                int attIndex = result.attribute(innerAtt).index();
                values[attIndex] = (double) vec.getInt(innerAtt);
            }
        }

        Instance inst = new SparseInstance(1, values);
        inst.setDataset(result);

        // copy possible strings, relational values...
        copyValues(inst, false, instances, result);

        result.add(inst);
        i++;
    }

    return result;
}
Example 10
Source File: AlleleLikelihoods.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Returns the index of evidence within a sample evidence-likelihood sub collection.
 * @param sampleIndex the sample index.
 * @param evidence the query evidence.
 * @return -1 if there is no such evidence in that sample, 0 or greater otherwise.
 */
@VisibleForTesting
int evidenceIndex(final int sampleIndex, final EVIDENCE evidence) {
    final Object2IntMap<EVIDENCE> index = evidenceIndexBySampleIndex(sampleIndex);
    return index.getInt(evidence);
}
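Note that the @return contract above (-1 for missing evidence) is not guaranteed by getInt itself; it holds only if the map returned by evidenceIndexBySampleIndex was configured with defaultReturnValue(-1). A hypothetical illustration of that assumed setup, using String in place of the EVIDENCE type parameter (not GATK's actual construction code):

// Hypothetical setup implied by the javadoc above.
Object2IntMap<String> index = new Object2IntOpenHashMap<>();
index.defaultReturnValue(-1);          // absent evidence maps to -1, matching the @return contract
index.put("read-1", 0);
int found = index.getInt("read-1");    // 0
int missing = index.getInt("read-2");  // -1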