cc.mallet.topics.ParallelTopicModel Java Examples

The following examples show how to use cc.mallet.topics.ParallelTopicModel. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: EngineMBTopicsLDA.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Prints one line per topic: the topic number, the topic's alpha value in
 * parentheses, a colon, and then up to {@code numTopWords} " word:score" pairs.
 * The pairs are taken in the iteration order of the per-topic map returned by
 * {@code getTopicWordScores}. All numbers are formatted with four decimals
 * using the US locale.
 *
 * @param tm the topic model whose topics are printed
 * @param pw the stream to print to
 * @param numTopWords the maximum number of words to print for each topic
 */
public void printTopTopicWords(ParallelTopicModel tm, PrintStream pw, int numTopWords) {
  List<Map<String,Double>> scoresByTopic = getTopicWordScores(tm);
  for(int topic=0; topic<tm.numTopics; topic++) {
    Map<String,Double> wordScores = scoresByTopic.get(topic);
    // line header: "<topic>(<alpha>):"
    pw.print(topic);
    pw.print("(");
    pw.print(String.format(java.util.Locale.US,"%.4f", tm.alpha[topic]));
    pw.print("):");
    Iterator<Map.Entry<String,Double>> remaining = wordScores.entrySet().iterator();
    int printed = 0;
    // stop at numTopWords or when the topic has no more words, whichever comes first
    while(printed < numTopWords && remaining.hasNext()) {
      Map.Entry<String,Double> wordScore = remaining.next();
      pw.print(" ");
      pw.print(wordScore.getKey());
      pw.print(":");
      pw.print(String.format(java.util.Locale.US,"%.4f", wordScore.getValue()));
      printed++;
    }
    pw.println();
  }
}
 
Example #2
Source File: TopicModelTrainer.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * For each instance, looks up the model's topic distribution for that instance
 * index, takes the index reported by {@code MaximumIndex}, and updates the
 * stored document (selected by an ObjectId built from the instance name) with
 * that topic number and the string form of the topic's keyword list.
 *
 * @param instances the instances the model was built from
 * @param topicWords source of the keyword list for a topic number
 * @param model the topic model providing per-instance topic probabilities
 */
private void writeTopicAssignmentsToMongo(
    InstanceList instances, TopicWords topicWords, ParallelTopicModel model) {
  final int instanceCount = instances.size();
  for (int index = 0; index < instanceCount; index++) {
    double[] probabilities = model.getTopicProbabilities(index);
    int topicNumber = new MaximumIndex(probabilities).find();
    Instance instance = instances.get(index);
    List<String> keywords = topicWords.forTopic(topicNumber);

    // the instance name carries the id of the document to update
    ObjectId documentId = new ObjectId((String) instance.getName());
    Document topicInfo =
        new Document()
            .append(KEYWORDS_FIELD, keywords.toString())
            .append(TOPIC_NUMBER_FIELD, topicNumber);

    documentsCollection.findOneAndUpdate(
        Filters.eq(documentId), Updates.set(TOPIC_FIELD, topicInfo));
  }
}
 
Example #3
Source File: MalletCalculator.java    From TagRec with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Estimates a single-threaded LDA model (fixed random seed 43) on
 * {@code this.instances} and stores the derived per-document topic list in
 * {@code this.docList}. An exception thrown by the estimation is printed and
 * processing continues with the model as it is.
 *
 * @param topicCreation when {@code true}, {@code this.topicList} is set to
 *        {@code null}; when {@code false}, it is filled from
 *        {@code getMaxTermsByTopics}
 */
public void predictValuesProbs(boolean topicCreation) {
	ParallelTopicModel ldaModel = new ParallelTopicModel(this.numTopics, ALPHA * this.numTopics, BETA); // TODO
	ldaModel.addInstances(this.instances);
	ldaModel.setNumThreads(1);
	ldaModel.setNumIterations(NUM_ITERATIONS);
	ldaModel.setRandomSeed(43);
	try {
		ldaModel.estimate();
	} catch (Exception e) {
		e.printStackTrace();
	}

	this.docList = getMaxTopicsByDocs(ldaModel, this.numTopics);
	System.out.println("Fetched Doc-List");

	if (topicCreation) {
		this.topicList = null;
	} else {
		this.topicList = getMaxTermsByTopics(ldaModel, MAX_TERMS);
	}
	System.out.println("Fetched Topic-List");
}
 
Example #4
Source File: MalletCalculatorTweet.java    From TagRec with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Estimates an LDA model on {@code this.instances} using one thread and the
 * fixed random seed 43, then stores the per-document topic list in
 * {@code this.docList}. If the estimation throws, the stack trace is printed
 * and the method carries on.
 *
 * @param topicCreation if {@code true}, {@code this.topicList} is set to
 *        {@code null}; if {@code false}, it is computed with
 *        {@code getMaxTermsByTopics}
 */
public void predictValuesProbs(boolean topicCreation) {
    ParallelTopicModel topicModel = new ParallelTopicModel(this.numTopics, ALPHA * this.numTopics, BETA); // TODO
    topicModel.addInstances(this.instances);
    topicModel.setNumThreads(1);
    topicModel.setNumIterations(NUM_ITERATIONS);
    topicModel.setRandomSeed(43);
    try {
        topicModel.estimate();
    } catch (Exception e) {
        e.printStackTrace();
    }

    this.docList = getMaxTopicsByDocs(topicModel, this.numTopics);
    System.out.println("Fetched Doc-List");

    if (!topicCreation) {
        this.topicList = getMaxTermsByTopics(topicModel, MAX_TERMS);
    } else {
        this.topicList = null;
    }
    System.out.println("Fetched Topic-List");
}
 
Example #5
Source File: EngineMBTopicsLDA.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Reads the serialized topic model from the model file inside the given
 * directory and stores it in {@code model}.
 *
 * @param directory URL of the directory that contains the model file
 * @param parms not used by this method
 * @throws GateRuntimeException if the file cannot be read or deserialized
 */
@Override
protected void loadModel(URL directory, String parms) {
  URL modelFile = newURL(directory, FILENAME_MODEL);
  // NOTE(review): this is Java-native deserialization; the model file must
  // come from a trusted location.
  try (InputStream is = modelFile.openStream();
       ObjectInputStream ois = new ObjectInputStream(is)) {
    model = (ParallelTopicModel) ois.readObject();
  } catch (IOException | ClassNotFoundException ex) {
    throw new GateRuntimeException("Could not load Mallet model", ex);
  }
}
 
Example #6
Source File: TopicModel.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Initializes the annotator: reads the topic model from {@code modelPath},
 * builds the pipe from the configured stopwords and the model's alphabet, and
 * creates the topic-word lookup for the model.
 *
 * @param aContext the UIMA context passed on to the superclass
 * @throws ResourceInitializationException if the superclass initialization
 *         fails, or if reading the model or building the pipe fails (the
 *         original exception is attached as the cause)
 */
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
  super.doInitialize(aContext);

  try {
    model = ParallelTopicModel.read(new File(modelPath));
    pipe = new TopicModelPipe(stopwordResource.getStopwords(stoplist), model.getAlphabet());
    topicWords = new TopicWords(model);
  } catch (Exception e) {
    // keep the cause so the real failure is visible to whoever handles this
    throw new ResourceInitializationException(e);
  }
}
 
Example #7
Source File: MalletCalculator.java    From TagRec with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Builds, for every document in {@code this.instances}, a map from topic index
 * to probability containing only the topics whose probability exceeds
 * {@code TOPIC_THRESHOLD}. Only the first {@code maxTopicsPerDoc} topic
 * indices are considered. Each per-document map is a {@code TreeMap} ordered
 * by a {@code DoubleMapComparator} (presumably by value; see that class).
 *
 * As a side effect, the selected probabilities are summed per topic over all
 * documents, and the summed entries are copied, in comparator order, into
 * {@code this.mostPopularTopics} until it holds {@code MAX_RECOMMENDATIONS}
 * entries.
 *
 * @param LDA the estimated topic model
 * @param maxTopicsPerDoc upper bound on the topic indices inspected per document
 * @return one topic-to-probability map per document, in document order
 */
private List<Map<Integer, Double>> getMaxTopicsByDocs(ParallelTopicModel LDA, int maxTopicsPerDoc) {
	List<Map<Integer, Double>> perDocTopics = new ArrayList<Map<Integer, Double>>();
	Map<Integer, Double> topicTotals = new LinkedHashMap<Integer, Double>();
	final int docCount = this.instances.size();

	for (int docIndex = 0; docIndex < docCount; ++docIndex) {
		Map<Integer, Double> selected = new LinkedHashMap<Integer, Double>();
		double[] probs = LDA.getTopicProbabilities(docIndex);
		final int limit = Math.min(probs.length, maxTopicsPerDoc);
		for (int topic = 0; topic < limit; topic++) {
			double prob = probs[topic];
			if (!(prob > TOPIC_THRESHOLD)) { // TODO
				continue;
			}
			selected.put(topic, prob);
			// accumulate the corpus-wide total for this topic
			Double runningTotal = topicTotals.get(topic);
			if (runningTotal == null) {
				topicTotals.put(topic, prob);
			} else {
				topicTotals.put(topic, runningTotal.doubleValue() + prob);
			}
		}
		Map<Integer, Double> ranked = new TreeMap<Integer, Double>(new DoubleMapComparator(selected));
		ranked.putAll(selected);
		perDocTopics.add(ranked);
	}

	Map<Integer, Double> rankedTotals = new TreeMap<Integer, Double>(new DoubleMapComparator(topicTotals));
	rankedTotals.putAll(topicTotals);
	for (Map.Entry<Integer, Double> total : rankedTotals.entrySet()) {
		// once the limit is reached nothing further is ever added
		if (this.mostPopularTopics.size() >= MAX_RECOMMENDATIONS) {
			break;
		}
		this.mostPopularTopics.put(total.getKey(), total.getValue());
	}

	return perDocTopics;
}
 
Example #8
Source File: MalletCalculatorTweet.java    From TagRec with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Returns, for each document in {@code this.instances}, the topics whose
 * probability is above {@code TOPIC_THRESHOLD}, as a map from topic index to
 * probability. Only topic indices below {@code maxTopicsPerDoc} are inspected.
 * Each map is a {@code TreeMap} ordered by a {@code DoubleMapComparator}
 * (presumably by value; see that class).
 *
 * The method also sums the selected probabilities per topic across all
 * documents and copies those sums, in comparator order, into
 * {@code this.mostPopularTopics} until it contains {@code MAX_RECOMMENDATIONS}
 * entries.
 *
 * @param LDA the estimated topic model
 * @param maxTopicsPerDoc upper bound on the topic indices inspected per document
 * @return one topic-to-probability map per document, in document order
 */
private List<Map<Integer, Double>> getMaxTopicsByDocs(ParallelTopicModel LDA, int maxTopicsPerDoc){

    List<Map<Integer, Double>> result = new ArrayList<Map<Integer, Double>>();
    Map<Integer, Double> summedByTopic = new LinkedHashMap<Integer, Double>();
    final int documentCount = this.instances.size();

    for (int d = 0; d < documentCount; ++d) {
        Map<Integer, Double> aboveThreshold = new LinkedHashMap<Integer, Double>();
        double[] distribution = LDA.getTopicProbabilities(d);
        final int inspected = Math.min(distribution.length, maxTopicsPerDoc);
        for (int t = 0; t < inspected; t++) {
            double p = distribution[t];
            if (!(p > TOPIC_THRESHOLD)) { // TODO
                continue;
            }
            aboveThreshold.put(t, p);
            // keep a corpus-wide running sum for this topic
            Double sumSoFar = summedByTopic.get(t);
            if (sumSoFar == null) {
                summedByTopic.put(t, p);
            } else {
                summedByTopic.put(t, sumSoFar.doubleValue() + p);
            }
        }
        Map<Integer, Double> ordered = new TreeMap<Integer, Double>(new DoubleMapComparator(aboveThreshold));
        ordered.putAll(aboveThreshold);
        result.add(ordered);
    }

    Map<Integer, Double> orderedSums = new TreeMap<Integer, Double>(new DoubleMapComparator(summedByTopic));
    orderedSums.putAll(summedByTopic);
    for (Map.Entry<Integer, Double> sumEntry : orderedSums.entrySet()) {
        // nothing more can be added once the limit has been reached
        if (this.mostPopularTopics.size() >= MAX_RECOMMENDATIONS) {
            break;
        }
        this.mostPopularTopics.put(sumEntry.getKey(), sumEntry.getValue());
    }

    return result;
}
 
Example #9
Source File: LDA.java    From topic-detection with Apache License 2.0 5 votes vote down vote up
/**
 * Builds and estimates an LDA model for the given document corpus.
 *
 * @param texts the documents, one string per document
 * @param numTopics the number of topics the model should have
 * @param numIterations the number of LDA iterations to run
 * @return the estimated LDA topic model
 * @throws IOException if building the instance list or estimating the model fails with an I/O error
 */
private ParallelTopicModel createLDAModel(List<String> texts, int numTopics, int numIterations) throws IOException
{
	InstanceList corpus = createInstanceList(texts);

	ParallelTopicModel ldaModel = new ParallelTopicModel(numTopics);
	ldaModel.addInstances(corpus);
	ldaModel.setNumIterations(numIterations);
	ldaModel.estimate();

	return ldaModel;
}
 
Example #10
Source File: EngineMBTopicsLDA.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Returns the topic model held by this engine.
 *
 * @return the value of {@code tm}
 */
public ParallelTopicModel getTopicModel() {
  return tm;
}
 
Example #11
Source File: EngineMBTopicsLDA.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Computes, for every topic, a score for every word type in the model's
 * alphabet. A word's raw weight is {@code tm.beta} plus the word's count for
 * that topic (if any); the weights of one topic are then divided by their sum.
 *
 * @param tm the topic model to read counts, beta and the alphabet from
 * @return one map per topic, in topic order; each map goes from word to score
 *         and iterates in order of descending score
 */
public List<Map<String,Double>>  getTopicWordScores(ParallelTopicModel tm) {
  List<Map<String,Double>> perTopicWord2Score = new ArrayList<>(tm.numTopics);
  // modified from Mallet ParallelTopicModel code
  for(int topicnr=0; topicnr<tm.numTopics; topicnr++) {
    Map<String,Double> word2Score = new HashMap<>();
    for(int type=0; type<tm.numTypes; type++) {
      int[] topicCounts=tm.typeTopicCounts[type];
      double weight = tm.beta;
      int index = 0;
      while (index < topicCounts.length && topicCounts[index] > 0) {
        // Mallet really stores both the count and the topic number in the
        // topicCounts variable: the actual counts are in the highest bits
        // while the topic number is in as many lowest bits as necessary.
        // The topic mask is ones for those lowest bits.
        int currentTopic = topicCounts[index] & tm.topicMask;
        if(currentTopic == topicnr) {
          weight += topicCounts[index] >> tm.topicBits;  // get the actual count
          break;
        }
        index++;
      }
      word2Score.put((String)tm.alphabet.lookupObject(type), weight);
    }

    // remap values to 0..1; the sum is expected to be > 0, the guard only
    // avoids producing NaN scores in the degenerate all-zero case
    double sum = word2Score.values().stream().mapToDouble(Double::doubleValue).sum();
    if(sum > 0) {
      // replaceAll updates the values in place; calling put() on the map from
      // inside a stream over its own entry set would break the stream
      // non-interference requirement
      word2Score.replaceAll((word, score) -> score / sum);
    }

    // sort by descending score and keep that order in a linked hash map
    Map<String,Double> sortedMap = new LinkedHashMap<>();
    word2Score.entrySet().stream().
            sorted(Map.Entry.<String, Double>comparingByValue().reversed()).
            forEachOrdered(x -> sortedMap.put(x.getKey(), x.getValue()));
    perTopicWord2Score.add(sortedMap);
  }
  return perTopicWord2Score;
}
 
Example #12
Source File: EngineMBTopicsLDA.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Applies the loaded topic model to every instance annotation, in document
 * order. For each annotation a sampled topic distribution is obtained from
 * the model's inferencer and three features are stored on the annotation:
 * {@code <prefix>TopicDist} (the distribution as a list),
 * {@code <prefix>BestTopic} (index of the largest value) and
 * {@code <prefix>BestTopicProb} (that largest value).
 *
 * @param instanceAS the instance annotations to process
 * @param tokenAS the token annotations used to build each instance
 * @param tokenFeature the token feature used to build each instance
 * @param featurePrefix prefix for the feature names; {@code null} is treated as empty
 * @param parms parameter string; recognizes iters (default 10), burnin
 *        (default 10), thinning (default 0) and seed (default 0)
 */
public void applyTopicModel(AnnotationSet instanceAS, AnnotationSet tokenAS,
        String tokenFeature, String featurePrefix, String parms) {
  CorpusRepresentationMalletLDA data = (CorpusRepresentationMalletLDA)corpusRepresentation;
  data.stopGrowth();

  Parms parmdef = new Parms(parms,
              "i:iters:i",
              "B:burnin:i",
              "T:thinning:i",
              "s:seed:i"
  );
  int numIterations = (int) parmdef.getValueOrElse("iters", 10);
  int burnIn = (int) parmdef.getValueOrElse("burnin", 10);
  int thinning = (int) parmdef.getValueOrElse("thinning", 0);
  int seed = (int) parmdef.getValueOrElse("seed", 0);

  ParallelTopicModel tm = (ParallelTopicModel)model;
  TopicInferencer ti = tm.getInferencer();
  // NOTE(review): the seed is set on the model after the inferencer has been
  // obtained - confirm that the inferencer actually picks it up.
  tm.setRandomSeed(seed);

  final String prefix = (featurePrefix == null) ? "" : featurePrefix;

  for(Annotation instAnn : instanceAS.inDocumentOrder()) {
    Instance inst = data.getInstanceFor(gate.Utils.start(instAnn), gate.Utils.end(instAnn), tokenAS, tokenFeature);
    // NOTE: see http://mallet.cs.umass.edu/api/cc/mallet/topics/TopicInferencer.html#getSampledDistribution(cc.mallet.types.Instance,%20int,%20int,%20int)
    double[] tdist = ti.getSampledDistribution(inst, numIterations, thinning, burnIn);

    // copy the distribution into a list and track the largest entry
    List<Double> distribution = new ArrayList<>(tdist.length);
    int bestTopic = -1;
    double bestProb = -999.99;
    for(int topic=0; topic<tdist.length; topic++) {
      double prob = tdist[topic];
      distribution.add(prob);
      if(prob > bestProb) {
        bestTopic = topic;
        bestProb = prob;
      }
    }

    instAnn.getFeatures().put(prefix+"TopicDist", distribution);
    // Also add features for the index and probability of the most likely topic
    instAnn.getFeatures().put(prefix+"BestTopic", bestTopic);
    instAnn.getFeatures().put(prefix+"BestTopicProb", bestProb);
    // TODO: to add the topic words we have to pre-calculate the top k words for each topic
    // and assign the list for topic k here!
  }
}
 
Example #13
Source File: LF_TrainTopicModel.java    From gateplugin-LearningFramework with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Called when the controller has finished. Trains and saves the engine and,
 * if requested, writes topic features back onto the training corpus.
 *
 * If {@code t} is non-null, a message is printed and nothing is trained. If
 * no documents were seen, a {@code GateRuntimeException} is thrown. All the
 * remaining work is done only by the instance whose duplicate id is 0.
 *
 * @param arg0 the controller that finished (not used here)
 * @param t the exception that occurred during processing, or {@code null}
 */
@Override
public void controllerFinished(Controller arg0, Throwable t) {
  // processing failed: report and skip training entirely
  if(t!=null) {
    System.err.println("An exception occurred during processing of documents, no training will be done");
    System.err.println("Exception was "+t.getClass()+": "+t.getMessage());
    return;
  }
  if(getSeenDocuments().get()==0) {
    throw new GateRuntimeException("No documents seen, cannot train");
  }
  // only the instance with duplicate id 0 performs the training
  if (getDuplicateId() == 0) {
    System.out.println("LearningFramework: Starting training engine " + engine);
    if (corpusRepresentation instanceof CorpusRepresentationMallet) {
      CorpusRepresentationMallet crm = (CorpusRepresentationMallet) corpusRepresentation;
      System.out.println("Training set size: " + crm.getRepresentationMallet().size());
      // print only the attribute count when there are more than 20 attributes,
      // otherwise print the attributes themselves on one line
      if (crm.getRepresentationMallet().getDataAlphabet().size() > 20) {
        System.out.println("LearningFramework: Attributes " + crm.getRepresentationMallet().getDataAlphabet().size());
      } else {
        System.out.println("LearningFramework: Attributes " + crm.getRepresentationMallet().getDataAlphabet().toString().replaceAll("\\n", " "));
      }
    }

    engine.getInfo().nrTrainingInstances = corpusRepresentation.nrInstances();

    // Store some additional information in the info datastructure which will be saved with the model
    engine.getInfo().nrTrainingDocuments = getSeenDocuments().get();
    engine.getInfo().targetFeature = null;
    engine.getInfo().trainingCorpusName = corpus.getName();

    // train first, then persist the trained engine
    engine.trainModel(gate.util.Files.fileFromURL(dataDirectory),
            getInstanceType(),
            getAlgorithmParameters());
    engine.saveEngine(dataDirFile);
    
    // Now, if apply to training set was specified, AND we used the Mallet algorithm AND we only have one duplicate
    // AND the number of documents we just processed agrees with the number of documents in the corpus (i.e. we do not
    // have a GCP-like process-one-document-at-a-time situation)
    // calculate the topic distribution for each of the documents.
    // CAUTION: this assumes that the order of documents in the corpus still agrees with the 
    // order of documents in the mallet model.
    if (trainingAlgorithm == AlgorithmClustering.MalletLDA_CLUS_MR && getApplyAfterTraining()) {
      EngineMBTopicsLDA engine_mbt = (EngineMBTopicsLDA)engine;
      ParallelTopicModel tm = engine_mbt.getTopicModel();
      if(nDuplicates.get() == 1 && corpus.size() == getSeenDocuments().get()) {
        System.out.println("INFO: re-processing corpus for application...");
        // List<TopicAssignment> tass = tm.getData();
        int n = 0; // this is the running index of the instances as seen by Mallet
        for(int docNr=0; docNr < corpus.size(); docNr++) {
          // remember whether the document was already loaded so that it can be
          // unloaded again afterwards if this loop is what loaded it
          boolean documentWasLoaded = corpus.isDocumentLoaded(docNr);
          Document doc = corpus.get(docNr);
          AnnotationSet inputAS = doc.getAnnotations(getInputASName());            
          AnnotationSet instanceAS;
          if (getInstanceType() != null && !getInstanceType().isEmpty()) {
            instanceAS = inputAS.get(getInstanceType());
          } else {     
            // if the instance annotation type has not been specified, we put a Document annotation
            // into the inputAS, unless we already have one or more
            instanceAS = inputAS.get("Document");
            if (instanceAS.isEmpty()) {
              gate.Utils.addAnn(inputAS, 0, doc.getContent().size(), "Document", Factory.newFeatureMap());
              instanceAS = inputAS.get("Document");
            }
          }
          for (Annotation instAnn : instanceAS) {
            // topic distribution of the n-th instance in the trained model
            double[] tdist = tm.getTopicProbabilities(n);
            List<Double> tdistlist = new ArrayList<>(tdist.length);
            int i = 0;
            int bestTopic = -1;
            double bestProb = -999.99;
            // copy the distribution into a list and track the largest entry
            for (double val : tdist) {
              tdistlist.add(val);
              if (val > bestProb) {
                bestTopic = i;
                bestProb = val;
              }
              i++;
            }
            String pref = getFeaturePrefix();
            if(pref==null) {
              pref = "";
            }
            instAnn.getFeatures().put(pref+"TopicDist", tdistlist);
            // Also add features for the index and probability of the most likely topic
            instAnn.getFeatures().put(pref+"BestTopic", bestTopic);
            instAnn.getFeatures().put(pref+"BestTopicProb", bestProb);              
            n++;
          }
          // release documents that were loaded only for this pass
          if(!documentWasLoaded) {
            corpus.unloadDocument(doc);
            Factory.deleteResource(doc);
          }
        }
        System.out.println("INFO: re-processing corpus for application finished.");
      } else {
        System.err.println("ERROR: cannot apply after training, either more than one duplicate or corpus size mismatch");
      }
      
    }
  }
}
 
Example #14
Source File: LDAModelEstimator.java    From RankSys with Mozilla Public License 2.0 3 votes vote down vote up
/**
 * Builds and estimates an LDA topic model from collaborative filtering data.
 * The model is created with {@code k} topics, an alpha sum of
 * {@code alpha * k} and the given beta, and is estimated using as many
 * threads as there are available processors.
 *
 * @param <U> user type
 * @param <I> item type
 * @param preferences preference data the instances are built from
 * @param k number of topics
 * @param alpha per-topic alpha (multiplied by {@code k} before being passed to the model)
 * @param beta beta in model
 * @param numIterations number of iterations
 * @param burninPeriod burnin period
 * @return the estimated topic model
 * @throws IOException when internal IO error occurs
 */
public static <U, I> ParallelTopicModel estimate(FastPreferenceData<U, I> preferences, int k, double alpha, double beta, int numIterations, int burninPeriod) throws IOException {

    final ParallelTopicModel lda = new ParallelTopicModel(k, alpha * k, beta);
    lda.addInstances(new LDAInstanceList<>(preferences));

    // display interval is one more than the iteration count, presumably so the
    // periodic topic display never triggers during this run
    lda.setTopicDisplay(numIterations + 1, 0);
    lda.setNumIterations(numIterations);
    lda.setBurninPeriod(burninPeriod);
    lda.setNumThreads(Runtime.getRuntime().availableProcessors());

    lda.estimate();
    return lda;
}
 
Example #15
Source File: LDARecommender.java    From RankSys with Mozilla Public License 2.0 2 votes vote down vote up
/**
 * Creates a recommender backed by the given LDA topic model.
 *
 * @param uIndex user index, passed to the superclass
 * @param iIndex item index, passed to the superclass
 * @param topicModel LDA topic model stored in this recommender
 */
public LDARecommender(FastUserIndex<U> uIndex, FastItemIndex<I> iIndex, ParallelTopicModel topicModel) {
    super(uIndex, iIndex);
    this.topicModel = topicModel;
}
 
Example #16
Source File: TopicWords.java    From baleen with Apache License 2.0 2 votes vote down vote up
/**
 * Creates a topic word factory for the given model.
 *
 * @param model the topic model stored in this instance
 */
public TopicWords(ParallelTopicModel model) {
  this.model = model;
}