org.carrot2.core.Document Java Examples

The following examples show how to use org.carrot2.core.Document. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ConsoleFormatter.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Prints a complete processing result to the console: documents first, then
 * clusters, then the result's attributes.
 *
 * @param processingResult the result to display
 */
public static void displayResults(ProcessingResult processingResult)
{
    // Documents and clusters are only printed when the result actually carries them.
    final Collection<Document> docs = processingResult.getDocuments();
    final Collection<Cluster> groups = processingResult.getClusters();
    final Map<String, Object> attrs = processingResult.getAttributes();

    if (docs != null)
    {
        displayDocuments(docs);
    }
    if (groups != null)
    {
        displayClusters(groups);
    }

    // Attributes are always handed over for display.
    displayAttributes(attrs);
}
 
Example #2
Source File: CommitsMessageTopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Replaces the stored commit-message topics with one entry per given cluster.
 *
 * @param commitsMessagesTopics clusters to store, one topic each
 * @param vcsRepositoryDelta    delta whose repository URL is recorded on every topic
 * @param db                    metric database that is emptied, refilled and synced
 */
private void storeCommitsMessagesTopics(List<Cluster> commitsMessagesTopics, VcsRepositoryDelta vcsRepositoryDelta, CommitsMessageTopicsTransMetric db) {
	// Empty the topics collection before writing the new topics.
	db.getCommitsTopics().getDbCollection().drop();
	for (Cluster topicCluster : commitsMessagesTopics) {
		CommitsTopic topic = new CommitsTopic();
		db.getCommitsTopics().add(topic);
		topic.setRepository(vcsRepositoryDelta.getRepository().getUrl());
		topic.getLabels().addAll(topicCluster.getPhrases());
		// NOTE(review): the count is taken from getAllDocuments() while the ids below
		// come from getDocuments() -- confirm this difference is intended.
		topic.setNumberOfMessages(topicCluster.getAllDocuments().size());
		for (Document member : topicCluster.getDocuments()) {
			// The string id is tab-separated; its second field is stored as the message id.
			String[] idParts = member.getStringId().split("\t");
			topic.getCommitsMessageId().add(idParts[1]);
		}
	}
	db.sync();
}
 
Example #3
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Replaces the stored bug-tracker topics with one entry per given cluster.
 *
 * @param bugTrackerTopics clusters to store, one topic each
 * @param btspDelta        delta whose bug tracking system id is recorded on every topic
 * @param db               metric database that is emptied, refilled and synced
 */
private void storeBugTrackerTopics(List<Cluster> bugTrackerTopics, BugTrackingSystemDelta btspDelta,
		TopicsTransMetric db) {
	// Empty the topics collection before writing the new topics.
	db.getBugTrackerTopics().getDbCollection().drop();
	for (Cluster topicCluster : bugTrackerTopics) {
		BugTrackerTopic topic = new BugTrackerTopic();
		db.getBugTrackerTopics().add(topic);
		topic.setBugTrackerId(btspDelta.getBugTrackingSystem().getOSSMeterId());
		topic.getLabels().addAll(topicCluster.getPhrases());
		topic.setNumberOfDocuments(topicCluster.getAllDocuments().size());
		for (Document member : topicCluster.getAllDocuments()) {
			// The string id is tab-separated; fields two and three, re-joined by a tab,
			// form the stored comment id.
			String[] idParts = member.getStringId().split("\t");
			topic.getCommentsId().add(idParts[1] + "\t" + idParts[2]);
		}
	}
	db.sync();
}
 
Example #4
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Replaces the stored newsgroup topics with one entry per given cluster.
 *
 * @param newsgroupTopics clusters to store, one topic each
 * @param ccpDelta        delta whose communication channel id is recorded on every topic
 * @param db              metric database that is emptied, refilled and synced
 */
private void storeNewsgroupTopics(List<Cluster> newsgroupTopics, CommunicationChannelDelta ccpDelta,
		TopicsTransMetric db) {
	// Empty the topics collection before writing the new topics.
	db.getNewsgroupTopics().getDbCollection().drop();
	for (Cluster topicCluster : newsgroupTopics) {
		NewsgroupTopic topic = new NewsgroupTopic();
		db.getNewsgroupTopics().add(topic);
		CommunicationChannel channel = ccpDelta.getCommunicationChannel();
		topic.setNewsgroupName(channel.getOSSMeterId());
		topic.getLabels().addAll(topicCluster.getPhrases());
		topic.setNumberOfDocuments(topicCluster.getAllDocuments().size());
		for (Document member : topicCluster.getAllDocuments()) {
			// The string id is tab-separated; its second field is parsed as the numeric article id.
			String[] idParts = member.getStringId().split("\t");
			topic.getArticlesId().add(Long.valueOf(idParts[1]));
		}
	}
	db.sync();
}
 
Example #5
Source File: SavingResultsToJson.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Clusters the bundled "data mining" sample documents with Lingo and writes the
 * result to standard output as JSON, first plainly and then wrapped in a
 * JSON-P style callback.
 */
public static void main(String [] args) throws Exception
{
    // Cluster the sample documents shipped in SampleDocumentData
    final Controller pipeline = ControllerFactory.createSimple();
    final Map<String, Object> attrs = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attrs)
        .documents(new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    final ProcessingResult clustered = pipeline.process(attrs,
        LingoClusteringAlgorithm.class);

    // Serialize the entire result to JSON
    clustered.serializeJson(new PrintWriter(System.out));
    System.out.println();

    // Same again, this time with a callback name for JSON-P-style calls
    final boolean indent = true;
    final boolean saveDocuments = false;
    final boolean saveClusters = true;
    clustered.serializeJson(new PrintWriter(System.out), "loadResults",
        indent, saveDocuments, saveClusters);

}
 
Example #6
Source File: SavingResultsToXml.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Clusters the bundled "data mining" sample documents with Lingo and writes the
 * result to standard output as XML, first completely and then without documents.
 */
public static void main(String [] args) throws Exception
{
    // Cluster the sample documents shipped in SampleDocumentData
    final Controller pipeline = ControllerFactory.createSimple();
    final Map<String, Object> attrs = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attrs)
        .documents(new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    final ProcessingResult clustered = pipeline.process(attrs,
        LingoClusteringAlgorithm.class);

    // Serialize the entire result to XML
    clustered.serialize(System.out);
    System.out.println();

    // Serialize again, choosing explicitly what goes into the output
    final boolean saveDocuments = false;
    final boolean saveClusters = true;
    clustered.serialize(System.out, saveDocuments, saveClusters);
}
 
Example #7
Source File: EchoClusteringAlgorithm.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates one cluster per input document. Each cluster's phrases are the
 * document's title and summary, its language name when set, and the string form
 * of every non-null custom field listed (comma-separated) in {@code customFields}.
 */
@Override
public void process() throws ProcessingException {
  clusters = new ArrayList<>();

  for (Document doc : documents) {
    final Cluster echo = new Cluster();
    echo.addPhrases(doc.getTitle(), doc.getSummary());
    if (doc.getLanguage() != null) {
      echo.addPhrases(doc.getLanguage().name());
    }
    for (String fieldName : customFields.split(",")) {
      final Object fieldValue = doc.getField(fieldName);
      if (fieldValue == null) {
        // Fields the document does not carry contribute no phrase.
        continue;
      }
      echo.addPhrases(fieldValue.toString());
    }
    echo.addDocuments(doc);
    clusters.add(echo);
  }
}
 
Example #8
Source File: ConsoleFormatter.java    From DistributedCrawler with Apache License 2.0 6 votes vote down vote up
/**
 * Prints a complete overview of a processing result to the console:
 * documents, clusters and attributes.
 * @author GS
 * @param processingResult the result to display
 */
public static void displayResults(ProcessingResult processingResult)
{
    final Collection<Document> allDocuments = processingResult.getDocuments();
    final Collection<Cluster> allClusters = processingResult.getClusters();
    final Map<String, Object> parameters = processingResult.getAttributes();

    // Print every document, if the result has any
    if (allDocuments != null)
    {
        displayDocuments(allDocuments);
    }

    // Print every cluster, if the result has any
    if (allClusters != null)
    {
        displayClusters(allClusters);
    }

    // Print the attributes
    displayAttributes(parameters);
}
 
Example #9
Source File: CustomConsoleFormatter.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Writes the documents, the clusters and the attributes of a processing result
 * to the console, in that order.
 *
 * @param processingResult the result to display
 */
public static void displayResults(ProcessingResult processingResult)
{
    final Collection<Document> resultDocuments = processingResult.getDocuments();
    final Collection<Cluster> resultClusters = processingResult.getClusters();
    final Map<String, Object> resultAttributes = processingResult.getAttributes();

    // Documents are skipped when the result has none attached
    if (resultDocuments != null)
    {
        displayDocuments(resultDocuments);
    }

    // Clusters are skipped when the result has none attached
    if (resultClusters != null)
    {
        displayClusters(resultClusters);
    }

    // Attributes are displayed unconditionally
    displayAttributes(resultAttributes);
}
 
Example #10
Source File: Cluster.java    From DistributedCrawler with Apache License 2.0 6 votes vote down vote up
/**
 * Clusters all pages read from the given JSON file with the Lingo algorithm
 * and prints the result to the console.
 * 
 * @author GS
 * @param docPath
 *            path of the file the hits are read from
 * @return the clustering result
 * @throws IOException
 * @throws Exception
 */
public ProcessingResult cluster(String docPath) throws IOException,
		Exception {
	@SuppressWarnings("unchecked")
	final Controller controller = ControllerFactory
			.createCachingPooling(IDocumentSource.class);
	final List<Document> documents = Lists.newArrayList();
	final JsonReader jr = new JsonReader(new File(docPath));
	try {
		// One Carrot2 document (title + content) per hit in the file.
		while (jr.hasNext()) {
			Hit h = jr.next();
			documents.add(new Document(h.getPagePOJO().getTitle(), h
					.getPagePOJO().getContent()));
		}
	} finally {
		// Release the reader even when reading a hit fails.
		jr.close();
	}
	final Map<String, Object> attributes = Maps.newHashMap();
	CommonAttributesDescriptor.attributeBuilder(attributes).documents(
			documents);
	final ProcessingResult englishResult = controller.process(attributes,
			LingoClusteringAlgorithm.class);
	ConsoleFormatter.displayResults(englishResult);// display on the console
	return englishResult;
}
 
Example #11
Source File: ConsoleFormatter.java    From DistributedCrawler with Apache License 2.0 5 votes vote down vote up
/**
 * Prints the number of documents in the collection, followed by every
 * document at indent level 0.
 * @author GS
 * @param documents the documents to print
 */
public static void displayDocuments(final Collection<Document> documents)
{
    // Header line with the total number of documents
    final int total = documents.size();
    System.out.println("Collected " + total + " documents\n");

    for (final Document doc : documents)
    {
        // Each document is printed by displayDocument
        displayDocument(0, doc);
    }
}
 
Example #12
Source File: ConsoleFormatter.java    From DistributedCrawler with Apache License 2.0 5 votes vote down vote up
/**
 * Prints a single document: its id and title on one line, then its content
 * URL on a second line when the URL is not blank, then an empty line.
 * @author GS
 * @param level indentation level passed to getIndent
 * @param document the document to print
 */
private static void displayDocument(final int level, Document document)
{
    final String pad = getIndent(level);

    // Document id followed by the title on the same line
    System.out.printf(pad + "[%2s] ", document.getStringId());
    final Object title = document.getField(Document.TITLE);
    System.out.println(title);

    // The content URL is optional and only printed when present
    final String contentUrl = document.getField(Document.CONTENT_URL);
    if (StringUtils.isNotBlank(contentUrl))
    {
        System.out.println(pad + "     " + contentUrl);
    }
    System.out.println();
}
 
Example #13
Source File: ConsoleFormatter.java    From DistributedCrawler with Apache License 2.0 5 votes vote down vote up
/**
 * Prints one cluster: its label and details, up to
 * {@code maxNumberOfDocumentsToShow} of its documents, and then all of its
 * subclusters recursively, one level deeper.
 * @author GS
 * @param level nesting depth of this cluster; controls the indentation
 * @param tag hierarchical number of this cluster; it is not printed here, only
 *        extended with ".n" and passed on to the n-th subcluster
 * @param cluster the cluster to print
 * @param maxNumberOfDocumentsToShow upper bound on documents printed per cluster
 * @param clusterDetailsFormatter produces the details text printed after the label
 */
private static void displayCluster(final int level, String tag, Cluster cluster,
    int maxNumberOfDocumentsToShow, ClusterDetailsFormatter clusterDetailsFormatter)
{
    final String label = cluster.getLabel();// label of the current cluster

    // indent up to level and display this cluster's description phrase
    for (int i = 0; i < level; i++)
    {
        System.out.print("  ");
    }
    System.out.println(label + "  "
        + clusterDetailsFormatter.formatClusterDetails(cluster));

    // if this cluster has documents, display at most maxNumberOfDocumentsToShow of them.
    int documentsShown = 0;
    for (final Document document : cluster.getDocuments())
    {
        if (documentsShown >= maxNumberOfDocumentsToShow)// stop once the limit is reached
        {
            break;
        }
        displayDocument(level + 1, document);// documents sit one level below their cluster
        documentsShown++;// number of documents printed for this cluster so far
    }
    if (maxNumberOfDocumentsToShow > 0
        && (cluster.getDocuments().size() > documentsShown))
    {
        System.out.println(getIndent(level + 1) + "... and "
            + (cluster.getDocuments().size() - documentsShown) + " more\n");
    }

    // finally, if this cluster has subclusters, descend into recursion,
    // numbering the subclusters 1, 2, 3, ... within their parent's tag.
    int num = 1;
    for (final Cluster subcluster : cluster.getSubclusters())
    {
        displayCluster(level + 1, tag + "." + num, subcluster,
            maxNumberOfDocumentsToShow, clusterDetailsFormatter);
        num++;
    }
}
 
Example #14
Source File: Cluster.java    From DistributedCrawler with Apache License 2.0 5 votes vote down vote up
/**
 * Clusters the given pages with the Lingo algorithm, treating every page as
 * Simplified Chinese, prints the result to the console and records the titles
 * of each cluster's documents under the cluster label.
 * 
 * @author GS
 * @param list
 *            PagePOJO List
 * @return the {@code result} map, from cluster label to the titles of all
 *         documents in that cluster
 * @throws IOException
 * @throws Exception
 */
public Map<String,List<String>> cluster(List<PagePOJO> list) throws IOException,
		Exception {
	@SuppressWarnings("unchecked")
	final Controller controller = ControllerFactory
			.createCachingPooling(IDocumentSource.class);

	// One Carrot2 document (title + content) per page.
	final List<Document> documents = Lists.newArrayList();
	for (PagePOJO page : list) {
		documents.add(new Document(page.getTitle(), page.getContent(),LanguageCode.CHINESE_SIMPLIFIED));
	}

	final Map<String, Object> attributes = Maps.newHashMap();
	CommonAttributesDescriptor.attributeBuilder(attributes).documents(
			documents);
	final ProcessingResult clustered = controller.process(attributes,
			LingoClusteringAlgorithm.class);
	ConsoleFormatter.displayResults(clustered);// display on the console

	// NOTE(review): "result" is not declared in this method -- presumably a field
	// of the enclosing class; confirm where it is created and whether it is reset.
	for (org.carrot2.core.Cluster group : clustered.getClusters()) {
		LinkedList<String> titles = new LinkedList<String>();
		for (Document member : group.getAllDocuments()) {
			titles.add(member.getField(Document.TITLE).toString());
		}
		result.put(group.getLabel(), titles);
	}
	return result;
}
 
Example #15
Source File: CarrotClusteringEngine.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Runs Carrot2 clustering on the given Solr documents and returns the clusters
 * converted by {@code clustersToNamedList}.
 *
 * @throws SolrException with {@code SERVER_ERROR} when anything in the
 *         clustering call fails; the original exception is kept as the cause
 */
@Override
public Object cluster(Query query, SolrDocumentList solrDocList,
    Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
  try {
    // Attributes for the Carrot2 call: documents, query, and the fields
    // on which clustering runs.
    Map<String, Object> carrotAttrs = new HashMap<>();
    carrotAttrs.put(AttributeNames.DOCUMENTS,
        getDocuments(solrDocList, docIds, query, sreq));
    carrotAttrs.put(AttributeNames.QUERY, query.toString());
    carrotAttrs.put("solrFieldNames", getFieldsForClustering(sreq));

    // Overriding attributes supplied with the request, if any
    extractCarrotAttributes(sreq.getParams(), carrotAttrs);

    // Carrot2 uses the current thread's context class loader to get
    // certain classes (e.g. custom tokenizer/stemmer) at runtime.
    // To make sure classes from contrib JARs are available, the context
    // class loader is swapped for the duration of the clustering call.
    return withContextClassLoader(core.getResourceLoader().getClassLoader(),
        () -> clustersToNamedList(
            controller.process(carrotAttrs, clusteringAlgorithmClass).getClusters(),
            sreq.getParams()));
  } catch (Exception e) {
    log.error("Carrot2 clustering failed", e);
    throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
  }
}
 
Example #16
Source File: ConsoleFormatter.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Prints how many documents were collected, then each document at indent level 0.
 *
 * @param documents the documents to print
 */
public static void displayDocuments(final Collection<Document> documents)
{
    final int count = documents.size();
    System.out.println("Collected " + count + " documents\n");

    for (final Document each : documents)
    {
        displayDocument(0, each);
    }
}
 
Example #17
Source File: CommitsMessageTopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Clusters the given documents by topic with the Lingo algorithm.
 *
 * @param documents the documents to cluster
 * @return the clusters produced by Lingo
 */
private List<Cluster> produceTopics(ArrayList<Document> documents) {
	/* A controller to manage the processing pipeline. */
	final Controller pipeline = ControllerFactory.createSimple();

	/*
	 * Perform clustering by topic using the Lingo algorithm. No query is
	 * supplied here, so null is passed in its place.
	 */
	final ProcessingResult lingoResult = pipeline.process(documents, null,
			LingoClusteringAlgorithm.class);
	return lingoResult.getClusters();
}
 
Example #18
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Runs Lingo topic clustering over the given documents.
 *
 * @param documents the documents to cluster
 * @return the resulting topic clusters
 */
private List<Cluster> produceTopics(ArrayList<Document> documents) {
	/* The controller manages the processing pipeline. */
	final Controller simpleController = ControllerFactory.createSimple();

	/*
	 * Cluster by topic with Lingo. The query argument is null because no
	 * query is passed along with the documents.
	 */
	final ProcessingResult topicResult = simpleController.process(documents, null,
			LingoClusteringAlgorithm.class);
	return topicResult.getClusters();
}
 
Example #19
Source File: CustomConsoleFormatter.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Prints one cluster: its label and details, up to
 * {@code maxNumberOfDocumentsToShow} of its documents, and then all of its
 * subclusters recursively, one level deeper.
 *
 * @param level nesting depth of this cluster; controls the indentation
 * @param tag hierarchical number of this cluster; it is not printed here, only
 *        extended with ".n" and passed on to the n-th subcluster
 * @param cluster the cluster to print
 * @param maxNumberOfDocumentsToShow upper bound on documents printed per cluster
 * @param clusterDetailsFormatter produces the details text printed after the label
 */
private static void displayCluster(final int level, String tag, Cluster cluster,
    int maxNumberOfDocumentsToShow, ClusterDetailsFormatter clusterDetailsFormatter)
{
    final String label = cluster.getLabel();

    // indent up to level and display this cluster's description phrase
    for (int i = 0; i < level; i++)
    {
        System.out.print("  ");
    }
    System.out.println(label + "  "
        + clusterDetailsFormatter.formatClusterDetails(cluster));

    // if this cluster has documents, display at most maxNumberOfDocumentsToShow of them.
    int documentsShown = 0;
    for (final Document document : cluster.getDocuments())
    {
        if (documentsShown >= maxNumberOfDocumentsToShow)
        {
            break;
        }
        displayDocument(level + 1, document);
        documentsShown++;
    }
    if (maxNumberOfDocumentsToShow > 0
        && (cluster.getDocuments().size() > documentsShown))
    {
        System.out.println(getIndent(level + 1) + "... and "
            + (cluster.getDocuments().size() - documentsShown) + " more\n");
    }

    // finally, if this cluster has subclusters, descend into recursion,
    // numbering the subclusters 1, 2, 3, ... within their parent's tag.
    int num = 1;
    for (final Cluster subcluster : cluster.getSubclusters())
    {
        displayCluster(level + 1, tag + "." + num, subcluster,
            maxNumberOfDocumentsToShow, clusterDetailsFormatter);
        num++;
    }
}
 
Example #20
Source File: CustomConsoleFormatter.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Prints a single document: id and title on one line, the content URL on the
 * next line when it is not blank, and a trailing empty line.
 *
 * @param level indentation level passed to getIndent
 * @param document the document to print
 */
private static void displayDocument(final int level, Document document)
{
    final String pad = getIndent(level);

    // Id and title share the first line
    System.out.printf(pad + "[%2s] ", document.getStringId());
    final Object title = document.getField(Document.TITLE);
    System.out.println(title);

    // The URL line is optional
    final String contentUrl = document.getField(Document.CONTENT_URL);
    if (StringUtils.isNotBlank(contentUrl))
    {
        System.out.println(pad + "     " + contentUrl);
    }
    System.out.println();
}
 
Example #21
Source File: CustomConsoleFormatter.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Prints a header with the number of documents, then every document at
 * indent level 0.
 *
 * @param documents the documents to print
 */
public static void displayDocuments(final Collection<Document> documents)
{
    final int howMany = documents.size();
    System.out.println("Collected " + howMany + " documents\n");

    for (final Document current : documents)
    {
        displayDocument(0, current);
    }
}
 
Example #22
Source File: ConsoleFormatter.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Prints one cluster: its label and details, up to
 * {@code maxNumberOfDocumentsToShow} of its documents, and then all of its
 * subclusters recursively, one level deeper.
 *
 * @param level nesting depth of this cluster; controls the indentation
 * @param tag hierarchical number of this cluster; it is not printed here, only
 *        extended with ".n" and passed on to the n-th subcluster
 * @param cluster the cluster to print
 * @param maxNumberOfDocumentsToShow upper bound on documents printed per cluster
 * @param clusterDetailsFormatter produces the details text printed after the label
 */
private static void displayCluster(final int level, String tag, Cluster cluster,
    int maxNumberOfDocumentsToShow, ClusterDetailsFormatter clusterDetailsFormatter)
{
    final String label = cluster.getLabel();

    // indent up to level and display this cluster's description phrase
    for (int i = 0; i < level; i++)
    {
        System.out.print("  ");
    }
    System.out.println(label + "  "
        + clusterDetailsFormatter.formatClusterDetails(cluster));

    // if this cluster has documents, display at most maxNumberOfDocumentsToShow of them.
    int documentsShown = 0;
    for (final Document document : cluster.getDocuments())
    {
        if (documentsShown >= maxNumberOfDocumentsToShow)
        {
            break;
        }
        displayDocument(level + 1, document);
        documentsShown++;
    }
    if (maxNumberOfDocumentsToShow > 0
        && (cluster.getDocuments().size() > documentsShown))
    {
        System.out.println(getIndent(level + 1) + "... and "
            + (cluster.getDocuments().size() - documentsShown) + " more\n");
    }

    // finally, if this cluster has subclusters, descend into recursion,
    // numbering the subclusters 1, 2, 3, ... within their parent's tag.
    int num = 1;
    for (final Cluster subcluster : cluster.getSubclusters())
    {
        displayCluster(level + 1, tag + "." + num, subcluster,
            maxNumberOfDocumentsToShow, clusterDetailsFormatter);
        num++;
    }
}
 
Example #23
Source File: ConsoleFormatter.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Writes one document to the console: "[id] title", then the content URL on
 * its own line if it is not blank, then an empty line.
 *
 * @param level indentation level passed to getIndent
 * @param document the document to print
 */
private static void displayDocument(final int level, Document document)
{
    final String leading = getIndent(level);

    // First line: id in brackets, then the title
    System.out.printf(leading + "[%2s] ", document.getStringId());
    final Object titleField = document.getField(Document.TITLE);
    System.out.println(titleField);

    // Second line, only when the document has a non-blank content URL
    final String link = document.getField(Document.CONTENT_URL);
    if (StringUtils.isNotBlank(link))
    {
        System.out.println(leading + "     " + link);
    }
    System.out.println();
}
 
Example #24
Source File: CommitsMessageTopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Builds one English Carrot2 document per stored commit message (subject as
 * title, message as summary, empty URL, id from produceUID) and clusters them
 * with produceTopics.
 *
 * @param db metric database the commit messages are read from
 * @return the topic clusters of the commit messages
 */
private List<Cluster> produceCommitsMessagesTopics(CommitsMessageTopicsTransMetric db) {
	final ArrayList<Document> corpus = new ArrayList<Document>();
	for (CommitMessage message : db.getCommitsMessages()) {
		final Document asDocument = new Document(message.getSubject(), message.getMessage(), "",
				LanguageCode.ENGLISH, produceUID(message));
		corpus.add(asDocument);
	}
	return produceTopics(corpus);
}
 
Example #25
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Builds one English Carrot2 document per stored bug tracker comment (subject
 * as title, text as summary, empty URL, id from produceUID) and clusters them
 * with produceTopics.
 *
 * @param db metric database the comments are read from
 * @return the topic clusters of the comments
 */
private List<Cluster> produceBugTrackerTopics(TopicsTransMetric db) {
	final ArrayList<Document> corpus = new ArrayList<Document>();
	for (BugTrackerCommentsData entry : db.getBugTrackerComments()) {
		final Document asDocument = new Document(entry.getSubject(), entry.getText(), "",
				LanguageCode.ENGLISH, produceUID(entry));
		corpus.add(asDocument);
	}
	return produceTopics(corpus);
}
 
Example #26
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Builds one English Carrot2 document per stored newsgroup article (subject as
 * title, text as summary, empty URL, id from produceUID) and clusters them
 * with produceTopics.
 *
 * @param db metric database the articles are read from
 * @return the topic clusters of the articles
 */
private List<Cluster> produceNewsgroupTopics(TopicsTransMetric db) {
	final ArrayList<Document> corpus = new ArrayList<Document>();
	for (NewsgroupArticlesData post : db.getNewsgroupArticles()) {
		final Document asDocument = new Document(post.getSubject(), post.getText(), "",
				LanguageCode.ENGLISH, produceUID(post));
		corpus.add(asDocument);
	}
	return produceTopics(corpus);
}
 
Example #27
Source File: CarrotClusteringEngine.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Converts Carrot2 clusters into named lists and appends them to {@code parent}.
 * Each entry carries "labels", optionally "score" and "other-topics", "docs"
 * and, when subclusters are output and present, nested "clusters".
 *
 * @param outputClusters    clusters to convert
 * @param parent            list the converted clusters are appended to
 * @param outputSubClusters whether subclusters are emitted as nested entries; when
 *                          false, "docs" lists all documents of the cluster instead
 * @param maxLabels         maximum number of labels kept per cluster
 */
private void clustersToNamedList(List<Cluster> outputClusters,
                                 List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
  for (Cluster source : outputClusters) {
    NamedList<Object> entry = new SimpleOrderedMap<>();
    parent.add(entry);

    // Labels, truncated to at most maxLabels
    List<String> phrases = source.getPhrases();
    entry.add("labels", phrases.size() > maxLabels ? phrases.subList(0, maxLabels) : phrases);

    // Cluster score, when there is one
    final Double clusterScore = source.getScore();
    if (clusterScore != null) {
      entry.add("score", clusterScore);
    }

    // Marker for the "other topics" cluster
    final boolean otherTopics = source.isOtherTopics();
    if (otherTopics) {
      entry.add("other-topics", otherTopics);
    }

    // Document ids: direct members only when subclusters are listed separately
    List<Document> members = outputSubClusters ? source.getDocuments() : source.getAllDocuments();
    List<Object> memberIds = new ArrayList<>();
    entry.add("docs", memberIds);
    for (Document member : members) {
      memberIds.add(member.getField(SOLR_DOCUMENT_ID));
    }

    // Subclusters, converted recursively
    if (!outputSubClusters || source.getSubclusters().isEmpty()) {
      continue;
    }
    List<NamedList<Object>> nested = new ArrayList<>();
    entry.add("clusters", nested);
    clustersToNamedList(source.getSubclusters(), nested,
            outputSubClusters, maxLabels);
  }
}
 
Example #28
Source File: ClusteringDocumentList.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Clusters five hard-coded documents twice -- by topic with Lingo (query
 * "data mining") and by URL with ByUrlClusteringAlgorithm (no query) -- and
 * prints both cluster lists to the console. Each data row is
 * {URL, title, snippet}. The [[[start:...]]] / [[[end:...]]] comments are left
 * untouched; they look like snippet markers for external tooling.
 */
public static void main(String [] args)
{
    /* [[[start:clustering-document-list-intro]]]
     * 
     * <div>
     * <p>
     * The easiest way to get started with Carrot2 is to cluster a collection
     * of {@link org.carrot2.core.Document}s. Each document can consist of:
     * </p>
     * 
     * <ul>
     * <li>document content: a query-in-context snippet, document abstract or full text,</li>
     * <li>document title: optional, some clustering algorithms give more weight to document titles,</li>
     * <li>document URL: optional, used by the {@link org.carrot2.clustering.synthetic.ByUrlClusteringAlgorithm}, 
     * ignored by other algorithms.</li>
     * </ul>
     * 
     * <p>
     * To make the example short, the code shown below clusters only 5 documents. Use
     * at least 20 to get reasonable clusters. If you have access to the query that generated
     * the documents being clustered, you should also provide it to Carrot2 to get better clusters.
     * </p>
     * </div>
     * 
     * [[[end:clustering-document-list-intro]]]
     */
    {
        // [[[start:clustering-document-list]]]
        /* A few example documents, normally you would need at least 20 for reasonable clusters. */
        final String [][] data = new String [] []
        {
            {
                "http://en.wikipedia.org/wiki/Data_mining",
                "Data mining - Wikipedia, the free encyclopedia",
                "Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."
            },

            {
                "http://www.ccsu.edu/datamining/resources.html",
                "CCSU - Data Mining",
                "A collection of Data Mining links edited by the Central Connecticut State University ... Graduate Certificate Program. Data Mining Resources. Resources. Groups ..."
            },

            {
                "http://www.kdnuggets.com/",
                "KDnuggets: Data Mining, Web Mining, and Knowledge Discovery",
                "Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."
            },

            {
                "http://en.wikipedia.org/wiki/Data-mining",
                "Data mining - Wikipedia, the free encyclopedia",
                "Data mining is considered a subfield within the Computer Science field of knowledge discovery. ... claim to perform \"data mining\" by automating the creation ..."
            },

            {
                "http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
                "Data Mining: What is Data Mining?",
                "Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."
            },
        };

        /* Prepare Carrot2 documents */
        final ArrayList<Document> documents = new ArrayList<Document>();
        for (String [] row : data)
        {
            documents.add(new Document(row[1], row[2], row[0]));
        }

        /* A controller to manage the processing pipeline. */
        final Controller controller = ControllerFactory.createSimple();

        /*
         * Perform clustering by topic using the Lingo algorithm. Lingo can 
         * take advantage of the original query, so we provide it along with the documents.
         */
        final ProcessingResult byTopicClusters = controller.process(documents, "data mining",
            LingoClusteringAlgorithm.class);
        final List<Cluster> clustersByTopic = byTopicClusters.getClusters();
        
        /* Perform clustering by domain. In this case query is not useful, hence it is null. */
        final ProcessingResult byDomainClusters = controller.process(documents, null,
            ByUrlClusteringAlgorithm.class);
        final List<Cluster> clustersByDomain = byDomainClusters.getClusters();
        // [[[end:clustering-document-list]]]
        
        ConsoleFormatter.displayClusters(clustersByTopic);
        ConsoleFormatter.displayClusters(clustersByDomain);
   }
}
 
Example #29
Source File: ClusteringNonEnglishContent.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Runs three Lingo clustering calls on one caching, pooling controller and
 * prints each result to the console: (1) the bundled sample documents, each
 * explicitly marked as English; (2) a Chinese query fetched through the Bing
 * document source with the Chinese market set; (3) the same query fetched
 * through the Google document source with the default clustering language set
 * to Simplified Chinese. The attribute map is cleared and reused between calls.
 * The [[[start:...]]] / [[[end:...]]] comments are left untouched; they look
 * like snippet markers for external tooling.
 */
@SuppressWarnings("unchecked")
public static void main(String [] args)
{
    // [[[start:clustering-non-english-content]]]
    /*
     * We use a Controller that reuse instances of Carrot2 processing components 
     * and caches results produced by document sources.
     */
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    /*
     * In the first call, we'll cluster a document list, setting the language for each
     * document separately.
     */
    final List<Document> documents = Lists.newArrayList();
    for (Document document : SampleDocumentData.DOCUMENTS_DATA_MINING)
    {
        documents.add(new Document(document.getTitle(), document.getSummary(),
            document.getContentUrl(), LanguageCode.ENGLISH));
    }

    final Map<String, Object> attributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .documents(documents);
    final ProcessingResult englishResult = controller.process(
        attributes, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(englishResult);

    /*
     * In the second call, we will fetch results for a Chinese query from Bing,
     * setting explicitly the Bing's specific language attribute. Based on that
     * attribute, the document source will set the appropriate language for each
     * document.
     */
    attributes.clear();
    
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("聚类" /* clustering? */)
        .results(100);

    Bing3WebDocumentSourceDescriptor.attributeBuilder(attributes)
        .market(MarketOption.CHINESE_CHINA);
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(attributes)
            .appid(BingKeyAccess.getKey()); // use your own ID here!

    final ProcessingResult chineseResult = controller.process(attributes,
        Bing3WebDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult);

    /*
     * In the third call, we will fetch results for the same Chinese query from
     * Google. As Google document source does not have its specific attribute for
     * setting the language, it will not set the documents' language for us. To make
     * sure the right lexical resources are used, we will need to set the
     * MultilingualClustering.defaultLanguage attribute to Chinese on our own.
     */
    attributes.clear();
    
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("聚类" /* clustering? */)
        .results(100);

    MultilingualClusteringDescriptor.attributeBuilder(attributes)
        .defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);

    final ProcessingResult chineseResult2 = controller.process(attributes,
        GoogleDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult2);
    // [[[end:clustering-non-english-content]]]
}