org.carrot2.core.Controller Java Examples

The following examples show how to use org.carrot2.core.Controller. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: UsingCustomLanguageModel.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Initializes a caching/pooling controller with custom language-model
 * component factories, then clusters sample data with Lingo and STC.
 */
public static void main(String [] args)
{
    @SuppressWarnings("unchecked")
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    // Register the custom language-model factories as initialization-time
    // attributes. This is preferable to processing-time attributes because
    // instances created at initialization time are reused for all
    // further requests.
    final Map<String, Object> initAttributes = Maps.newHashMap();
    BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
        .stemmerFactory(CustomStemmerFactory.class)
        .tokenizerFactory(CustomTokenizerFactory.class)
        .lexicalDataFactory(CustomLexicalDataFactory.class);
    controller.init(initAttributes);

    // Cluster the same data with Lingo and STC. Note how cluster quality
    // degrades when the stop word list is empty (especially for STC).
    clusterAndDisplayClusters(controller, LingoClusteringAlgorithm.class);
    clusterAndDisplayClusters(controller, STCClusteringAlgorithm.class);
}
 
Example #2
Source File: UsingCustomLexicalResources.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Shows how to point the clustering components at custom lexical resources
 * (e.g. stop word lists) located in a filesystem folder.
 */
public static void main(String [] args)
{
    @SuppressWarnings("unchecked")
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    // Build a resource locator pointing at an explicit "resources" folder in
    // the current working directory. Several other IResourceLocator
    // implementations are available.
    final ResourceLookup resourceLookup =
        new ResourceLookup(new DirLocator(new File("resources")));

    final Map<String, Object> initAttributes = Maps.newHashMap();

    // Ask the linguistic component to merge all lexical resources; this is
    // the default setting and usually helps with multi-lingual content.
    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
        .mergeResources(true);
    LexicalDataLoaderDescriptor.attributeBuilder(initAttributes)
        .resourceLookup(resourceLookup);

    controller.init(initAttributes);

    // Cluster some data with Lingo and STC.
    clusterAndDisplayClusters(controller, LingoClusteringAlgorithm.class);
    clusterAndDisplayClusters(controller, STCClusteringAlgorithm.class);
}
 
Example #3
Source File: SavingResultsToXml.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Clusters a bundled sample document set and serializes the whole
 * {@code ProcessingResult} to XML on standard output.
 */
public static void main(String [] args) throws Exception
{
    // Cluster the bundled "data mining" sample documents with Lingo.
    final Controller controller = ControllerFactory.createSimple();
    final Map<String, Object> processingAttributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(processingAttributes)
        .documents(new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    final ProcessingResult result = controller.process(processingAttributes,
        LingoClusteringAlgorithm.class);

    // Serialize the entire result (documents and clusters) to XML.
    result.serialize(System.out);
    System.out.println();

    // Alternatively, choose which parts to serialize.
    result.serialize(System.out, 
        false /* don't save documents */,
        true /* save clusters */);
}
 
Example #4
Source File: SavingResultsToJson.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Clusters a bundled sample document set and serializes the whole
 * {@code ProcessingResult} to JSON on standard output.
 */
public static void main(String [] args) throws Exception
{
    // Cluster the bundled "data mining" sample documents with Lingo.
    final Controller controller = ControllerFactory.createSimple();
    final Map<String, Object> processingAttributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(processingAttributes)
        .documents(new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    final ProcessingResult result = controller.process(processingAttributes,
        LingoClusteringAlgorithm.class);

    // Serialize the entire result to JSON.
    result.serializeJson(new PrintWriter(System.out));
    System.out.println();

    // Optionally, provide a callback name for JSON-P-style output and choose
    // which parts to serialize.
    result.serializeJson(
        new PrintWriter(System.out), "loadResults",
        true /* indent */, 
        false /* save documents */, 
        true /* save clusters */);

}
 
Example #5
Source File: Cluster.java    From DistributedCrawler with Apache License 2.0 6 votes vote down vote up
/**
 * Clusters all PagePOJOs read from the given JSON document file using the
 * Lingo algorithm and prints the result to the console.
 * 
 * @author GS
 * @param docPath path of the JSON file containing the crawled pages
 * @return the clustering result
 * @throws IOException if the document file cannot be read
 * @throws Exception on clustering errors
 */
public ProcessingResult cluster(String docPath) throws IOException,
		Exception {
	@SuppressWarnings("unchecked")
	final Controller controller = ControllerFactory
			.createCachingPooling(IDocumentSource.class);
	final List<Document> documents = Lists.newArrayList();
	JsonReader jr = new JsonReader(new File(docPath));
	try {
		// Convert each crawled hit into a Carrot2 document (title + content).
		while (jr.hasNext()) {
			Hit h = jr.next();
			documents.add(new Document(h.getPagePOJO().getTitle(), h
					.getPagePOJO().getContent()));
		}
	} finally {
		// Close the reader even if parsing throws, to avoid leaking the
		// underlying file handle (previously leaked on exception).
		jr.close();
	}
	final Map<String, Object> attributes = Maps.newHashMap();
	CommonAttributesDescriptor.attributeBuilder(attributes).documents(
			documents);
	final ProcessingResult englishResult = controller.process(attributes,
			LingoClusteringAlgorithm.class);
	ConsoleFormatter.displayResults(englishResult);// display the results
	return englishResult;
}
 
Example #6
Source File: UsingCustomLanguageModel.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Clusters results for query "data mining" and displays the clusters.
 */
private static void clusterAndDisplayClusters(final Controller controller,
    final Class<? extends IClusteringAlgorithm> clusteringAlgorithm)
{
    // Sample "data mining" documents plus the query they came from.
    final Map<String, Object> attrs = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attrs)
        .query("data mining")
        .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING));

    final ProcessingResult result = controller.process(attrs, clusteringAlgorithm);
    ConsoleFormatter.displayClusters(result.getClusters(), 0);
}
 
Example #7
Source File: UsingCustomLexicalResources.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Runs the given clustering algorithm over the sample "data mining"
 * documents and prints the resulting clusters.
 */
private static void clusterAndDisplayClusters(final Controller controller,
    final Class<? extends IClusteringAlgorithm> clusteringAlgorithm)
{
    final Map<String, Object> attributeMap = Maps.newHashMap();

    CommonAttributesDescriptor.attributeBuilder(attributeMap)
        .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    final ProcessingResult clusteringResult =
        controller.process(attributeMap, clusteringAlgorithm);
    ConsoleFormatter.displayClusters(clusteringResult.getClusters(), 0);
}
 
Example #8
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Clusters the given documents by topic with the Lingo algorithm.
 *
 * @param documents documents to cluster
 * @return the clusters produced by Lingo
 */
private List<Cluster> produceTopics(ArrayList<Document> documents) {
	/* A controller to manage the processing pipeline. */
	final Controller controller = ControllerFactory.createSimple();

	/*
	 * Lingo can take advantage of the original query, but none is available
	 * here, hence the null query argument.
	 */
	return controller.process(documents, null, LingoClusteringAlgorithm.class)
			.getClusters();
}
 
Example #9
Source File: CommitsMessageTopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Groups the given documents into topic clusters using the Lingo algorithm.
 *
 * @param documents documents to cluster
 * @return the clusters produced by Lingo
 */
private List<Cluster> produceTopics(ArrayList<Document> documents) {
	/* One-shot controller managing the processing pipeline. */
	final Controller simpleController = ControllerFactory.createSimple();

	/*
	 * Cluster by topic with Lingo. Lingo could use the original query as a
	 * hint, but no query is available here, so null is passed.
	 */
	final ProcessingResult topicResult = simpleController.process(documents,
			null, LingoClusteringAlgorithm.class);
	return topicResult.getClusters();
}
 
Example #10
Source File: Cluster.java    From DistributedCrawler with Apache License 2.0 5 votes vote down vote up
/**
 * Clusters the given PagePOJOs (treated as simplified Chinese) with the
 * Lingo algorithm and records, for each cluster, its label and the titles of
 * the documents it contains.
 * 
 * @author GS
 * @param list
 *            PagePOJO List
 * @return map from cluster label to the titles of the documents in that
 *         cluster. NOTE(review): this is the {@code result} field declared
 *         elsewhere in the class; it is mutated here and never cleared, so
 *         entries presumably accumulate across calls — confirm intended.
 * @throws IOException
 * @throws Exception
 */
public Map<String,List<String>> cluster(List<PagePOJO> list) throws IOException,
		Exception {
	@SuppressWarnings("unchecked")
	final Controller controller = ControllerFactory
			.createCachingPooling(IDocumentSource.class);
	// Convert each page into a Carrot2 document tagged as simplified Chinese.
	final List<Document> documents = Lists.newArrayList();
	Iterator<PagePOJO> it = list.iterator();
	while (it.hasNext()) {
		PagePOJO pojo = it.next();
		documents.add(new Document(pojo.getTitle(), pojo.getContent(),LanguageCode.CHINESE_SIMPLIFIED));
	}
	final Map<String, Object> attributes = Maps.newHashMap();
	CommonAttributesDescriptor.attributeBuilder(attributes).documents(
			documents);
	final ProcessingResult englishResult = controller.process(attributes,
			LingoClusteringAlgorithm.class);
	ConsoleFormatter.displayResults(englishResult);// display the results
	// Map each cluster label to the titles of all documents in that cluster.
	// Clusters sharing a label overwrite each other's entry.
	for (org.carrot2.core.Cluster c : englishResult.getClusters()) {
		LinkedList<String> value = new LinkedList<String>(); 
		for (Document d : c.getAllDocuments()) {
			value.add(d.getField(Document.TITLE).toString());
		}
		result.put(c.getLabel(), value);
	}
	return result;
}
 
Example #11
Source File: ClusteringNonEnglishContent.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Demonstrates three ways to cluster non-English content: per-document
 * language tags, a source-specific language attribute (Bing), and the
 * MultilingualClustering default-language attribute (Google). Note that the
 * same attribute map is reused and cleared between calls, so the statement
 * order matters.
 */
@SuppressWarnings("unchecked")
public static void main(String [] args)
{
    // [[[start:clustering-non-english-content]]]
    /*
     * We use a Controller that reuse instances of Carrot2 processing components 
     * and caches results produced by document sources.
     */
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    /*
     * In the first call, we'll cluster a document list, setting the language for each
     * document separately.
     */
    final List<Document> documents = Lists.newArrayList();
    for (Document document : SampleDocumentData.DOCUMENTS_DATA_MINING)
    {
        documents.add(new Document(document.getTitle(), document.getSummary(),
            document.getContentUrl(), LanguageCode.ENGLISH));
    }

    final Map<String, Object> attributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .documents(documents);
    final ProcessingResult englishResult = controller.process(
        attributes, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(englishResult);

    /*
     * In the second call, we will fetch results for a Chinese query from Bing,
     * setting explicitly the Bing's specific language attribute. Based on that
     * attribute, the document source will set the appropriate language for each
     * document.
     */
    attributes.clear();
    
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("聚类" /* clustering? */)
        .results(100);

    Bing3WebDocumentSourceDescriptor.attributeBuilder(attributes)
        .market(MarketOption.CHINESE_CHINA);
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(attributes)
            .appid(BingKeyAccess.getKey()); // use your own ID here!

    final ProcessingResult chineseResult = controller.process(attributes,
        Bing3WebDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult);

    /*
     * In the third call, we will fetch results for the same Chinese query from
     * Google. As Google document source does not have its specific attribute for
     * setting the language, it will not set the documents' language for us. To make
     * sure the right lexical resources are used, we will need to set the
     * MultilingualClustering.defaultLanguage attribute to Chinese on our own.
     */
    attributes.clear();
    
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("聚类" /* clustering? */)
        .results(100);

    MultilingualClusteringDescriptor.attributeBuilder(attributes)
        .defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);

    final ProcessingResult chineseResult2 = controller.process(attributes,
        GoogleDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult2);
    // [[[end:clustering-non-english-content]]]
}
 
Example #12
Source File: UsingCachingController.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Demonstrates result caching: the same Bing query is processed twice and
 * the second run is served from the controller's cache.
 */
@SuppressWarnings(
{
    "unused", "unchecked"
})
public static void main(String [] args)
{
    // [[[start:using-caching-controller]]]
    /*
     * Create the caching controller. A single caching controller instance per
     * application life cycle suffices. It caches the results fetched from any
     * document source and also the clusters produced by the Lingo algorithm.
     */
    final Controller controller = ControllerFactory.createCachingPooling(
        IDocumentSource.class, LingoClusteringAlgorithm.class);

    /*
     * The caching controller must be initialized before use. Default values
     * for some attributes can be supplied at this point; here the default
     * result count is set to 50 along with the Bing API key.
     */
    final Map<String, Object> defaultAttributes = new HashMap<String, Object>();
    CommonAttributesDescriptor
        .attributeBuilder(defaultAttributes)
            .results(50);
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(defaultAttributes)
            .appid(BingKeyAccess.getKey()); // use your own ID here
    controller.init(defaultAttributes);

    /*
     * The controller can now serve queries. To show that documents from the
     * document input are cached, the same query is issued twice and each call
     * is timed.
     */
    ProcessingResult result;
    long begin, elapsed;

    final Map<String, Object> queryAttributes = new HashMap<String, Object>();
    CommonAttributesDescriptor.attributeBuilder(queryAttributes).query("data mining");

    begin = System.currentTimeMillis();
    result = controller.process(queryAttributes, Bing3WebDocumentSource.class,
        LingoClusteringAlgorithm.class);
    elapsed = System.currentTimeMillis() - begin;
    System.out.println(elapsed + " ms (empty cache)");

    begin = System.currentTimeMillis();
    result = controller.process(queryAttributes, Bing3WebDocumentSource.class,
        LingoClusteringAlgorithm.class);
    elapsed = System.currentTimeMillis() - begin;
    System.out.println(elapsed + " ms (documents and clusters from cache)");
    // [[[end:using-caching-controller]]]
}
 
Example #13
Source File: UsingComponentSuites.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Loads a component suite definition from XML, initializes a controller with
 * it, and runs every (source, algorithm) pair — capped at two of each — over
 * the query "data mining", printing the clusters.
 *
 * @throws Exception if the suite XML cannot be located or deserialized
 */
public static void main(String [] args) throws Exception
{
    @SuppressWarnings("unchecked")
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    // Initialization-time attributes that will apply to all components.
    final Map<String, Object> initAttributes = Maps.newHashMap();

    // Prepare resource lookup facade. We will use the suites directory 
    // and class path resources.
    final ResourceLookup resourceLookup = new ResourceLookup(
        new DirLocator(new File("suites")),
        new ContextClassLoaderLocator());

    // We know we'll be using Bing so set up its access key.
    // use your own ID here!
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(initAttributes)
            .appid(BingKeyAccess.getKey());
    
    // We'll read the component suite definition from an XML stream.
    // IResource is an abstraction layer over resources in Carrot2.
    IResource suiteXml = resourceLookup.getFirst("suite-examples.xml");

    // Deserialize the component suite definition.
    final ProcessingComponentSuite suite = 
        ProcessingComponentSuite.deserialize(suiteXml, resourceLookup);

    // Initialize the controller with the suite. All components from the suite
    // will be available for processing within this controller.
    controller.init(initAttributes, suite.getComponentConfigurations());

    // From the suite definition, you can get the document sources and clustering
    // algorithm descriptors.
    final List<DocumentSourceDescriptor> sources = suite.getSources();
    final List<String> sourceIds = Lists.transform(sources,
        ProcessingComponentDescriptor.ProcessingComponentDescriptorToId.INSTANCE);
    System.out.println("Found " + sourceIds.size() + " document sources: "
        + sourceIds);

    final List<ProcessingComponentDescriptor> algorithms = suite.getAlgorithms();
    final List<String> algorithmIds = Lists.transform(algorithms,
        ProcessingComponentDescriptor.ProcessingComponentDescriptorToId.INSTANCE);
    // Fixed typo in output message: "clutering" -> "clustering".
    System.out.println("Found " + algorithmIds.size() + " clustering algorithms: "
        + algorithmIds + "\n\n");

    // Run not more than two algorithms on not more than two sources
    for (int s = 0; s < Math.min(sourceIds.size(), 2); s++)
    {
        for (int a = 0; a < Math.min(algorithmIds.size(), 2); a++)
        {
            // You can retrieve some metadata about the components, such as
            // human-readable label, from their descriptors.
            System.out.println("Querying " + sources.get(s).getLabel()
                + ", clustering with " + algorithms.get(a).getLabel());

            // As usual, we pass attributes for processing
            final Map<String, Object> attributes = Maps.newHashMap();
            CommonAttributesDescriptor.attributeBuilder(attributes)
                .query("data mining");

            // Pass component ids to the controller to perform processing
            final ProcessingResult result = controller.process(attributes,
                sourceIds.get(s), algorithmIds.get(a));
            ConsoleFormatter.displayClusters(result.getClusters());
            System.out.println();
        }
    }
}
 
Example #14
Source File: ClusteringDocumentList.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Minimal Carrot2 walkthrough: builds five hard-coded documents, clusters
 * them by topic (Lingo) and by URL domain (ByUrlClusteringAlgorithm), and
 * prints both cluster sets. The embedded data array is deliberately small;
 * real use should supply at least ~20 documents.
 */
public static void main(String [] args)
{
    /* [[[start:clustering-document-list-intro]]]
     * 
     * <div>
     * <p>
     * The easiest way to get started with Carrot2 is to cluster a collection
     * of {@link org.carrot2.core.Document}s. Each document can consist of:
     * </p>
     * 
     * <ul>
     * <li>document content: a query-in-context snippet, document abstract or full text,</li>
     * <li>document title: optional, some clustering algorithms give more weight to document titles,</li>
     * <li>document URL: optional, used by the {@link org.carrot2.clustering.synthetic.ByUrlClusteringAlgorithm}, 
     * ignored by other algorithms.</li>
     * </ul>
     * 
     * <p>
     * To make the example short, the code shown below clusters only 5 documents. Use
     * at least 20 to get reasonable clusters. If you have access to the query that generated
     * the documents being clustered, you should also provide it to Carrot2 to get better clusters.
     * </p>
     * </div>
     * 
     * [[[end:clustering-document-list-intro]]]
     */
    {
        // [[[start:clustering-document-list]]]
        /* A few example documents, normally you would need at least 20 for reasonable clusters. */
        /* Each row is { url, title, snippet }. */
        final String [][] data = new String [] []
        {
            {
                "http://en.wikipedia.org/wiki/Data_mining",
                "Data mining - Wikipedia, the free encyclopedia",
                "Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."
            },

            {
                "http://www.ccsu.edu/datamining/resources.html",
                "CCSU - Data Mining",
                "A collection of Data Mining links edited by the Central Connecticut State University ... Graduate Certificate Program. Data Mining Resources. Resources. Groups ..."
            },

            {
                "http://www.kdnuggets.com/",
                "KDnuggets: Data Mining, Web Mining, and Knowledge Discovery",
                "Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."
            },

            {
                "http://en.wikipedia.org/wiki/Data-mining",
                "Data mining - Wikipedia, the free encyclopedia",
                "Data mining is considered a subfield within the Computer Science field of knowledge discovery. ... claim to perform \"data mining\" by automating the creation ..."
            },

            {
                "http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
                "Data Mining: What is Data Mining?",
                "Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."
            },
        };

        /* Prepare Carrot2 documents */
        /* Document constructor order is (title, summary, contentUrl). */
        final ArrayList<Document> documents = new ArrayList<Document>();
        for (String [] row : data)
        {
            documents.add(new Document(row[1], row[2], row[0]));
        }

        /* A controller to manage the processing pipeline. */
        final Controller controller = ControllerFactory.createSimple();

        /*
         * Perform clustering by topic using the Lingo algorithm. Lingo can 
         * take advantage of the original query, so we provide it along with the documents.
         */
        final ProcessingResult byTopicClusters = controller.process(documents, "data mining",
            LingoClusteringAlgorithm.class);
        final List<Cluster> clustersByTopic = byTopicClusters.getClusters();
        
        /* Perform clustering by domain. In this case query is not useful, hence it is null. */
        final ProcessingResult byDomainClusters = controller.process(documents, null,
            ByUrlClusteringAlgorithm.class);
        final List<Cluster> clustersByDomain = byDomainClusters.getClusters();
        // [[[end:clustering-document-list]]]
        
        ConsoleFormatter.displayClusters(clustersByTopic);
        ConsoleFormatter.displayClusters(clustersByDomain);
   }
}
 
Example #15
Source File: LoadingAttributeValuesFromXml.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Loads Lingo attribute value sets from an XML resource and clusters the
 * sample "data mining" documents twice: first with the "faster-clustering"
 * set supplied at controller initialization time, then with the XML file's
 * default set supplied per request.
 *
 * @throws Exception if the XML resource cannot be read or deserialized
 */
public static void main(String [] args) throws Exception
{
    // try-with-resources replaces the previous manual try/finally with
    // CloseableUtils.close(); a null stream (missing resource) is tolerated
    // by the construct and surfaces in deserialize() exactly as before.
    try (InputStream xmlStream = LoadingAttributeValuesFromXml.class
        .getResourceAsStream("algorithm-lingo-attributes.xml"))
    {
        // Load attribute value sets from the XML stream
        final AttributeValueSets attributeValueSets = AttributeValueSets
            .deserialize(xmlStream);

        // Get the desired set of attribute values for use with further processing
        final Map<String, Object> defaultAttributes = attributeValueSets
            .getDefaultAttributeValueSet().getAttributeValues();

        final Map<String, Object> fasterClusteringAttributes = attributeValueSets
            .getAttributeValueSet("faster-clustering").getAttributeValues();

        // Perform processing using the attribute values
        final Controller controller = ControllerFactory.createSimple();

        // Initialize the controller with one attribute set
        controller.init(fasterClusteringAttributes);

        // Perform clustering using the attribute set provided at initialization time
        Map<String, Object> requestAttributes = Maps.newHashMap(); 
        CommonAttributesDescriptor.attributeBuilder(requestAttributes)
            .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
            .query("data mining");
        ProcessingResult results = controller.process(requestAttributes, LingoClusteringAlgorithm.class);
        ConsoleFormatter.displayClusters(results.getClusters());

        // Perform clustering using some other attribute set, in this case the
        // one that is the default in the XML file.
        requestAttributes =
            CommonAttributesDescriptor.attributeBuilder(Maps.newHashMap(defaultAttributes))
                .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
                .query("data mining").map;

        results = controller.process(requestAttributes, LingoClusteringAlgorithm.class);
        ConsoleFormatter.displayClusters(results.getClusters());
    }
}