Java Code Examples for org.carrot2.core.Controller#process()
The following examples show how to use
org.carrot2.core.Controller#process().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SavingResultsToXml.java From scava with Eclipse Public License 2.0 | 6 votes |
public static void main(String [] args) throws Exception { // Let's fetch some results from MSN first final Controller controller = ControllerFactory.createSimple(); final Map<String, Object> attributes = Maps.newHashMap(); CommonAttributesDescriptor.attributeBuilder(attributes) .documents(new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING)) .query("data mining"); final ProcessingResult result = controller.process(attributes, LingoClusteringAlgorithm.class); // Now, we can serialize the entire result to XML like this result.serialize(System.out); System.out.println(); // Optionally, we can choose whether we want to serialize documents and clusters result.serialize(System.out, false /* don't save documents */, true /* save clusters */); }
Example 2
Source File: SavingResultsToJson.java From scava with Eclipse Public License 2.0 | 6 votes |
public static void main(String [] args) throws Exception { // Let's fetch some results from MSN first final Controller controller = ControllerFactory.createSimple(); final Map<String, Object> attributes = Maps.newHashMap(); CommonAttributesDescriptor.attributeBuilder(attributes) .documents(new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING)) .query("data mining"); final ProcessingResult result = controller.process(attributes, LingoClusteringAlgorithm.class); // Now, we can serialize the entire result to XML like this result.serializeJson(new PrintWriter(System.out)); System.out.println(); // Optionally, we can provide a callback for JSON-P-style calls result.serializeJson( new PrintWriter(System.out), "loadResults", true /* indent */, false /* save documents */, true /* save clusters */); }
Example 3
Source File: Cluster.java From DistributedCrawler with Apache License 2.0 | 6 votes |
/** * 对所有的PagePOJO进行聚类 * * @author GS * @return * @throws IOException * @throws Exception */ public ProcessingResult cluster(String docPath) throws IOException, Exception { @SuppressWarnings("unchecked") final Controller controller = ControllerFactory .createCachingPooling(IDocumentSource.class); final List<Document> documents = Lists.newArrayList(); JsonReader jr = new JsonReader(new File(docPath)); while (jr.hasNext()) { Hit h = jr.next(); documents.add(new Document(h.getPagePOJO().getTitle(), h .getPagePOJO().getContent())); } jr.close(); final Map<String, Object> attributes = Maps.newHashMap(); CommonAttributesDescriptor.attributeBuilder(attributes).documents( documents); final ProcessingResult englishResult = controller.process(attributes, LingoClusteringAlgorithm.class); ConsoleFormatter.displayResults(englishResult);// 展示 return englishResult; }
Example 4
Source File: UsingCustomLanguageModel.java From scava with Eclipse Public License 2.0 | 5 votes |
/**
 * Clusters the sample "data mining" documents with the given algorithm and
 * prints the resulting clusters to the console.
 */
private static void clusterAndDisplayClusters(final Controller controller,
    final Class<? extends IClusteringAlgorithm> clusteringAlgorithm)
{
    // Request attributes: sample documents and the query they came from.
    final Map<String, Object> attrs = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attrs)
        .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    final ProcessingResult clusteringResult =
        controller.process(attrs, clusteringAlgorithm);
    ConsoleFormatter.displayClusters(clusteringResult.getClusters(), 0);
}
Example 5
Source File: UsingCustomLexicalResources.java From scava with Eclipse Public License 2.0 | 5 votes |
/**
 * Runs the supplied clustering algorithm over the sample "data mining"
 * document set and displays the clusters on the console.
 */
private static void clusterAndDisplayClusters(final Controller controller,
    final Class<? extends IClusteringAlgorithm> clusteringAlgorithm)
{
    // Build the per-request attribute map (documents + originating query).
    final Map<String, Object> requestAttributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(requestAttributes)
        .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    ConsoleFormatter.displayClusters(
        controller.process(requestAttributes, clusteringAlgorithm).getClusters(), 0);
}
Example 6
Source File: TopicsTransMetricProvider.java From scava with Eclipse Public License 2.0 | 5 votes |
/**
 * Runs Lingo topic clustering over the given documents and returns the
 * resulting clusters.
 */
private List<Cluster> produceTopics(ArrayList<Document> documents)
{
    /* A fresh, simple controller for this pipeline run. */
    final Controller lingoController = ControllerFactory.createSimple();

    /*
     * Cluster by topic with the Lingo algorithm. No original query is
     * available here, hence the null query argument.
     */
    return lingoController
        .process(documents, null, LingoClusteringAlgorithm.class)
        .getClusters();
}
Example 7
Source File: CommitsMessageTopicsTransMetricProvider.java From scava with Eclipse Public License 2.0 | 5 votes |
/**
 * Clusters the given documents by topic using Lingo and returns the clusters.
 */
private List<Cluster> produceTopics(ArrayList<Document> documents)
{
    /* One controller per call; a simple one suffices. */
    final Controller controller = ControllerFactory.createSimple();

    /* Lingo clustering; the query is unknown here, so null is passed. */
    final ProcessingResult topicResult =
        controller.process(documents, null, LingoClusteringAlgorithm.class);
    return topicResult.getClusters();
}
Example 8
Source File: Cluster.java From DistributedCrawler with Apache License 2.0 | 5 votes |
/** * 对指定的PagePOJO进行聚类 * * @author GS * @param list * PagePOJO List * @return ProcessingResult类,调用需要的方法即可. * @throws IOException * @throws Exception */ public Map<String,List<String>> cluster(List<PagePOJO> list) throws IOException, Exception { @SuppressWarnings("unchecked") final Controller controller = ControllerFactory .createCachingPooling(IDocumentSource.class); final List<Document> documents = Lists.newArrayList(); Iterator<PagePOJO> it = list.iterator(); while (it.hasNext()) { PagePOJO pojo = it.next(); documents.add(new Document(pojo.getTitle(), pojo.getContent(),LanguageCode.CHINESE_SIMPLIFIED)); } final Map<String, Object> attributes = Maps.newHashMap(); CommonAttributesDescriptor.attributeBuilder(attributes).documents( documents); final ProcessingResult englishResult = controller.process(attributes, LingoClusteringAlgorithm.class); ConsoleFormatter.displayResults(englishResult);// 展示 for (org.carrot2.core.Cluster c : englishResult.getClusters()) { LinkedList<String> value = new LinkedList<String>(); for (Document d : c.getAllDocuments()) { value.add(d.getField(Document.TITLE).toString()); } result.put(c.getLabel(), value); } return result; }
Example 9
Source File: ClusteringNonEnglishContent.java From scava with Eclipse Public License 2.0 | 4 votes |
/**
 * Demonstrates three ways of telling Carrot2 the language of the content
 * being clustered: per-document {@code LanguageCode}, a source-specific
 * market attribute (Bing), and the
 * {@code MultilingualClustering.defaultLanguage} attribute (Google).
 */
@SuppressWarnings("unchecked")
public static void main(String [] args)
{
    // [[[start:clustering-non-english-content]]]
    /*
     * We use a Controller that reuse instances of Carrot2 processing components
     * and caches results produced by document sources.
     */
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    /*
     * In the first call, we'll cluster a document list, setting the language for each
     * document separately.
     */
    final List<Document> documents = Lists.newArrayList();
    for (Document document : SampleDocumentData.DOCUMENTS_DATA_MINING)
    {
        documents.add(new Document(document.getTitle(), document.getSummary(),
            document.getContentUrl(), LanguageCode.ENGLISH));
    }
    final Map<String, Object> attributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .documents(documents);
    final ProcessingResult englishResult = controller.process(
        attributes, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(englishResult);

    /*
     * In the second call, we will fetch results for a Chinese query from Bing,
     * setting explicitly the Bing's specific language attribute. Based on that
     * attribute, the document source will set the appropriate language for each
     * document.
     */
    attributes.clear();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("聚类" /* clustering? */)
        .results(100);
    Bing3WebDocumentSourceDescriptor.attributeBuilder(attributes)
        .market(MarketOption.CHINESE_CHINA);
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(attributes)
        .appid(BingKeyAccess.getKey()); // use your own ID here!
    final ProcessingResult chineseResult = controller.process(attributes,
        Bing3WebDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult);

    /*
     * In the third call, we will fetch results for the same Chinese query from
     * Google. As Google document source does not have its specific attribute for
     * setting the language, it will not set the documents' language for us. To make
     * sure the right lexical resources are used, we will need to set the
     * MultilingualClustering.defaultLanguage attribute to Chinese on our own.
     */
    attributes.clear();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("聚类" /* clustering? */)
        .results(100);
    MultilingualClusteringDescriptor.attributeBuilder(attributes)
        .defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);
    final ProcessingResult chineseResult2 = controller.process(attributes,
        GoogleDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult2);
    // [[[end:clustering-non-english-content]]]
}
Example 10
Source File: UsingCachingController.java From scava with Eclipse Public License 2.0 | 4 votes |
/**
 * Demonstrates the caching/pooling controller: the same Bing query is
 * processed twice and timed, showing that the second call is served from
 * the controller's cache.
 */
@SuppressWarnings( { "unused", "unchecked" })
public static void main(String [] args)
{
    // [[[start:using-caching-controller]]]
    /*
     * Create the caching controller. You need only one caching controller instance
     * per application life cycle. This controller instance will cache the results
     * fetched from any document source and also clusters generated by the Lingo
     * algorithm.
     */
    final Controller controller = ControllerFactory.createCachingPooling(
        IDocumentSource.class, LingoClusteringAlgorithm.class);

    /*
     * Before using the caching controller, you must initialize it. On initialization,
     * you can set default values for some attributes. In this example, we'll set the
     * default results number to 50 and the API key.
     */
    final Map<String, Object> globalAttributes = new HashMap<String, Object>();
    CommonAttributesDescriptor
        .attributeBuilder(globalAttributes)
            .results(50);
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(globalAttributes)
            .appid(BingKeyAccess.getKey()); // use your own ID here
    controller.init(globalAttributes);

    /*
     * The controller is now ready to perform queries. To show that the documents from
     * the document input are cached, we will perform the same query twice and measure
     * the time for each query.
     */
    ProcessingResult result;
    long start, duration;

    final Map<String, Object> attributes;
    attributes = new HashMap<String, Object>();
    CommonAttributesDescriptor.attributeBuilder(attributes).query("data mining");

    // First run: the cache is empty, so the document source is actually queried.
    start = System.currentTimeMillis();
    result = controller.process(attributes, Bing3WebDocumentSource.class,
        LingoClusteringAlgorithm.class);
    duration = System.currentTimeMillis() - start;
    System.out.println(duration + " ms (empty cache)");

    // Second run with identical attributes: served from the cache.
    start = System.currentTimeMillis();
    result = controller.process(attributes, Bing3WebDocumentSource.class,
        LingoClusteringAlgorithm.class);
    duration = System.currentTimeMillis() - start;
    System.out.println(duration + " ms (documents and clusters from cache)");
    // [[[end:using-caching-controller]]]
}
Example 11
Source File: UsingComponentSuites.java From scava with Eclipse Public License 2.0 | 4 votes |
public static void main(String [] args) throws Exception { @SuppressWarnings("unchecked") final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class); // Initialization-time attributes that will apply to all components. final Map<String, Object> initAttributes = Maps.newHashMap(); // Prepare resource lookup facade. We will use the suites directory // and class path resources. final ResourceLookup resourceLookup = new ResourceLookup( new DirLocator(new File("suites")), new ContextClassLoaderLocator()); // We know we'll be using Bing so set up its access key. // use your own ID here! Bing3WebDocumentSourceDescriptor .attributeBuilder(initAttributes) .appid(BingKeyAccess.getKey()); // We'll read the component suite definition from an XML stream. // IResource is an abstraction layer over resources in Carrot2. IResource suiteXml = resourceLookup.getFirst("suite-examples.xml"); // Deserialize the component suite definition. final ProcessingComponentSuite suite = ProcessingComponentSuite.deserialize(suiteXml, resourceLookup); // Initialize the controller with the suite. All components from the suite // will be available for processing within this controller. controller.init(initAttributes, suite.getComponentConfigurations()); // From the suite definition, you can get the document sources and clustering // algorithm descriptors. 
final List<DocumentSourceDescriptor> sources = suite.getSources(); final List<String> sourceIds = Lists.transform(sources, ProcessingComponentDescriptor.ProcessingComponentDescriptorToId.INSTANCE); System.out.println("Found " + sourceIds.size() + " document sources: " + sourceIds); final List<ProcessingComponentDescriptor> algorithms = suite.getAlgorithms(); final List<String> algorithmIds = Lists.transform(algorithms, ProcessingComponentDescriptor.ProcessingComponentDescriptorToId.INSTANCE); System.out.println("Found " + algorithmIds.size() + " clutering algorithms: " + algorithmIds + "\n\n"); // Run not more than two algorithms on not more than two sources for (int s = 0; s < Math.min(sourceIds.size(), 2); s++) { for (int a = 0; a < Math.min(algorithmIds.size(), 2); a++) { // You can retrieve some metadata about the components, such as // human-readable label, from their descriptors. System.out.println("Querying " + sources.get(s).getLabel() + ", clustering with " + algorithms.get(a).getLabel()); // As usual, we pass attributes for processing final Map<String, Object> attributes = Maps.newHashMap(); CommonAttributesDescriptor.attributeBuilder(attributes) .query("data mining"); // Pass component ids to the controller to perform processing final ProcessingResult result = controller.process(attributes, sourceIds.get(s), algorithmIds.get(a)); ConsoleFormatter.displayClusters(result.getClusters()); System.out.println(); } } }
Example 12
Source File: ClusteringDocumentList.java From scava with Eclipse Public License 2.0 | 4 votes |
/**
 * Clusters a small hard-coded list of documents twice: by topic with the
 * Lingo algorithm and by URL domain with ByUrlClusteringAlgorithm, then
 * prints both cluster sets to the console.
 */
public static void main(String [] args)
{
    /* [[[start:clustering-document-list-intro]]]
     *
     * <div>
     * <p>
     * The easiest way to get started with Carrot2 is to cluster a collection
     * of {@link org.carrot2.core.Document}s. Each document can consist of:
     * </p>
     *
     * <ul>
     * <li>document content: a query-in-context snippet, document abstract or full text,</li>
     * <li>document title: optional, some clustering algorithms give more weight to document titles,</li>
     * <li>document URL: optional, used by the {@link org.carrot2.clustering.synthetic.ByUrlClusteringAlgorithm},
     * ignored by other algorithms.</li>
     * </ul>
     *
     * <p>
     * To make the example short, the code shown below clusters only 5 documents. Use
     * at least 20 to get reasonable clusters. If you have access to the query that generated
     * the documents being clustered, you should also provide it to Carrot2 to get better clusters.
     * </p>
     * </div>
     *
     * [[[end:clustering-document-list-intro]]]
     */
    {
        // [[[start:clustering-document-list]]]
        /* A few example documents, normally you would need at least 20 for reasonable clusters. */
        // Each row is { url, title, snippet } — note the order when building Documents below.
        final String [][] data = new String [] []
        {
            {
                "http://en.wikipedia.org/wiki/Data_mining",
                "Data mining - Wikipedia, the free encyclopedia",
                "Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."
            },
            {
                "http://www.ccsu.edu/datamining/resources.html",
                "CCSU - Data Mining",
                "A collection of Data Mining links edited by the Central Connecticut State University ... Graduate Certificate Program. Data Mining Resources. Resources. Groups ..."
            },
            {
                "http://www.kdnuggets.com/",
                "KDnuggets: Data Mining, Web Mining, and Knowledge Discovery",
                "Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."
            },
            {
                "http://en.wikipedia.org/wiki/Data-mining",
                "Data mining - Wikipedia, the free encyclopedia",
                "Data mining is considered a subfield within the Computer Science field of knowledge discovery. ... claim to perform \"data mining\" by automating the creation ..."
            },
            {
                "http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
                "Data Mining: What is Data Mining?",
                "Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."
            },
        };

        /* Prepare Carrot2 documents */
        final ArrayList<Document> documents = new ArrayList<Document>();
        for (String [] row : data)
        {
            // Document(title, summary, contentUrl) — columns 1, 2, 0 of each row.
            documents.add(new Document(row[1], row[2], row[0]));
        }

        /* A controller to manage the processing pipeline. */
        final Controller controller = ControllerFactory.createSimple();

        /*
         * Perform clustering by topic using the Lingo algorithm. Lingo can
         * take advantage of the original query, so we provide it along with the documents.
         */
        final ProcessingResult byTopicClusters = controller.process(documents, "data mining",
            LingoClusteringAlgorithm.class);
        final List<Cluster> clustersByTopic = byTopicClusters.getClusters();

        /* Perform clustering by domain. In this case query is not useful, hence it is null. */
        final ProcessingResult byDomainClusters = controller.process(documents, null,
            ByUrlClusteringAlgorithm.class);
        final List<Cluster> clustersByDomain = byDomainClusters.getClusters();
        // [[[end:clustering-document-list]]]

        ConsoleFormatter.displayClusters(clustersByTopic);
        ConsoleFormatter.displayClusters(clustersByDomain);
    }
}
Example 13
Source File: LoadingAttributeValuesFromXml.java From scava with Eclipse Public License 2.0 | 4 votes |
/**
 * Loads two attribute value sets from an XML resource and runs Lingo
 * clustering twice: once with the "faster-clustering" set supplied at
 * controller initialization, and once with the XML file's default set
 * passed per-request.
 */
public static void main(String [] args) throws Exception
{
    InputStream xmlStream = null;
    try
    {
        xmlStream = LoadingAttributeValuesFromXml.class
            .getResourceAsStream("algorithm-lingo-attributes.xml");

        // Load attribute value sets from the XML stream
        final AttributeValueSets attributeValueSets = AttributeValueSets
            .deserialize(xmlStream);

        // Get the desired set of attribute values for use with further processing
        final Map<String, Object> defaultAttributes = attributeValueSets
            .getDefaultAttributeValueSet().getAttributeValues();
        final Map<String, Object> fasterClusteringAttributes = attributeValueSets
            .getAttributeValueSet("faster-clustering").getAttributeValues();

        // Perform processing using the attribute values
        final Controller controller = ControllerFactory.createSimple();

        // Initialize the controller with one attribute set
        controller.init(fasterClusteringAttributes);

        // Perform clustering using the attribute set provided at initialization time
        Map<String, Object> requestAttributes = Maps.newHashMap();
        CommonAttributesDescriptor.attributeBuilder(requestAttributes)
            .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
            .query("data mining");
        ProcessingResult results = controller.process(requestAttributes, LingoClusteringAlgorithm.class);
        ConsoleFormatter.displayClusters(results.getClusters());

        // Perform clustering using some other attribute set, in this case the
        // one that is the default in the XML file.
        requestAttributes =
            CommonAttributesDescriptor.attributeBuilder(Maps.newHashMap(defaultAttributes))
                .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
                .query("data mining").map;
        results = controller.process(requestAttributes, LingoClusteringAlgorithm.class);
        ConsoleFormatter.displayClusters(results.getClusters());
    }
    finally
    {
        // Close the XML stream regardless of the processing outcome.
        CloseableUtils.close(xmlStream);
    }
}