org.datavec.spark.transform.AnalyzeSpark Java Examples
The following examples show how to use
org.datavec.spark.transform.AnalyzeSpark.
You can vote up the examples you find useful or vote down the ones you don't,
and follow the links above each example to visit the original project or source file. You may also check out the related API usage in the sidebar.
Example #1
Source File: TestAnalysis.java From DataVec with Apache License 2.0 | 5 votes |
@Test public void testSampleMostFrequent() { List<List<Writable>> toParallelize = new ArrayList<>(); toParallelize.add(Arrays.<Writable>asList(new Text("a"), new Text("MostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("b"), new Text("SecondMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("c"), new Text("SecondMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("d"), new Text("0"))); toParallelize.add(Arrays.<Writable>asList(new Text("e"), new Text("MostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("f"), new Text("ThirdMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("c"), new Text("MostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("h"), new Text("1"))); toParallelize.add(Arrays.<Writable>asList(new Text("i"), new Text("SecondMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("j"), new Text("2"))); toParallelize.add(Arrays.<Writable>asList(new Text("k"), new Text("ThirdMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("l"), new Text("MostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("m"), new Text("3"))); toParallelize.add(Arrays.<Writable>asList(new Text("n"), new Text("4"))); toParallelize.add(Arrays.<Writable>asList(new Text("o"), new Text("5"))); JavaRDD<List<Writable>> rdd = sc.parallelize(toParallelize); Schema schema = new Schema.Builder().addColumnsString("irrelevant", "column").build(); Map<Writable, Long> map = AnalyzeSpark.sampleMostFrequentFromColumn(3, "column", schema, rdd); // System.out.println(map); assertEquals(3, map.size()); assertEquals(4L, (long) map.get(new Text("MostCommon"))); assertEquals(3L, (long) map.get(new Text("SecondMostCommon"))); assertEquals(2L, (long) map.get(new Text("ThirdMostCommon"))); }
Example #2
Source File: TestAnalysis.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test public void testSampleMostFrequent() { List<List<Writable>> toParallelize = new ArrayList<>(); toParallelize.add(Arrays.<Writable>asList(new Text("a"), new Text("MostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("b"), new Text("SecondMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("c"), new Text("SecondMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("d"), new Text("0"))); toParallelize.add(Arrays.<Writable>asList(new Text("e"), new Text("MostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("f"), new Text("ThirdMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("c"), new Text("MostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("h"), new Text("1"))); toParallelize.add(Arrays.<Writable>asList(new Text("i"), new Text("SecondMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("j"), new Text("2"))); toParallelize.add(Arrays.<Writable>asList(new Text("k"), new Text("ThirdMostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("l"), new Text("MostCommon"))); toParallelize.add(Arrays.<Writable>asList(new Text("m"), new Text("3"))); toParallelize.add(Arrays.<Writable>asList(new Text("n"), new Text("4"))); toParallelize.add(Arrays.<Writable>asList(new Text("o"), new Text("5"))); JavaRDD<List<Writable>> rdd = sc.parallelize(toParallelize); Schema schema = new Schema.Builder().addColumnsString("irrelevant", "column").build(); Map<Writable, Long> map = AnalyzeSpark.sampleMostFrequentFromColumn(3, "column", schema, rdd); // System.out.println(map); assertEquals(3, map.size()); assertEquals(4L, (long) map.get(new Text("MostCommon"))); assertEquals(3L, (long) map.get(new Text("SecondMostCommon"))); assertEquals(2L, (long) map.get(new Text("ThirdMostCommon"))); }
Example #3
Source File: TestAnalysis.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test public void testAnalysisAllColTypes(){ Schema s = new Schema.Builder() .addColumn(new BinaryMetaData("binary")) .addColumn(new BooleanMetaData("boolean")) .addColumnCategorical("categorical", "a", "b") .addColumnDouble("double") .addColumnFloat("float") .addColumnInteger("integer") .addColumnLong("long") .addColumnNDArray("ndarray", new long[]{1,4}) .addColumnString("string") .addColumnTime("time", TimeZone.getDefault()) .build(); List<List<Writable>> data = Arrays.asList( Arrays.asList(new BytesWritable(new byte[3]), new BooleanWritable(true), new Text("a"), new DoubleWritable(1.0), new FloatWritable(1.0f), new IntWritable(1), new LongWritable(1L), new NDArrayWritable(Nd4j.create(DataType.FLOAT, 1, 4)), new Text("text"), new LongWritable(100L)), Arrays.asList(new BytesWritable(new byte[3]), new BooleanWritable(false), new Text("b"), new DoubleWritable(0.0), new FloatWritable(0.0f), new IntWritable(0), new LongWritable(0L), new NDArrayWritable(Nd4j.create(DataType.FLOAT, 1, 4)), new Text("text2"), new LongWritable(101L))); JavaRDD<List<Writable>> rdd = sc.parallelize(data); DataAnalysis da = AnalyzeSpark.analyze(s, rdd); // System.out.println(da); da.toString(); da.toJson(); }
Example #4
Source File: TestAnalysis.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test public void testAnalysisVsLocal() throws Exception { Schema s = new Schema.Builder() .addColumnsDouble("%d", 0, 3) .addColumnInteger("label") .build(); RecordReader rr = new CSVRecordReader(); rr.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile())); List<List<Writable>> toParallelize = new ArrayList<>(); while(rr.hasNext()){ toParallelize.add(rr.next()); } JavaRDD<List<Writable>> rdd = sc.parallelize(toParallelize).coalesce(1); rr.reset(); DataAnalysis local = AnalyzeLocal.analyze(s, rr); DataAnalysis spark = AnalyzeSpark.analyze(s, rdd); // assertEquals(local.toJson(), spark.toJson()); assertEquals(local, spark); //Also quality analysis: rr.reset(); DataQualityAnalysis localQ = AnalyzeLocal.analyzeQuality(s, rr); DataQualityAnalysis sparkQ = AnalyzeSpark.analyzeQuality(s, rdd); assertEquals(localQ, sparkQ); //And, check unique etc: rr.reset(); Map<String,Set<Writable>> mapLocal = AnalyzeLocal.getUnique(s.getColumnNames(), s, rr); Map<String,List<Writable>> mapSpark = AnalyzeSpark.getUnique(s.getColumnNames(), s, rdd); assertEquals(mapLocal.keySet(), mapSpark.keySet()); for( String k : mapLocal.keySet()){ assertEquals(mapLocal.get(k), new HashSet<Writable>(mapSpark.get(k))); } }