org.datavec.spark.transform.AnalyzeSpark Java Examples

The following examples show how to use org.datavec.spark.transform.AnalyzeSpark. You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example #1
Source File: TestAnalysis.java    From DataVec with Apache License 2.0 5 votes vote down vote up
@Test
public void testSampleMostFrequent() {

    // Test data: first column holds arbitrary row labels (the analysis ignores it);
    // the second column is analyzed for value frequency. By construction it contains
    // 4x "MostCommon", 3x "SecondMostCommon", 2x "ThirdMostCommon", and six unique values.
    String[][] rows = {
            {"a", "MostCommon"}, {"b", "SecondMostCommon"}, {"c", "SecondMostCommon"},
            {"d", "0"}, {"e", "MostCommon"}, {"f", "ThirdMostCommon"},
            {"c", "MostCommon"}, {"h", "1"}, {"i", "SecondMostCommon"},
            {"j", "2"}, {"k", "ThirdMostCommon"}, {"l", "MostCommon"},
            {"m", "3"}, {"n", "4"}, {"o", "5"}};

    List<List<Writable>> input = new ArrayList<>();
    for (String[] row : rows) {
        input.add(Arrays.<Writable>asList(new Text(row[0]), new Text(row[1])));
    }

    JavaRDD<List<Writable>> rdd = sc.parallelize(input);

    // Two string columns; only "column" is queried below
    Schema schema = new Schema.Builder().addColumnsString("irrelevant", "column").build();

    // Ask for the top-3 most frequent values of "column"
    Map<Writable, Long> map = AnalyzeSpark.sampleMostFrequentFromColumn(3, "column", schema, rdd);

    // Exactly the three non-unique values should be returned, with their counts
    assertEquals(3, map.size());
    assertEquals(4L, (long) map.get(new Text("MostCommon")));
    assertEquals(3L, (long) map.get(new Text("SecondMostCommon")));
    assertEquals(2L, (long) map.get(new Text("ThirdMostCommon")));
}
 
Example #2
Source File: TestAnalysis.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
public void testSampleMostFrequent() {

    // Fifteen two-column rows: the first column is a throwaway label, the second is
    // the value whose frequency is measured. Counts by construction:
    // "MostCommon" x4, "SecondMostCommon" x3, "ThirdMostCommon" x2, plus six singletons.
    List<List<Writable>> input = Arrays.asList(
            Arrays.<Writable>asList(new Text("a"), new Text("MostCommon")),
            Arrays.<Writable>asList(new Text("b"), new Text("SecondMostCommon")),
            Arrays.<Writable>asList(new Text("c"), new Text("SecondMostCommon")),
            Arrays.<Writable>asList(new Text("d"), new Text("0")),
            Arrays.<Writable>asList(new Text("e"), new Text("MostCommon")),
            Arrays.<Writable>asList(new Text("f"), new Text("ThirdMostCommon")),
            Arrays.<Writable>asList(new Text("c"), new Text("MostCommon")),
            Arrays.<Writable>asList(new Text("h"), new Text("1")),
            Arrays.<Writable>asList(new Text("i"), new Text("SecondMostCommon")),
            Arrays.<Writable>asList(new Text("j"), new Text("2")),
            Arrays.<Writable>asList(new Text("k"), new Text("ThirdMostCommon")),
            Arrays.<Writable>asList(new Text("l"), new Text("MostCommon")),
            Arrays.<Writable>asList(new Text("m"), new Text("3")),
            Arrays.<Writable>asList(new Text("n"), new Text("4")),
            Arrays.<Writable>asList(new Text("o"), new Text("5")));

    JavaRDD<List<Writable>> inputRdd = sc.parallelize(input);

    // Schema with two string columns; the analysis targets "column" only
    Schema schema = new Schema.Builder().addColumnsString("irrelevant", "column").build();

    // Sample the 3 most frequent values of "column"
    Map<Writable, Long> frequencies =
            AnalyzeSpark.sampleMostFrequentFromColumn(3, "column", schema, inputRdd);

    // Expect exactly the three repeated values, each mapped to its occurrence count
    assertEquals(3, frequencies.size());
    assertEquals(4L, (long) frequencies.get(new Text("MostCommon")));
    assertEquals(3L, (long) frequencies.get(new Text("SecondMostCommon")));
    assertEquals(2L, (long) frequencies.get(new Text("ThirdMostCommon")));
}
 
Example #3
Source File: TestAnalysis.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
@Test
    public void testAnalysisAllColTypes(){

        // One column of every supported type, so analyze() exercises every analysis path
        Schema s = new Schema.Builder()
                .addColumn(new BinaryMetaData("binary"))
                .addColumn(new BooleanMetaData("boolean"))
                .addColumnCategorical("categorical", "a", "b")
                .addColumnDouble("double")
                .addColumnFloat("float")
                .addColumnInteger("integer")
                .addColumnLong("long")
                .addColumnNDArray("ndarray", new long[]{1,4})
                .addColumnString("string")
                .addColumnTime("time", TimeZone.getDefault())
                .build();

        // Two rows matching the schema's column order
        List<Writable> row0 = Arrays.asList(new BytesWritable(new byte[3]), new BooleanWritable(true), new Text("a"),
                new DoubleWritable(1.0), new FloatWritable(1.0f), new IntWritable(1),
                new LongWritable(1L), new NDArrayWritable(Nd4j.create(DataType.FLOAT, 1, 4)), new Text("text"),
                new LongWritable(100L));
        List<Writable> row1 = Arrays.asList(new BytesWritable(new byte[3]), new BooleanWritable(false), new Text("b"),
                new DoubleWritable(0.0), new FloatWritable(0.0f), new IntWritable(0),
                new LongWritable(0L), new NDArrayWritable(Nd4j.create(DataType.FLOAT, 1, 4)), new Text("text2"),
                new LongWritable(101L));
        List<List<Writable>> records = Arrays.asList(row0, row1);

        JavaRDD<List<Writable>> recordsRdd = sc.parallelize(records);
        DataAnalysis analysis = AnalyzeSpark.analyze(s, recordsRdd);

        // Smoke checks only: serialization to String/JSON must not throw
        analysis.toString();
        analysis.toJson();
    }
 
Example #4
Source File: TestAnalysis.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
@Test
    public void testAnalysisVsLocal() throws Exception {

        // Four double columns plus an integer label, matching the iris.txt CSV layout
        Schema s = new Schema.Builder()
                .addColumnsDouble("%d", 0, 3)
                .addColumnInteger("label")
                .build();

        RecordReader rr = new CSVRecordReader();
        rr.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile()));

        // Fix: close the reader when done — RecordReader is Closeable and the
        // original test leaked the underlying file handle (always, and especially
        // on an assertion failure). try/finally keeps the close on every path.
        try {
            // Materialize all records so the same data feeds both local and Spark analysis
            List<List<Writable>> toParallelize = new ArrayList<>();
            while (rr.hasNext()) {
                toParallelize.add(rr.next());
            }

            // coalesce(1): single partition so Spark's aggregation order matches local
            JavaRDD<List<Writable>> rdd = sc.parallelize(toParallelize).coalesce(1);

            // Full data analysis: local and Spark implementations must agree
            rr.reset();
            DataAnalysis local = AnalyzeLocal.analyze(s, rr);
            DataAnalysis spark = AnalyzeSpark.analyze(s, rdd);

            assertEquals(local, spark);

            // Quality analysis must also agree
            rr.reset();
            DataQualityAnalysis localQ = AnalyzeLocal.analyzeQuality(s, rr);
            DataQualityAnalysis sparkQ = AnalyzeSpark.analyzeQuality(s, rdd);

            assertEquals(localQ, sparkQ);

            // Unique-value extraction: same keys, and same values per column
            // (Spark returns Lists, local returns Sets — compare as sets)
            rr.reset();
            Map<String, Set<Writable>> mapLocal = AnalyzeLocal.getUnique(s.getColumnNames(), s, rr);
            Map<String, List<Writable>> mapSpark = AnalyzeSpark.getUnique(s.getColumnNames(), s, rdd);

            assertEquals(mapLocal.keySet(), mapSpark.keySet());
            for (String k : mapLocal.keySet()) {
                assertEquals(mapLocal.get(k), new HashSet<Writable>(mapSpark.get(k)));
            }
        } finally {
            rr.close();
        }
    }