Java Code Examples for org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory#setTokenPreProcessor()

The following examples show how to use org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory#setTokenPreProcessor(). The token preprocessor set on a tokenizer factory is applied to every token its tokenizers emit, so the examples below use it to normalize words (lowercasing, stripping punctuation, and so on) before training embedding models. The source file and originating project are noted above each example.
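For orientation, here is a minimal, self-contained sketch of the call in isolation. The demo class and sample sentence are illustrative, not from any of the projects below; the API calls are the standard deeplearning4j-nlp ones used throughout the examples, assuming that artifact is on the classpath.

import java.util.List;

import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class TokenPreProcessorDemo {
    public static void main(String[] args) {
        // DefaultTokenizerFactory splits on whitespace; the preprocessor runs on each resulting token
        TokenizerFactory factory = new DefaultTokenizerFactory();
        // CommonPreprocessor lowercases tokens and strips punctuation
        factory.setTokenPreProcessor(new CommonPreprocessor());

        Tokenizer tokenizer = factory.create("Hello, World! This is a Tokenizer demo.");
        List<String> tokens = tokenizer.getTokens();
        // expected along the lines of: [hello, world, this, is, a, tokenizer, demo]
        System.out.println(tokens);
    }
}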
Example 1
Source File: ManualTests.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testWord2VecPlot() throws Exception {
    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();

    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    // keep the JVM alive so the (commented-out) UI plot can be inspected manually
    Thread.sleep(10000000000L);
    fail("Not implemented");
}
 
Example 2
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
    WordVectors googleVectors = WordVectorSerializer.readWord2VecModel(new File("/ext/GoogleNews-vectors-negative300.bin.gz"));

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    ParagraphVectors pv =
                    new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
                                    .trainWordVectors(false).useExistingWordVectors(googleVectors)
                                    .negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
Example 3
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
                    new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    vec.fit();
}
 
Example 4
Source File: Word2VecModelExample.java    From Java-Deep-Learning-Cookbook with MIT License
public static void main(String[] args) throws Exception {
    final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile());
    SentenceDataPreProcessor.setPreprocessor(iterator);
    final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor());

    final Word2Vec model = new Word2Vec.Builder()
                                    .iterate(iterator)
                                    .tokenizerFactory(tokenizerFactory)
                                    .minWordFrequency(5)
                                    .layerSize(100)
                                    .seed(42)
                                    .epochs(50)
                                    .windowSize(5)
                                    .build();
    log.info("Fitting Word2Vec model....");
    model.fit();

    final Collection<String> words = model.wordsNearest("season", 10);
    for (final String word : words) {
        System.out.println(word);
    }
    final double cosSimilarity = model.similarity("season", "program");
    System.out.println(cosSimilarity);

    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
            .setMaxIter(100)
            .theta(0.5)
            .normalize(false)
            .learningRate(500)
            .useAdaGrad(false)
            .build();


    // Save word vectors for t-SNE visualization.
    WordVectorSerializer.writeWordVectors(model.lookupTable(), new File("words.txt"));
    WordVectorSerializer.writeWord2VecModel(model, "model.zip");

}
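Note that the BarnesHutTsne instance above is configured but never actually run. Below is a hedged sketch of feeding it the trained vectors; the fit/saveAsFile calls and output path follow the dl4j t-SNE examples and are not part of the original snippet.

    // labels and raw weight matrix (syn0) from the trained model
    final List<String> tsneLabels = new ArrayList<>();
    for (int i = 0; i < model.vocab().numWords(); i++) {
        tsneLabels.add(model.vocab().wordAtIndex(i));
    }
    final INDArray weights = ((InMemoryLookupTable<VocabWord>) model.lookupTable()).getSyn0();

    tsne.fit(weights);                              // project the vectors to 2D
    tsne.saveAsFile(tsneLabels, "tsne-coords.csv"); // one "label x y" row per word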
 
Example 5
Source File: Word2VecTest.java    From deeplearning4j with Apache License 2.0
@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest")
            .set("spark.driver.host", "localhost")
                    .set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g")
                    .set("spark.executor.memory", "8g");

    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Path of data part-00000
    //String dataPath = Resources.asFile("big/raw_sentences.txt").getAbsolutePath();
    //        String dataPath = "/ext/Temp/SampleRussianCorpus.txt";
    String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();

    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new LowCasePreProcessor());

    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1)
                    //     .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory")
                    //     .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor")
                    //     .setRemoveStop(false)
                    .tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5)
                    .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5)
                    .useUnknown(true).build();

    word2Vec.train(corpus);


    sc.stop();

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
 
Example 6
Source File: Word2VecRawTextExample.java    From Java-Data-Science-Cookbook with MIT License
public static void main(String[] args) throws Exception {

        // Gets Path to Text file
        String filePath = "c:/raw_sentences.txt";

        log.info("Load & Vectorize Sentences....");
        // Iterate over the sentences in the file
        SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
        // Split on white spaces in the line to get words
        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        InMemoryLookupCache cache = new InMemoryLookupCache();
        WeightLookupTable table = new InMemoryLookupTable.Builder()
                .vectorLength(100)
                .useAdaGrad(false)
                .cache(cache)
                .lr(0.025f).build();

        log.info("Building model....");
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5).iterations(1)
                .layerSize(100).lookupTable(table)
                .stopWords(new ArrayList<String>())
                .vocabCache(cache).seed(42)
                .windowSize(5).iterate(iter).tokenizerFactory(t).build();

        log.info("Fitting Word2Vec model....");
        vec.fit();

        log.info("Writing word vectors to text file....");
        // Write word
        WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

        log.info("Closest Words:");
        Collection<String> lst = vec.wordsNearest("man", 5); 
        System.out.println(lst);
        double cosSim = vec.similarity("cruise", "voyage");
        System.out.println(cosSim);
    }
 
Example 7
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testWord2VecCBOW() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(4)
                    .tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));

    //   assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
    assertTrue(sim > 0.65f);
}
 
Example 8
Source File: Word2VecDataSetIteratorTest.java    From deeplearning4j with Apache License 2.0
    /**
     * Basically all we want from this test - being able to finish without exceptions.
     */
    @Test
    public void testIterator1() throws Exception {

        File inputFile = Resources.asFile("big/raw_sentences.txt");
        SentenceIterator iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile);
//        SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10) // we make sure we'll have some missing words
                        .iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0)
                        .useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>())
                        .useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t)
                        .elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

        vec.fit();

        List<String> labels = new ArrayList<>();
        labels.add("positive");
        labels.add("negative");

        Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1);
        INDArray array = iterator.next().getFeatures();
        int count = 0;
        while (iterator.hasNext()) {
            DataSet ds = iterator.next();

            assertArrayEquals(array.shape(), ds.getFeatures().shape());

            if(!isIntegrationTests() && count++ > 20)
                break;  // raw_sentences.txt is 2.81 MB and takes a while to process; only the first 20 minibatches are checked in unit tests
        }
    }
 
Example 9
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testWordsNearestSum() throws IOException {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    log.info("Load & Vectorize Sentences....");
    SentenceIterator iter = new BasicLineIterator(inputFile);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .seed(42)
            .windowSize(5)
            .iterate(iter)
            .tokenizerFactory(t)
            .build();

    log.info("Fitting Word2Vec model....");
    vec.fit();
    log.info("Writing word vectors to text file....");
    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearestSum("day", 10);
    log.info("10 Words closest to 'day': {}", lst);
    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
    assertTrue(lst.contains("years"));
    assertTrue(lst.contains("time"));
}
 
Example 10
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testParagraphVectorsWithWordVectorsModelling1() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //Skip CUDA except for integration tests due to very slow test speed
    }

    File file = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(file);

    //        InMemoryLookupCache cache = new InMemoryLookupCache(false);
    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(3).epochs(1).layerSize(100)
                    .learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter).trainWordVectors(true)
                    .vocabCache(cache).tokenizerFactory(t).sampling(0).build();

    vec.fit();


    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");

    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    /*
        A few lines in the corpus are built from very similar words.
        These sentences should be fairly close to each other in vector space.
     */
    // line 3721: This is my way .
    // line 6348: This is my case .
    // line 9836: This is my house .
    // line 12493: This is my world .
    // line 16393: This is my work .

    // this is a special sentence that has nothing in common with the sentences above
    // line 9853: We now have one .

    assertTrue(vec.hasWord("DOC_3720"));

    double similarityD = vec.similarity("day", "night");
    log.info("day/night similarity: " + similarityD);

    double similarityW = vec.similarity("way", "work");
    log.info("way/work similarity: " + similarityW);

    double similarityH = vec.similarity("house", "world");
    log.info("house/world similarity: " + similarityH);

    double similarityC = vec.similarity("case", "way");
    log.info("case/way similarity: " + similarityC);

    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.7d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //        assertTrue(similarity2 > 0.7d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity2 > 0.7d);

    // likelihood in this case should be significantly lower
    // however, since corpus is small, and weight initialization is random-based, sometimes this test CAN fail
    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);


    double sim119 = vec.similarityToLabel("This is my case .", "DOC_6347");
    double sim120 = vec.similarityToLabel("This is my case .", "DOC_3720");
    log.info("1/2: " + sim119 + "/" + sim120);
    //assertEquals(similarity3, sim119, 0.001);
}
 
Example 11
Source File: WordVectorSerializerTest.java    From deeplearning4j with Apache License 2.0
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testOutputStream() throws Exception {
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();

    File inputFile = Resources.asFile("big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
                    .cache(cache).lr(0.025f).build();

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
                    .vocabCache(cache).seed(42)
                    //                .workers(6)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");

    WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file));

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);

    INDArray day2 = vec2.getWordVectorMatrix("day");

    assertEquals(day1, day2);

    File tempFile = File.createTempFile("tests", "Fdfs");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tempFile);

    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
}
 
Example 12
Source File: Word2VecTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testConcepts() throws Exception {
    // These are all default values for word2vec
    SparkConf sparkConf = new SparkConf().setMaster("local[8]")
            .set("spark.driver.host", "localhost")
            .setAppName("sparktest");

    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Path of data part-00000
    String dataPath = new ClassPathResource("big/raw_sentences.txt").getFile().getAbsolutePath();
    //        dataPath = "/ext/Temp/part-00000";
    //        String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();

    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1)
                    //     .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory")
                    //     .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor")
                    //     .setRemoveStop(false)
                    .tokenizerFactory(t).seed(42L).negative(10).useAdaGrad(false).layerSize(150).windowSize(5)
                    .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5)
                    .stopWords(Arrays.asList("three")).useUnknown(true).build();

    word2Vec.train(corpus);

    //word2Vec.setModelUtils(new FlatModelUtils());

    System.out.println("UNK: " + word2Vec.getWordVectorMatrix("UNK"));

    InMemoryLookupTable<VocabWord> table = (InMemoryLookupTable<VocabWord>) word2Vec.lookupTable();

    double sim = word2Vec.similarity("day", "night");
    System.out.println("day/night similarity: " + sim);
    /*
    System.out.println("Hornjo: " + word2Vec.getWordVectorMatrix("hornjoserbsce"));
    System.out.println("carro: " + word2Vec.getWordVectorMatrix("carro"));
    
    Collection<String> portu = word2Vec.wordsNearest("carro", 10);
    printWords("carro", portu, word2Vec);
    
    portu = word2Vec.wordsNearest("davi", 10);
    printWords("davi", portu, word2Vec);
    
    System.out.println("---------------------------------------");
    */

    Collection<String> words = word2Vec.wordsNearest("day", 10);
    printWords("day", words, word2Vec);

    assertTrue(words.contains("night"));
    assertTrue(words.contains("week"));
    assertTrue(words.contains("year"));

    sim = word2Vec.similarity("two", "four");
    System.out.println("two/four similarity: " + sim);

    words = word2Vec.wordsNearest("two", 10);
    printWords("two", words, word2Vec);

    // three should be absent due to stopWords
    assertFalse(words.contains("three"));

    assertTrue(words.contains("five"));
    assertTrue(words.contains("four"));

    sc.stop();


    // test serialization
    File tempFile = testDir.newFile("temp" + System.currentTimeMillis() + ".tmp");

    int idx1 = word2Vec.vocab().wordFor("day").getIndex();

    INDArray array1 = word2Vec.getWordVectorMatrix("day").dup();

    VocabWord word1 = word2Vec.vocab().elementAtIndex(0);

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), tempFile);

    WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile);

    VocabWord word2 = ((VocabCache<VocabWord>) vectors.vocab()).elementAtIndex(0);
    VocabWord wordIT = ((VocabCache<VocabWord>) vectors.vocab()).wordFor("it");
    int idx2 = vectors.vocab().wordFor("day").getIndex();

    INDArray array2 = vectors.getWordVectorMatrix("day").dup();

    System.out.println("word 'i': " + word2);
    System.out.println("word 'it': " + wordIT);

    assertEquals(idx1, idx2);
    assertEquals(word1, word2);
    assertEquals(array1, array2);
}
 
Example 13
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testParagraphVectorsDBOW() throws Exception {
    skipUnlessIntegrationTests();

    File file = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).seed(119).epochs(1)
                    .layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter)
                    .trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).negativeSample(0)
                    .allowParallelTokenization(true).useHierarchicSoftmax(true).sampling(0).workers(4)
                    .usePreciseWeightInit(true).sequenceLearningAlgorithm(new DBOW<VocabWord>()).build();

    vec.fit();

    assertFalse(((InMemoryLookupTable<VocabWord>)vec.getLookupTable()).getSyn0().isAttached());
    assertFalse(((InMemoryLookupTable<VocabWord>)vec.getLookupTable()).getSyn1().isAttached());

    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");

    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similariry: {}", simDN);

    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    //        assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    //      assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    //        assertTrue(similarity3 > 0.6d);

    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);


    // testing DBOW inference now

    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    INDArray inferredC1 = vec.inferVector("This is my day");
    INDArray inferredD1 = vec.inferVector("This is my night");

    log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));

    assertNotEquals(inferredA1, inferredC1);

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
    double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());

    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    log.info("Cos A/C: {}", cosAC1);
    log.info("Cos C/D: {}", cosCD1);

}
 
Example 14
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void testRunWord2Vec() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    // Strip white space before and after for each line
    /*val shakespear = new ClassPathResource("big/rnj.txt");
    SentenceIterator iter = new BasicLineIterator(shakespear.getFile());*/
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());


    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
                    .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
                    .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
                    //.negativeSample(10)
                    .epochs(1).windowSize(5).allowParallelTokenization(true)
                    .workers(6)
                    .usePreciseMode(true)
                    .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();
    File tempFile = File.createTempFile("temp", "temp");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeFullModel(vec, tempFile.getAbsolutePath());
    Collection<String> lst = vec.wordsNearest("day", 10);
    //log.info(Arrays.toString(lst.toArray()));
    printWords("day", lst, vec);

    assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    assertTrue(sim < 1.0);
    assertTrue(sim > 0.4);


    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));

    assertFalse(lst.contains(null));


    lst = vec.wordsNearest("day", 10);
    //log.info(Arrays.toString(lst.toArray()));
    printWords("day", lst, vec);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));

    new File("cache.ser").delete();

    ArrayList<String> labels = new ArrayList<>();
    labels.add("day");
    labels.add("night");
    labels.add("week");

    INDArray matrix = vec.getWordVectors(labels);
    assertEquals(matrix.getRow(0, true), vec.getWordVectorMatrix("day"));
    assertEquals(matrix.getRow(1, true), vec.getWordVectorMatrix("night"));
    assertEquals(matrix.getRow(2, true), vec.getWordVectorMatrix("week"));

    WordVectorSerializer.writeWordVectors(vec, pathToWriteto);
}
 
Example 15
Source File: Word2VecTests.java    From deeplearning4j with Apache License 2.0
@Test
public void reproducibleResults_ForMultipleRuns() throws Exception {
    String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend");
    if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) {
        skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X
    }

    log.info("reproducibleResults_ForMultipleRuns");
    val shakespear = new ClassPathResource("big/rnj.txt");
    val basic = new ClassPathResource("big/rnj.txt");
    SentenceIterator iter = new BasicLineIterator(inputFile);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec1 = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
            .epochs(1).windowSize(5).allowParallelTokenization(true)
            .workers(1)
            .useHierarchicSoftmax(true)
            .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    Word2Vec vec2 = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
            .epochs(1).windowSize(5).allowParallelTokenization(true)
            .workers(1)
            .useHierarchicSoftmax(true)
            .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    vec1.fit();

    iter.reset();

    vec2.fit();

    for (int e = 0; e < vec1.getVocab().numWords(); e++) {
        val w1 = vec1.getVocab().elementAtIndex(e);
        val w2 = vec2.getVocab().elementAtIndex(e);

        assertNotNull(w1);
        assertNotNull(w2);

        assertEquals(w1.getLabel(), w2.getLabel());

        assertArrayEquals("Failed for token [" + w1.getLabel() + "] at index [" + e + "]", Ints.toArray(w1.getPoints()), Ints.toArray(w2.getPoints()));
        assertArrayEquals("Failed for token [" + w1.getLabel() + "] at index [" + e + "]", Ints.toArray(w1.getCodes()), Ints.toArray(w2.getCodes()));
    }

    val syn0_from_vec1 = ((InMemoryLookupTable<VocabWord>) vec1.getLookupTable()).getSyn0();
    val syn0_from_vec2 = ((InMemoryLookupTable<VocabWord>) vec2.getLookupTable()).getSyn0();

    assertEquals(syn0_from_vec1, syn0_from_vec2);

    log.info("Day/night similarity: {}", vec1.similarity("day", "night"));
    val result = vec1.wordsNearest("day", 10);
    printWords("day", result, vec1);
}
 
Example 16
Source File: ParagraphVectorsTest.java    From deeplearning4j with Apache License 2.0
/**
 * This test checks how the vocabulary is built from the provided SentenceIterator, without labels.
 *
 * @throws Exception
 */
@Test(timeout = 2400000)
public void testParagraphVectorsVocabBuilding1() throws Exception {
    File file = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(file); //UimaSentenceIterator.createWithPath(file.getAbsolutePath());

    int numberOfLines = 0;
    while (iter.hasNext()) {
        iter.nextSentence();
        numberOfLines++;
    }

    iter.reset();

    InMemoryLookupCache cache = new InMemoryLookupCache(false);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).layerSize(100)
                    //      .labelsGenerator(source)
                    .windowSize(5).iterate(iter).vocabCache(cache).tokenizerFactory(t).build();

    vec.buildVocab();

    LabelsSource source = vec.getLabelsSource();


    //VocabCache cache = vec.getVocab();
    log.info("Number of lines in corpus: " + numberOfLines);
    assertEquals(numberOfLines, source.getLabels().size());
    assertEquals(97162, source.getLabels().size());

    assertNotEquals(null, cache);
    assertEquals(97406, cache.numWords());

    // proper number of words for minWordFrequency = 1 is 244
    assertEquals(244, cache.numWords() - source.getLabels().size());
}
 
Example 17
Source File: SequenceVectorsTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testInternalVocabConstruction() throws Exception {
    ClassPathResource resource = new ClassPathResource("big/raw_sentences.txt");
    File file = resource.getFile();

    BasicLineIterator underlyingIterator = new BasicLineIterator(file);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    SentenceTransformer transformer =
                    new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();

    AbstractSequenceIterator<VocabWord> sequenceIterator =
                    new AbstractSequenceIterator.Builder<>(transformer).build();

    SequenceVectors<VocabWord> vectors = new SequenceVectors.Builder<VocabWord>(new VectorsConfiguration())
                    .minWordFrequency(5).iterate(sequenceIterator).batchSize(250).iterations(1).epochs(1)
                    .resetModel(false).trainElementsRepresentation(true).build();


    logger.info("Fitting model...");

    vectors.fit();

    logger.info("Model ready...");

    double sim = vectors.similarity("day", "night");
    logger.info("Day/night similarity: " + sim);
    assertTrue(sim > 0.6d);

    Collection<String> labels = vectors.wordsNearest("day", 10);
    logger.info("Nearest labels to 'day': " + labels);
}
 
Example 18
Source File: InMemoryLookupTableTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testConsumeOnEqualVocabs() throws Exception {
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();


    File resource = Resources.asFile("big/raw_sentences.txt");

    BasicLineIterator underlyingIterator = new BasicLineIterator(resource);


    SentenceTransformer transformer =
                    new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();

    AbstractSequenceIterator<VocabWord> sequenceIterator =
                    new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> vocabConstructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequenceIterator, 1).setTargetVocabCache(cacheSource).build();

    vocabConstructor.buildJointVocabulary(false, true);

    assertEquals(244, cacheSource.numWords());

    InMemoryLookupTable<VocabWord> mem1 =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100)
                                    .cache(cacheSource).seed(17).build();

    mem1.resetWeights(true);

    InMemoryLookupTable<VocabWord> mem2 =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100)
                                    .cache(cacheSource).seed(15).build();

    mem2.resetWeights(true);

    assertNotEquals(mem1.vector("day"), mem2.vector("day"));

    mem2.consume(mem1);

    assertEquals(mem1.vector("day"), mem2.vector("day"));

}
 
Example 19
Source File: InMemoryLookupTableTest.java    From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testConsumeOnNonEqualVocabs() throws Exception {
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    AbstractCache<VocabWord> cacheSource = new AbstractCache.Builder<VocabWord>().build();


    File resource = Resources.asFile("big/raw_sentences.txt");

    BasicLineIterator underlyingIterator = new BasicLineIterator(resource);


    SentenceTransformer transformer =
                    new SentenceTransformer.Builder().iterator(underlyingIterator).tokenizerFactory(t).build();

    AbstractSequenceIterator<VocabWord> sequenceIterator =
                    new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> vocabConstructor = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequenceIterator, 1).setTargetVocabCache(cacheSource).build();

    vocabConstructor.buildJointVocabulary(false, true);

    assertEquals(244, cacheSource.numWords());

    InMemoryLookupTable<VocabWord> mem1 =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100)
                                    .cache(cacheSource).build();

    mem1.resetWeights(true);



    AbstractCache<VocabWord> cacheTarget = new AbstractCache.Builder<VocabWord>().build();


    val dir = testDir.newFolder();
    new ClassPathResource("/paravec/labeled/").copyDirectory(dir);

    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
                    .addSourceFolder(dir).build();

    transformer = new SentenceTransformer.Builder().iterator(labelAwareIterator).tokenizerFactory(t).build();

    sequenceIterator = new AbstractSequenceIterator.Builder<>(transformer).build();

    VocabConstructor<VocabWord> vocabTransfer = new VocabConstructor.Builder<VocabWord>()
                    .addSource(sequenceIterator, 1).setTargetVocabCache(cacheTarget).build();

    vocabTransfer.buildMergedVocabulary(cacheSource, true);

    // the +3 accounts for 3 additional entries (the labels) in the target VocabCache
    assertEquals(cacheSource.numWords() + 3, cacheTarget.numWords());


    InMemoryLookupTable<VocabWord> mem2 =
                    (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().vectorLength(100)
                                    .cache(cacheTarget).seed(18).build();

    mem2.resetWeights(true);

    assertNotEquals(mem1.vector("day"), mem2.vector("day"));

    mem2.consume(mem1);

    assertEquals(mem1.vector("day"), mem2.vector("day"));

    assertTrue(mem1.syn0.rows() < mem2.syn0.rows());

    assertEquals(mem1.syn0.rows() + 3, mem2.syn0.rows());
}