org.apache.commons.math3.stat.correlation.Covariance Java Examples
The following examples show how to use
org.apache.commons.math3.stat.correlation.Covariance.
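Before the project examples, here is a minimal, self-contained sketch of the two main ways the class is used. The data values are made up purely for illustration; the API calls (covariance and getCovarianceMatrix) are the standard Commons Math 3 ones.

import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.stat.correlation.Covariance;

public class CovarianceDemo {
    public static void main(String[] args) {
        double[] x = { 1.0, 2.0, 3.0, 4.0 };
        double[] y = { 2.0, 4.0, 5.0, 4.0 };

        // Bivariate form: the boolean selects bias correction.
        // true  -> sample covariance (divide by n - 1)
        // false -> population covariance (divide by n)
        double sampleCov = new Covariance().covariance(x, y, true);
        double populationCov = new Covariance().covariance(x, y, false);

        // Matrix form: rows are observations, columns are variables.
        double[][] data = { { 1.0, 2.0 }, { 2.0, 4.0 }, { 3.0, 5.0 }, { 4.0, 4.0 } };
        RealMatrix covMatrix = new Covariance(data).getCovarianceMatrix();

        System.out.println(sampleCov + " / " + populationCov);
        System.out.println(covMatrix);
    }
}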
Example #1
Source File: TestDoubleCovarianceSampAggregation.java From presto with Apache License 2.0
@Override
protected Object getExpectedValue(int start, int length)
{
    if (length <= 1) {
        return null;
    }
    return new Covariance().covariance(
            constructDoublePrimitiveArray(start + 5, length),
            constructDoublePrimitiveArray(start, length),
            true);
}
Example #2
Source File: MultivariateNormalDistributionTest.java From astor with GNU General Public License v2.0
/**
 * Test the accuracy of sampling from the distribution.
 */
@Test
public void testSampling() {
    final double[] mu = { -1.5, 2 };
    final double[][] sigma = { { 2, -1.1 }, { -1.1, 2 } };
    final MultivariateNormalDistribution d = new MultivariateNormalDistribution(mu, sigma);
    d.reseedRandomGenerator(50);

    final int n = 500000;
    final double[][] samples = d.sample(n);
    final int dim = d.getDimension();
    final double[] sampleMeans = new double[dim];

    for (int i = 0; i < samples.length; i++) {
        for (int j = 0; j < dim; j++) {
            sampleMeans[j] += samples[i][j];
        }
    }

    final double sampledValueTolerance = 1e-2;
    for (int j = 0; j < dim; j++) {
        sampleMeans[j] /= samples.length;
        Assert.assertEquals(mu[j], sampleMeans[j], sampledValueTolerance);
    }

    final double[][] sampleSigma = new Covariance(samples).getCovarianceMatrix().getData();
    for (int i = 0; i < dim; i++) {
        for (int j = 0; j < dim; j++) {
            Assert.assertEquals(sigma[i][j], sampleSigma[i][j], sampledValueTolerance);
        }
    }
}
Example #3
Source File: TestRealCovarianceSampAggregation.java From presto with Apache License 2.0
@Override
protected Object getExpectedValue(int start, int length)
{
    if (length <= 1) {
        return null;
    }
    return (float) new Covariance().covariance(
            constructDoublePrimitiveArray(start + 5, length),
            constructDoublePrimitiveArray(start, length),
            true);
}
Example #4
Source File: CorrelationExample.java From Java-Data-Analysis with MIT License
static double rho(double[][] data) {
    Variance v = new Variance();
    double varX = v.evaluate(data[0]);
    double sigX = Math.sqrt(varX);
    double varY = v.evaluate(data[1]);
    double sigY = Math.sqrt(varY);
    Covariance c = new Covariance(data);
    double sigXY = c.covariance(data[0], data[1]);
    return sigXY / (sigX * sigY);
}
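The rho method above assembles the Pearson correlation coefficient by hand as cov(X, Y) / (sigmaX * sigmaY). Two side notes: the argument passed to the Covariance constructor is not actually used here, since the bivariate covariance(double[], double[]) call computes directly from the two arrays; and Commons Math provides the same quantity directly via PearsonsCorrelation. A sketch of the direct version, assuming the same data layout as above (row 0 = X, row 1 = Y):

import org.apache.commons.math3.stat.correlation.PearsonsCorrelation;

static double rhoDirect(double[][] data) {
    // Internally normalizes the sample covariance by the sample
    // standard deviations, matching rho(...) above
    return new PearsonsCorrelation().correlation(data[0], data[1]);
}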
Example #5
Source File: TestDoubleCovariancePopAggregation.java From presto with Apache License 2.0
@Override
protected Object getExpectedValue(int start, int length)
{
    if (length <= 0) {
        return null;
    }
    if (length == 1) {
        return 0.;
    }
    Covariance covariance = new Covariance();
    return covariance.covariance(
            constructDoublePrimitiveArray(start + 5, length),
            constructDoublePrimitiveArray(start, length),
            false);
}
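This population-variant test passes false where the earlier *SampAggregation tests pass true. The two estimates differ only in normalization: the sample covariance divides by n - 1, the population covariance by n, so they are related by a factor of (n - 1) / n. A quick sanity check with made-up data:

double[] a = { 1, 2, 3, 4, 5 };
double[] b = { 5, 3, 4, 1, 2 };
double sample     = new Covariance().covariance(a, b, true);   // divides by n - 1
double population = new Covariance().covariance(a, b, false);  // divides by n
// population == sample * (a.length - 1) / a.length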
Example #6
Source File: TestRealCovariancePopAggregation.java From presto with Apache License 2.0
@Override
protected Object getExpectedValue(int start, int length)
{
    if (length <= 0) {
        return null;
    }
    if (length == 1) {
        return 0.f;
    }
    Covariance covariance = new Covariance();
    return (float) covariance.covariance(
            constructDoublePrimitiveArray(start + 5, length),
            constructDoublePrimitiveArray(start, length),
            false);
}
Example #7
Source File: MultivariateNormalMixtureExpectationMaximization.java From astor with GNU General Public License v2.0
/**
 * Helper method to create a multivariate normal mixture model which can be
 * used to initialize {@link #fit(MixtureMultivariateNormalDistribution)}.
 *
 * This method uses the data supplied to the constructor to try to determine
 * a good mixture model at which to start the fit, but it is not guaranteed
 * to supply a model which will find the optimal solution or even converge.
 *
 * @param data Data to estimate distribution
 * @param numComponents Number of components for estimated mixture
 * @return Multivariate normal mixture model estimated from the data
 * @throws NumberIsTooLargeException if {@code numComponents} is greater
 * than the number of data rows.
 * @throws NumberIsTooSmallException if {@code numComponents < 2}.
 * @throws NotStrictlyPositiveException if data has less than 2 rows
 * @throws DimensionMismatchException if rows of data have different numbers
 * of columns
 */
public static MixtureMultivariateNormalDistribution estimate(final double[][] data,
                                                             final int numComponents)
    throws NotStrictlyPositiveException,
           DimensionMismatchException {
    if (data.length < 2) {
        throw new NotStrictlyPositiveException(data.length);
    }
    if (numComponents < 2) {
        throw new NumberIsTooSmallException(numComponents, 2, true);
    }
    if (numComponents > data.length) {
        throw new NumberIsTooLargeException(numComponents, data.length, true);
    }

    final int numRows = data.length;
    final int numCols = data[0].length;

    // sort the data
    final DataRow[] sortedData = new DataRow[numRows];
    for (int i = 0; i < numRows; i++) {
        sortedData[i] = new DataRow(data[i]);
    }
    Arrays.sort(sortedData);

    // uniform weight for each bin
    final double weight = 1d / numComponents;

    // components of mixture model to be created
    final List<Pair<Double, MultivariateNormalDistribution>> components =
        new ArrayList<Pair<Double, MultivariateNormalDistribution>>(numComponents);

    // create a component based on data in each bin
    for (int binIndex = 0; binIndex < numComponents; binIndex++) {
        // minimum index (inclusive) from sorted data for this bin
        final int minIndex = (binIndex * numRows) / numComponents;

        // maximum index (exclusive) from sorted data for this bin
        final int maxIndex = ((binIndex + 1) * numRows) / numComponents;

        // number of data records that will be in this bin
        final int numBinRows = maxIndex - minIndex;

        // data for this bin
        final double[][] binData = new double[numBinRows][numCols];

        // mean of each column for the data in this bin
        final double[] columnMeans = new double[numCols];

        // populate bin and create component
        for (int i = minIndex, iBin = 0; i < maxIndex; i++, iBin++) {
            for (int j = 0; j < numCols; j++) {
                final double val = sortedData[i].getRow()[j];
                columnMeans[j] += val;
                binData[iBin][j] = val;
            }
        }

        MathArrays.scaleInPlace(1d / numBinRows, columnMeans);

        // covariance matrix for this bin
        final double[][] covMat
            = new Covariance(binData).getCovarianceMatrix().getData();
        final MultivariateNormalDistribution mvn
            = new MultivariateNormalDistribution(columnMeans, covMat);

        components.add(new Pair<Double, MultivariateNormalDistribution>(weight, mvn));
    }

    return new MixtureMultivariateNormalDistribution(components);
}
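In practice, estimate(...) is used to seed the EM fit. Below is a minimal sketch of the round trip, using invented one-dimensional data with two well-separated clusters so the snippet has something to run on; the class and method names are the Commons Math 3 ones.

import org.apache.commons.math3.distribution.MixtureMultivariateNormalDistribution;
import org.apache.commons.math3.distribution.fitting.MultivariateNormalMixtureExpectationMaximization;

double[][] data = new double[100][1];
for (int i = 0; i < 50; i++)   { data[i][0] = i * 0.01; }              // cluster near 0
for (int i = 50; i < 100; i++) { data[i][0] = 5.0 + (i - 50) * 0.01; } // cluster near 5

// Binned initial model from estimate(...), then refined by EM
MixtureMultivariateNormalDistribution initial =
    MultivariateNormalMixtureExpectationMaximization.estimate(data, 2);
MultivariateNormalMixtureExpectationMaximization em =
    new MultivariateNormalMixtureExpectationMaximization(data);
em.fit(initial);
MixtureMultivariateNormalDistribution fitted = em.getFittedModel();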
Example #8
Source File: GLSMultipleLinearRegressionTest.java From astor with GNU General Public License v2.0
/**
 * Generate an error covariance matrix and sample data representing models
 * with this error structure. Then verify that GLS estimated coefficients,
 * on average, perform better than OLS.
 */
@Test
public void testGLSEfficiency() {
    RandomGenerator rg = new JDKRandomGenerator();
    rg.setSeed(200);  // Seed has been selected to generate non-trivial covariance

    // Assume model has 16 observations (will use Longley data).  Start by generating
    // non-constant variances for the 16 error terms.
    final int nObs = 16;
    double[] sigma = new double[nObs];
    for (int i = 0; i < nObs; i++) {
        sigma[i] = 10 * rg.nextDouble();
    }

    // Now generate 1000 error vectors to use to estimate the covariance matrix
    // Columns are draws on N(0, sigma[col])
    final int numSeeds = 1000;
    RealMatrix errorSeeds = MatrixUtils.createRealMatrix(numSeeds, nObs);
    for (int i = 0; i < numSeeds; i++) {
        for (int j = 0; j < nObs; j++) {
            errorSeeds.setEntry(i, j, rg.nextGaussian() * sigma[j]);
        }
    }

    // Get covariance matrix for columns
    RealMatrix cov = (new Covariance(errorSeeds)).getCovarianceMatrix();

    // Create a CorrelatedRandomVectorGenerator to use to generate correlated errors
    GaussianRandomGenerator rawGenerator = new GaussianRandomGenerator(rg);
    double[] errorMeans = new double[nObs];  // Counting on init to 0 here
    CorrelatedRandomVectorGenerator gen = new CorrelatedRandomVectorGenerator(
            errorMeans, cov, 1.0e-12 * cov.getNorm(), rawGenerator);

    // Now start generating models.  Use Longley X matrix on LHS
    // and Longley OLS beta vector as "true" beta.  Generate
    // Y values by XB + u where u is a CorrelatedRandomVector generated
    // from cov.
    OLSMultipleLinearRegression ols = new OLSMultipleLinearRegression();
    ols.newSampleData(longley, nObs, 6);
    final RealVector b = ols.calculateBeta().copy();
    final RealMatrix x = ols.getX().copy();

    // Create a GLS model to reuse
    GLSMultipleLinearRegression gls = new GLSMultipleLinearRegression();
    gls.newSampleData(longley, nObs, 6);
    gls.newCovarianceData(cov.getData());

    // Create aggregators for stats measuring model performance
    DescriptiveStatistics olsBetaStats = new DescriptiveStatistics();
    DescriptiveStatistics glsBetaStats = new DescriptiveStatistics();

    // Generate Y vectors for 10000 models, estimate GLS and OLS and
    // verify that the GLS estimates are better
    final int nModels = 10000;
    for (int i = 0; i < nModels; i++) {
        // Generate y = xb + u with u cov
        RealVector u = MatrixUtils.createRealVector(gen.nextVector());
        double[] y = u.add(x.operate(b)).toArray();

        // Estimate OLS parameters
        ols.newYSampleData(y);
        RealVector olsBeta = ols.calculateBeta();

        // Estimate GLS parameters
        gls.newYSampleData(y);
        RealVector glsBeta = gls.calculateBeta();

        // Record deviations from "true" beta
        double dist = olsBeta.getDistance(b);
        olsBetaStats.addValue(dist * dist);
        dist = glsBeta.getDistance(b);
        glsBetaStats.addValue(dist * dist);
    }

    // Verify that GLS is on average more efficient, lower variance
    assert(olsBetaStats.getMean() > 1.5 * glsBetaStats.getMean());
    assert(olsBetaStats.getStandardDeviation() > glsBetaStats.getStandardDeviation());
}
Example #9
Source File: MultivariateNormalMixtureExpectationMaximization.java From astor with GNU General Public License v2.0
/**
 * Helper method to create a multivariate normal mixture model which can be
 * used to initialize {@link #fit(MixtureMultivariateRealDistribution)}.
 *
 * This method uses the data supplied to the constructor to try to determine
 * a good mixture model at which to start the fit, but it is not guaranteed
 * to supply a model which will find the optimal solution or even converge.
 *
 * @param data Data to estimate distribution
 * @param numComponents Number of components for estimated mixture
 * @return Multivariate normal mixture model estimated from the data
 * @throws NumberIsTooLargeException if {@code numComponents} is greater
 * than the number of data rows.
 * @throws NumberIsTooSmallException if {@code numComponents < 2}.
 * @throws NotStrictlyPositiveException if data has less than 2 rows
 * @throws DimensionMismatchException if rows of data have different numbers
 * of columns
 * @see #fit
 */
public static MixtureMultivariateNormalDistribution estimate(final double[][] data,
                                                             final int numComponents)
    throws NotStrictlyPositiveException,
           DimensionMismatchException {
    if (data.length < 2) {
        throw new NotStrictlyPositiveException(data.length);
    }
    if (numComponents < 2) {
        throw new NumberIsTooSmallException(numComponents, 2, true);
    }
    if (numComponents > data.length) {
        throw new NumberIsTooLargeException(numComponents, data.length, true);
    }

    final int numRows = data.length;
    final int numCols = data[0].length;

    // sort the data
    final DataRow[] sortedData = new DataRow[numRows];
    for (int i = 0; i < numRows; i++) {
        sortedData[i] = new DataRow(data[i]);
    }
    Arrays.sort(sortedData);

    final int totalBins = numComponents;

    // uniform weight for each bin
    final double weight = 1d / totalBins;

    // components of mixture model to be created
    final List<Pair<Double, MultivariateNormalDistribution>> components =
        new ArrayList<Pair<Double, MultivariateNormalDistribution>>();

    // create a component based on data in each bin
    for (int binNumber = 1; binNumber <= totalBins; binNumber++) {
        // minimum index from sorted data for this bin
        final int minIndex = (int) FastMath.max(0,
                FastMath.floor((binNumber - 1) * numRows / totalBins));

        // maximum index from sorted data for this bin
        final int maxIndex = (int) FastMath.ceil(binNumber * numRows / numComponents) - 1;

        // number of data records that will be in this bin
        final int numBinRows = maxIndex - minIndex + 1;

        // data for this bin
        final double[][] binData = new double[numBinRows][numCols];

        // mean of each column for the data in this bin
        final double[] columnMeans = new double[numCols];

        // populate bin and create component
        for (int i = minIndex, iBin = 0; i <= maxIndex; i++, iBin++) {
            for (int j = 0; j < numCols; j++) {
                final double val = sortedData[i].getRow()[j];
                columnMeans[j] += val;
                binData[iBin][j] = val;
            }
        }

        MathArrays.scaleInPlace(1d / numBinRows, columnMeans);

        // covariance matrix for this bin
        final double[][] covMat
            = new Covariance(binData).getCovarianceMatrix().getData();
        final MultivariateNormalDistribution mvn
            = new MultivariateNormalDistribution(columnMeans, covMat);

        components.add(new Pair<Double, MultivariateNormalDistribution>(weight, mvn));
    }

    return new MixtureMultivariateNormalDistribution(components);
}
Example #10
Source File: StrategyFilter.java From iMetrica with GNU General Public License v3.0
public static double[] maximizeSharpe(double[][] data, int n_basket, int nobs, int nonneg)
{
    int i, j;
    double[] means = new double[n_basket];
    double sum = 0;
    RealVector sol;
    double[] w = new double[n_basket];

    // mean return of each asset in the basket
    for (i = 0; i < n_basket; i++)
    {
        sum = 0;
        for (j = 0; j < nobs; j++)
        {
            sum = sum + data[j][i];
        }
        means[i] = sum / nobs;
        //System.out.println(means[i]);
    }

    RealVector m = new ArrayRealVector(means, false);
    Covariance covComp = new Covariance(data);
    RealMatrix rm = covComp.getCovarianceMatrix();

    try
    {
        // solve rm * w = m for the (unnormalized) weights
        DecompositionSolver solver = new QRDecomposition(rm).getSolver();
        sol = solver.solve(m);
        w = sol.toArray();
    }
    catch (SingularMatrixException sme)
    {
        //System.out.println("Matrix singular: setting weights to uniform");
        w = new double[n_basket];
        for (i = 0; i < n_basket; i++) { w[i] = 1.0 / n_basket; }
    }

    double sumw = 0;
    for (i = 0; i < w.length; i++)
    {
        if (nonneg == 1) { if (w[i] < 0) { w[i] = 1.0 / n_basket; } }
        else if (nonneg == 2) { w[i] = Math.abs(w[i]); }
        sumw = sumw + w[i];
    }
    for (i = 0; i < w.length; i++) { w[i] = w[i] / sumw; }

    return w;
}
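The numerical core of maximizeSharpe is the classic unconstrained tangency-portfolio solution: up to normalization, the maximum-Sharpe weights satisfy Sigma * w = mu, where Sigma is the covariance matrix of returns and mu the vector of mean returns. Stripped of the sign-handling details, the kernel looks like the following sketch; the method and variable names here are illustrative, and returns is assumed to be an observations-by-assets matrix.

import java.util.Arrays;
import org.apache.commons.math3.linear.*;
import org.apache.commons.math3.stat.correlation.Covariance;

static double[] tangencyWeights(double[][] returns, double[] meanReturns) {
    // Solve Sigma * w = mu for the unnormalized tangency weights
    RealMatrix sigma = new Covariance(returns).getCovarianceMatrix();
    RealVector mu = new ArrayRealVector(meanReturns, false);
    double[] w = new QRDecomposition(sigma).getSolver().solve(mu).toArray();

    // Normalize so the weights sum to one
    double s = Arrays.stream(w).sum();
    for (int i = 0; i < w.length; i++) { w[i] /= s; }
    return w;
}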
Example #11
Source File: EvolutionPanel.java From iMetrica with GNU General Public License v3.0
public void fuseStrategies()
{
    if (n_saved_perf > 0)
    {
        int i, j, k;
        int npos = 0;
        int n_basket = n_saved_perf + 1;
        int min_obs = mdfaEvolutionCanvas.min_obs;
        double[][] data = new double[min_obs][n_basket];
        double[] w = new double[n_basket];
        double[] target = new double[min_obs];

        // fill with current strategy first
        for (i = 0; i < min_obs; i++)
        {
            data[min_obs - 1 - i][0] = performances[performances.length - 1 - i].getReturn();
        }
        for (k = 0; k < n_saved_perf; k++)
        {
            JInvestment[] temp = portfolio_invest.get(k);
            for (i = 0; i < min_obs; i++)
            {
                data[min_obs - 1 - i][k + 1] = temp[temp.length - 1 - i].getReturn();
            }
        }

        double[] means = new double[n_basket];
        double sum = 0;
        RealVector sol;

        for (i = 0; i < n_basket; i++)
        {
            sum = 0;
            for (j = 0; j < min_obs; j++) { sum = sum + data[j][i]; }
            means[i] = sum / min_obs;
        }

        RealVector m = new ArrayRealVector(means, false);
        Covariance covComp = new Covariance(data);
        RealMatrix rm = covComp.getCovarianceMatrix();

        if (uniformWeightsCheck.isSelected())
        {
            for (i = 0; i < n_basket; i++) { w[i] = 1.0 / n_basket; }
        }
        else if (maxSharpeWeightsCheck.isSelected())
        {
            try
            {
                DecompositionSolver solver = new QRDecomposition(rm).getSolver();
                sol = solver.solve(m);
                w = sol.toArray();
            }
            catch (SingularMatrixException sme)
            {
                System.out.println("Matrix singular: setting weights to uniform");
                w = new double[n_basket];
                for (i = 0; i < n_basket; i++) { w[i] = 1.0 / n_basket; }
            }

            double sumw = 0;
            for (i = 0; i < w.length; i++)
            {
                if (w[i] < 0) { w[i] = 1.0 / n_basket; }
                sumw = sumw + w[i];
            }
            for (i = 0; i < w.length; i++) { w[i] = w[i] / sumw; }
        }

        for (i = 0; i < min_obs; i++)
        {
            sum = 0;
            for (k = 0; k < n_basket; k++) { sum = sum + data[i][k] * w[k]; }
            target[i] = sum;
            if (target[i] > 0) { npos++; }
        }

        double[] mstd = mean_std(target);
        sharpe_ratio = Math.sqrt(250) * mstd[0] / mstd[1];
        double[] cum_port_returns = cumsum(target, min_obs);
        max_drawdown = computeDrawdown(cum_port_returns);

        if (realrets) { cum_port_returns = cumsum(target, target.length); }

        double bRatio = (double) npos / min_obs;
        mdfaEvolutionCanvas.addAggregate(cum_port_returns,
            new String("" + df2.format(sharpe_ratio) + ", "
                + df2.format(max_drawdown) + ", " + df.format(bRatio)));
    }
}
Example #12
Source File: CovarianceTest.java From Java-Data-Science-Cookbook with MIT License
public void calculateCov(double[] x, double[] y) {
    // false -> population (non-bias-corrected) covariance;
    // pass true (or omit the flag) for the bias-corrected sample covariance
    double covariance = new Covariance().covariance(x, y, false);
    System.out.println(covariance);
}
Example #13
Source File: StatsUtil.java From MeteoInfo with GNU Lesser General Public License v3.0
/**
 * Computes covariance of two arrays.
 *
 * @param x X data
 * @param y Y data
 * @param bias If true, returned value will be bias-corrected
 * @return The covariance
 */
public static double covariance(Array x, Array y, boolean bias) {
    double[] xd = (double[]) ArrayUtil.copyToNDJavaArray_Double(x);
    double[] yd = (double[]) ArrayUtil.copyToNDJavaArray_Double(y);
    double r = new Covariance().covariance(xd, yd, bias);
    return r;
}
Example #14
Source File: Matrix.java From buffer_bci with GNU General Public License v3.0
/**
 * Covariance of the columns of the matrix
 *
 * @return covariance matrix with size columns x columns
 */
public Matrix covariance() {
    Covariance cov = new Covariance(this.transpose(), true);
    return new Matrix(cov.getCovarianceMatrix());
}