org.apache.spark.sql.Dataset Java Examples
The following examples show how to use
org.apache.spark.sql.Dataset.
Each example notes the project and source file it comes from, so you can follow those references back to the original source.
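For orientation, here is a minimal, self-contained sketch of the Dataset API that the examples below build on. It is not taken from any of the listed projects; the file path and column names ("data/people.csv", "name", "age") are illustrative assumptions.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class DatasetQuickstart {
  public static void main(String[] args) {
    // Local session for illustration only; real deployments configure a cluster master.
    SparkSession spark = SparkSession.builder()
        .appName("Dataset Quickstart")
        .master("local[*]")
        .getOrCreate();

    // Read a CSV file into an untyped Dataset<Row> (a DataFrame).
    // The path and the "name"/"age" columns are hypothetical.
    Dataset<Row> people = spark.read()
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("data/people.csv");

    // Typical transformations are lazy; show() and count() are actions that trigger execution.
    Dataset<Row> adults = people.filter("age >= 18").select("name", "age");
    adults.show(10);
    System.out.println("adult rows: " + adults.count());

    spark.stop();
  }
}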
Example #1
Source File: SparkDataSet.java From spliceengine with GNU Affero General Public License v3.0 | 7 votes |
@SuppressWarnings({ "unchecked", "rawtypes" }) @Override public DataSet< V> intersect(DataSet< V> dataSet, String name, OperationContext context, boolean pushScope, String scopeDetail) throws StandardException { pushScopeIfNeeded(context, pushScope, scopeDetail); try { //Convert this rdd backed iterator to a Spark untyped dataset Dataset<Row> left = SpliceSpark.getSession() .createDataFrame( rdd.map( new LocatedRowToRowFunction()), context.getOperation() .getExecRowDefinition() .schema()); return new NativeSparkDataSet(left, context).intersect(dataSet, name, context, pushScope, scopeDetail); }finally { if (pushScope) context.popScope(); } }
Example #2
Source File: WindowAssignTranslatorBatch.java From beam with Apache License 2.0 | 6 votes |
@Override
public void translateTransform(
    PTransform<PCollection<T>, PCollection<T>> transform, TranslationContext context) {
  Window.Assign<T> assignTransform = (Window.Assign<T>) transform;
  @SuppressWarnings("unchecked")
  final PCollection<T> input = (PCollection<T>) context.getInput();
  @SuppressWarnings("unchecked")
  final PCollection<T> output = (PCollection<T>) context.getOutput();

  Dataset<WindowedValue<T>> inputDataset = context.getDataset(input);
  if (WindowingHelpers.skipAssignWindows(assignTransform, context)) {
    context.putDataset(output, inputDataset);
  } else {
    WindowFn<T, ?> windowFn = assignTransform.getWindowFn();
    WindowedValue.FullWindowedValueCoder<T> windowedValueCoder =
        WindowedValue.FullWindowedValueCoder.of(input.getCoder(), windowFn.windowCoder());
    Dataset<WindowedValue<T>> outputDataset =
        inputDataset.map(
            WindowingHelpers.assignWindowsMapFunction(windowFn),
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDataset(output, outputDataset);
  }
}
Example #3
Source File: TestInListDeriver.java From envelope with Apache License 2.0 | 6 votes |
@Test
public void testWrongField() throws Exception {
  thrown.expect(RuntimeException.class);
  thrown.expectMessage("Error executing IN list filtering");

  Dataset<Row> source = createTestDataframe();
  List<String> inListLiteral = Arrays.asList("1", "2", "3");

  Map<String, Dataset<Row>> dependencies = new HashMap<>();
  dependencies.put("df1", source);

  Config config = ConfigFactory.empty()
      .withValue(InListDeriver.INLIST_STEP_CONFIG, ConfigValueFactory.fromAnyRef("df1"))
      .withValue(InListDeriver.INLIST_FIELD_CONFIG, ConfigValueFactory.fromAnyRef("non_existing_field"))
      .withValue(InListDeriver.INLIST_VALUES_CONFIG, ConfigValueFactory.fromIterable(inListLiteral));

  InListDeriver deriver = new InListDeriver();
  assertNoValidationFailures(deriver, config);
  deriver.configure(config);
  deriver.derive(dependencies);
}
Example #4
Source File: TestImpalaMetadataTask.java From envelope with Apache License 2.0 | 6 votes |
@Test
public void testDeriveDropRangePartitionBoundariesQuery() {
  Map<String, Object> configMap = new HashMap<>();
  configMap.put(HOST_CONFIG, "testhost");
  configMap.put(QUERY_TYPE_CONFIG, "drop_partition");
  configMap.put(QUERY_TABLE_CONFIG, "testtable");
  configMap.put(QUERY_PART_RANGE_START_CONFIG, "20190122");
  configMap.put(QUERY_PART_RANGE_END_CONFIG, "20190123");
  configMap.put(AUTH_CONFIG, "none");
  Config config = ConfigFactory.parseMap(configMap);

  ImpalaMetadataTask metadataTask = new ImpalaMetadataTask();
  metadataTask.configure(config);

  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  String query = metadataTask.deriveQuery(dependencies);

  assertEquals("ALTER TABLE testtable DROP IF EXISTS RANGE PARTITION 20190122 <= VALUES < 20190123", query);
}
Example #5
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0 | 6 votes |
private List<ManifestFile> writeManifestsForUnpartitionedTable(Dataset<Row> manifestEntryDF, int numManifests) {
  Broadcast<FileIO> io = sparkContext.broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we rely only on the target number of manifests for unpartitioned tables
  // as we should not worry about having too much metadata per partition
  long maxNumManifestEntries = Long.MAX_VALUE;

  return manifestEntryDF
      .repartition(numManifests)
      .mapPartitions(
          toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
          manifestEncoder
      )
      .collectAsList();
}
Example #6
Source File: ITTestHDFSParquetImportCommand.java From hudi with Apache License 2.0 | 6 votes |
/**
 * Verifies that the data read back from the table equals the expected records.
 */
private void verifyResultData(List<GenericRecord> expectData) {
  Dataset<Row> ds = HoodieClientTestUtils.read(jsc, tablePath, sqlContext, fs, tablePath + "/*/*/*/*");

  List<Row> readData = ds.select("timestamp", "_row_key", "rider", "driver",
      "begin_lat", "begin_lon", "end_lat", "end_lon").collectAsList();
  List<HoodieTripModel> result = readData.stream().map(row ->
      new HoodieTripModel(row.getDouble(0), row.getString(1), row.getString(2), row.getString(3),
          row.getDouble(4), row.getDouble(5), row.getDouble(6), row.getDouble(7)))
      .collect(Collectors.toList());

  List<HoodieTripModel> expected = expectData.stream().map(g ->
      new HoodieTripModel(Double.parseDouble(g.get("timestamp").toString()),
          g.get("_row_key").toString(),
          g.get("rider").toString(),
          g.get("driver").toString(),
          Double.parseDouble(g.get("begin_lat").toString()),
          Double.parseDouble(g.get("begin_lon").toString()),
          Double.parseDouble(g.get("end_lat").toString()),
          Double.parseDouble(g.get("end_lon").toString())))
      .collect(Collectors.toList());

  assertAll("Result list equals",
      () -> assertEquals(expected.size(), result.size()),
      () -> assertTrue(result.containsAll(expected) && expected.containsAll(result)));
}
Example #7
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
@Test
public void testOutputDataFrameFromMatrixDML() {
  System.out.println("MLContextTest - output DataFrame from matrix DML");

  String s = "M = matrix('1 2 3 4', rows=2, cols=2);";
  Script script = dml(s).out("M");
  Dataset<Row> df = ml.execute(script).getMatrix("M").toDF();
  Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);
  List<Row> list = sortedDF.collectAsList();

  Row row1 = list.get(0);
  Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
  Assert.assertEquals(1.0, row1.getDouble(1), 0.0);
  Assert.assertEquals(2.0, row1.getDouble(2), 0.0);

  Row row2 = list.get(1);
  Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
  Assert.assertEquals(3.0, row2.getDouble(1), 0.0);
  Assert.assertEquals(4.0, row2.getDouble(2), 0.0);
}
Example #8
Source File: TestPassthroughDeriver.java From envelope with Apache License 2.0 | 6 votes |
@Test (expected = RuntimeException.class)
public void testDifferentSchemas() throws Exception {
  StructType schema1 = DataTypes.createStructType(Lists.<StructField>newArrayList(
      DataTypes.createStructField("col1", DataTypes.StringType, false)));
  StructType schema2 = DataTypes.createStructType(Lists.<StructField>newArrayList(
      DataTypes.createStructField("col2", DataTypes.StringType, false)));
  Dataset<Row> dep1 = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("a")), schema1);
  Dataset<Row> dep2 = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("b")), schema2);
  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  dependencies.put("dep1", dep1);
  dependencies.put("dep2", dep2);

  Deriver deriver = new PassthroughDeriver();

  deriver.derive(dependencies).collectAsList();
}
Example #9
Source File: NExecAndComp.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
public static Dataset<Row> sql(String prj, String sqlText, List<String> parameters) {
  if (sqlText == null)
    throw new RuntimeException("Sorry your SQL is null...");

  try {
    logger.info("Try to query from cube....");
    long startTs = System.currentTimeMillis();
    Dataset<Row> dataset = queryCubeAndSkipCompute(prj, sqlText, parameters);
    logger.info("Cool! This sql hits cube...");
    logger.info("Duration(ms): {}", (System.currentTimeMillis() - startTs));
    return dataset;
  } catch (Throwable e) {
    logger.error("There is no cube can be used for query [{}]", sqlText);
    logger.error("Reasons:", e);
    throw new RuntimeException("Error in running query [ " + sqlText.trim() + " ]", e);
  }
}
Example #10
Source File: DataFrameIT.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
public static void testResultSetToDF(String table, ResultSet[] resultSets) throws SQLException {
  try {
    Connection conn = DriverManager.getConnection("jdbc:default:connection");
    PreparedStatement pstmt = conn.prepareStatement("select * from " + table.toUpperCase());
    ResultSet res = pstmt.executeQuery();

    // Convert result set to Dataframe
    Dataset<Row> resultSetDF = SparkUtils.resultSetToDF(res);
    resultSets[0] = res;

    // Construct Stored Procedure Result
    List<ExecRow> rows = Lists.newArrayList();
    ExecRow row = new ValueRow(1);
    // System.out.println(resultSetDF.dataset().count());
    row.setColumn(1, new SQLLongint(resultSetDF.count()));
    rows.add(row);
    IteratorNoPutResultSet resultsToWrap = wrapResults((EmbedConnection) conn, rows,
        DATAFRAME_COUNT_STORED_PROCEDURE_COLUMN_DECSRIPTOR);
    resultSets[0] = new EmbedResultSet40((EmbedConnection) conn, resultsToWrap, false, null, true);

    conn.close();
  } catch (StandardException e) {
    throw new SQLException(Throwables.getRootCause(e));
  }
}
Example #11
Source File: CommonAddressFeaturesBridgeTest.java From spark-transformers with Apache License 2.0 | 6 votes |
private void assertCorrectness(Dataset<Row> rowDataset, Transformer transformer) {
  List<Row> sparkOutput = rowDataset.collectAsList();

  for (Row row : sparkOutput) {
    Map<String, Object> data = new HashMap<>();
    data.put("mergedAddress", row.get(0));

    List<Object> list = row.getList(1);
    String[] sanitizedAddress = new String[list.size()];
    for (int j = 0; j < sanitizedAddress.length; j++) {
      sanitizedAddress[j] = (String) list.get(j);
    }
    data.put("sanitizedAddress", sanitizedAddress);

    transformer.transform(data);

    assertEquals("number of words should be equals", row.get(2), data.get("numWords"));
    assertEquals("number of commas should be equals", row.get(3), data.get("numCommas"));
    assertEquals("numericPresent should be equals", row.get(4), data.get("numericPresent"));
    assertEquals("addressLength should be equals", row.get(5), data.get("addressLength"));
    assertEquals("favouredStart should be equals", row.get(6), data.get("favouredStart"));
    assertEquals("unfavouredStart should be equals", row.get(7), data.get("unfavouredStart"));
  }
}
Example #12
Source File: Normalization.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Scale based on min,max
 *
 * @param dataFrame the dataframe to scale
 * @param min the minimum value
 * @param max the maximum value
 * @return the normalized dataframe per column
 */
public static Dataset<Row> normalize(Dataset<Row> dataFrame, double min, double max,
                                     List<String> skipColumns) {
  List<String> columnsList = DataFrames.toList(dataFrame.columns());
  columnsList.removeAll(skipColumns);
  String[] columnNames = DataFrames.toArray(columnsList);
  // first row is min, second row is max; each column in a row is for a particular column
  List<Row> minMax = minMaxColumns(dataFrame, columnNames);
  for (int i = 0; i < columnNames.length; i++) {
    String columnName = columnNames[i];
    double dMin = ((Number) minMax.get(0).get(i)).doubleValue();
    double dMax = ((Number) minMax.get(1).get(i)).doubleValue();
    double maxSubMin = (dMax - dMin);
    if (maxSubMin == 0)
      maxSubMin = 1;

    Column newCol = dataFrame.col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min);
    dataFrame = dataFrame.withColumn(columnName, newCol);
  }

  return dataFrame;
}
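As a usage note for the method above: each selected column is rescaled with (x - colMin) / (colMax - colMin) * (max - min) + min. A minimal sketch of calling it, assuming the surrounding Normalization class from this example; the df variable and the "id" column name are illustrative assumptions, not taken from the original project:

// Hypothetical call to the normalize(...) method shown above:
// rescale every numeric column of df into [0, 1], skipping an assumed "id" column.
List<String> skipColumns = java.util.Collections.singletonList("id");
Dataset<Row> scaled = Normalization.normalize(df, 0.0, 1.0, skipColumns);
scaled.show(5);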
Example #13
Source File: SparkRelationalOperator.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
@Override
public Column getColumnExpression(Dataset<Row> leftDF,
                                  Dataset<Row> rightDF,
                                  Function<String, DataType> convertStringToDataTypeFunction)
    throws UnsupportedOperationException {
  Column leftExpr = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
  Column rightExpr = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

  if (relOpKind == EQUALS_RELOP)
    return leftExpr.equalTo(rightExpr);
  else if (relOpKind == NOT_EQUALS_RELOP)
    return leftExpr.notEqual(rightExpr);
  else if (relOpKind == GREATER_THAN_RELOP)
    return leftExpr.gt(rightExpr);
  else if (relOpKind == GREATER_EQUALS_RELOP)
    return leftExpr.geq(rightExpr);
  else if (relOpKind == LESS_THAN_RELOP)
    return leftExpr.lt(rightExpr);
  else if (relOpKind == LESS_EQUALS_RELOP)
    return leftExpr.leq(rightExpr);
  else if (relOpKind == IS_NULL_RELOP)
    return leftExpr.isNull();
  else if (relOpKind == IS_NOT_NULL_RELOP)
    return leftExpr.isNotNull();
  else
    throw new UnsupportedOperationException();
}
Example #14
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
  System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");

  List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<>();
  list.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
  list.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
  list.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
  JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);

  JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
  fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

  MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);

  Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
  setExpectedStdOut("sum: 45.0");
  ml.execute(script);
}
Example #15
Source File: TestTranslateFunction.java From envelope with Apache License 2.0 | 6 votes |
@Test
public void testExplicitDontAppendRaw() {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(ComponentFactory.TYPE_CONFIG_NAME, DummyTranslator.class.getName());
  configMap.put(TranslateFunction.APPEND_RAW_ENABLED_CONFIG, false);
  Config config = ConfigFactory.parseMap(configMap);

  TranslateFunction tf = new TranslateFunction(config);
  tf.receiveProvidedSchema(tf.getExpectingSchema());
  Dataset<Row> raw = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("hello?")), tf.getExpectingSchema());
  Dataset<Row> translated = raw.flatMap(tf, RowEncoder.apply(tf.getProvidingSchema()));

  assertEquals(1, translated.schema().size());
  assertNotEquals("_value", translated.schema().fields()[0].name());
}
Example #16
Source File: MLContextTest.java From systemds with Apache License 2.0 | 6 votes |
@Test
public void testOutputDataFrameDoublesWithIDColumnFromMatrixDML() {
  System.out.println("MLContextTest - output DataFrame of doubles with ID column from matrix DML");

  String s = "M = matrix('1 2 3 4', rows=2, cols=2);";
  Script script = dml(s).out("M");
  Dataset<Row> df = ml.execute(script).getMatrix("M").toDFDoubleWithIDColumn();
  Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);
  List<Row> list = sortedDF.collectAsList();

  Row row1 = list.get(0);
  Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
  Assert.assertEquals(1.0, row1.getDouble(1), 0.0);
  Assert.assertEquals(2.0, row1.getDouble(2), 0.0);

  Row row2 = list.get(1);
  Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
  Assert.assertEquals(3.0, row2.getDouble(1), 0.0);
  Assert.assertEquals(4.0, row2.getDouble(2), 0.0);
}
Example #17
Source File: AbstractJavaEsSparkStructuredStreamingTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void test1WriteWithMappingExclude() throws Exception {
  String target = wrapIndex(resource("test-mapping-exclude", "data"));
  JavaStreamingQueryTestHarness<RecordBean> test =
      new JavaStreamingQueryTestHarness<>(spark, Encoders.bean(RecordBean.class));

  RecordBean doc1 = new RecordBean();
  doc1.setId(1);
  doc1.setName("Spark");

  RecordBean doc2 = new RecordBean();
  doc2.setId(2);
  doc2.setName("Hadoop");

  RecordBean doc3 = new RecordBean();
  doc3.setId(3);
  doc3.setName("YARN");

  Dataset<RecordBean> dataset = test
      .withInput(doc1)
      .withInput(doc2)
      .withInput(doc3)
      .stream();

  test.run(
      dataset.writeStream()
          .option("checkpointLocation", checkpoint(target))
          .option(ES_MAPPING_EXCLUDE, "name")
          .format("es"),
      target
  );

  assertTrue(RestUtils.exists(target));
  assertThat(RestUtils.get(target + "/_search?"), not(containsString("Spark")));
  assertThat(RestUtils.get(target + "/_search?"), not(containsString("Hadoop")));
  assertThat(RestUtils.get(target + "/_search?"), not(containsString("YARN")));
}
Example #18
Source File: BookUrlBuilderApp.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
private void start() {
  SparkSession spark = SparkSession.builder().appName("Book URL Builder")
      .master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING());
  ds.printSchema();
  ds.show(20, 80);
}
Example #19
Source File: SparkEngine.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Override
public Enumerable<Object> computeSCALA(DataContext dataContext, RelNode relNode, RelDataType resultType) {
  Dataset<Row> sparkPlan = toSparkPlan(dataContext, relNode);
  log.debug("SPARK LOGICAL PLAN {}", sparkPlan.queryExecution().logical());
  return ResultPlan.getResult(sparkPlan, resultType, ResultType.SCALA()).right().get();
}
Example #20
Source File: TextEncodedTelemetryReader.java From metron with Apache License 2.0 | 5 votes |
@Override
public Dataset<String> read(SparkSession spark, Properties profilerProps, Properties readerProps) {
  String inputPath = TELEMETRY_INPUT_PATH.get(profilerProps, String.class);
  if (inputFormat == null) {
    inputFormat = TELEMETRY_INPUT_FORMAT.get(profilerProps, String.class);
  }
  LOG.debug("Loading telemetry; inputPath={}, inputFormat={}", inputPath, inputFormat);

  return spark
      .read()
      .options(Maps.fromProperties(readerProps))
      .format(inputFormat)
      .load(inputPath)
      .as(Encoders.STRING());
}
Example #21
Source File: IcebergSourceFlatAvroDataReadBenchmark.java From iceberg with Apache License 2.0 | 5 votes |
@Benchmark
@Threads(1)
public void readWithProjectionFileSource() {
  Map<String, String> conf = Maps.newHashMap();
  conf.put(SQLConf.FILES_OPEN_COST_IN_BYTES().key(), Integer.toString(128 * 1024 * 1024));
  withSQLConf(conf, () -> {
    Dataset<Row> df = spark().read().format("avro").load(dataLocation()).select("longCol");
    materialize(df);
  });
}
Example #22
Source File: MockValueSets.java From bunsen with Apache License 2.0 | 5 votes |
/**
 * Creates a MockValueSets instance with the given data.
 */
public MockValueSets(SparkSession spark,
    Dataset<UrlAndVersion> members,
    Dataset<Row> valueSets,
    Dataset<Value> values,
    SparkRowConverter valueSetRowConverter) {

  super(spark, FhirVersionEnum.DSTU3, members, valueSets, values, valueSetRowConverter);
}
Example #23
Source File: BatchStep.java From envelope with Apache License 2.0 | 5 votes |
public void submit(Set<Step> dependencySteps) throws Exception {
  Contexts.getSparkSession().sparkContext().setJobDescription("Step: " + getName());

  Dataset<Row> data;
  Dataset<Row> errored = null;
  if (hasInput()) {
    data = ((BatchInput) getInput(true)).read();
    if (getInput(true) instanceof CanReturnErroredData) {
      errored = ((CanReturnErroredData) getInput(true)).getErroredData();
    }
  } else if (hasDeriver()) {
    Map<String, Dataset<Row>> dependencies = StepUtils.getStepDataFrames(dependencySteps);
    data = getDeriver(true).derive(dependencies);
    if (getDeriver(true) instanceof CanReturnErroredData) {
      errored = ((CanReturnErroredData) getDeriver(true)).getErroredData();
    }
  } else {
    throw new RuntimeException("Batch step '" + getName() + "' must contain either an input or a deriver.");
  }

  if (errored != null) {
    BatchStep erroredBatchStep = new BatchStep(getName() + DEFAULT_ERROR_DATAFRAME_SUFFIX);
    erroredBatchStep.configure(ConfigFactory.empty());
    erroredBatchStep.setData(errored);
    erroredBatchStep.setState(StepState.FINISHED);
    addNewBatchStep(erroredBatchStep);
  }

  if (doesRepartition()) {
    data = repartition(data);
  }

  setData(data);
  writeData();
  setState(StepState.FINISHED);
}
Example #24
Source File: VectorizedReadDictionaryEncodedFlatParquetDataBenchmark.java From iceberg with Apache License 2.0 | 5 votes |
private static Dataset<Row> withIntColumnDictEncoded(Dataset<Row> df) {
  return df.withColumn(
      "intCol",
      when(modColumn(9, 0), lit(0))
          .when(modColumn(9, 1), lit(1))
          .when(modColumn(9, 2), lit(2))
          .when(modColumn(9, 3), lit(3))
          .when(modColumn(9, 4), lit(4))
          .when(modColumn(9, 5), lit(5))
          .when(modColumn(9, 6), lit(6))
          .when(modColumn(9, 7), lit(7))
          .when(modColumn(9, 8), lit(8)));
}
Example #25
Source File: Snomed.java From bunsen with Apache License 2.0 | 5 votes |
/**
 * Reads a Snomed relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  return spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID)))
      .where(col("active").equalTo(lit("1")))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit(""))))
      .where(col("sourceId").isNotNull()
          .and(col("sourceId").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement element = new HierarchicalElement();
        element.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
        element.setAncestorValue(row.getString(0));
        element.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
        element.setDescendantValue(row.getString(1));
        return element;
      }, Hierarchies.getHierarchicalElementEncoder());
}
Example #26
Source File: AbstractJavaEsSparkSQLTest.java From elasticsearch-hadoop with Apache License 2.0 | 5 votes |
@Test
public void testBasicRead() throws Exception {
  Dataset<Row> dataset = artistsAsDataset();
  assertTrue(dataset.count() > 300);
  dataset.createOrReplaceTempView("datfile");
  assertEquals(5, ((Object[]) dataset.take(5)).length);
  Dataset<Row> results = sqc
      .sql("SELECT name FROM datfile WHERE id >=1 AND id <=10");
  assertEquals(10, ((Object[]) results.take(10)).length);
}
Example #27
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
Example #28
Source File: NFilePruningTest.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
private long assertResultsAndScanFiles(String sql, long numScanFiles) throws Exception {
  Dataset<Row> dataset = queryCubeAndSkipCompute(getProject(), sql);
  dataset.collect();
  long actualNum = findFileSourceScanExec(dataset.queryExecution().sparkPlan())
      .metrics().get("numFiles").get().value();
  Assert.assertEquals(numScanFiles, actualNum);
  return actualNum;
}
Example #29
Source File: CsvSourceTest.java From kylin-on-parquet-v2 with Apache License 2.0 | 5 votes |
@Test
public void testGetFlatTable() throws IOException {
  System.out.println(getTestConfig().getMetadataUrl());
  CubeManager cubeMgr = CubeManager.getInstance(getTestConfig());
  CubeInstance cube = cubeMgr.getCube(CUBE_NAME);
  cleanupSegments(CUBE_NAME);
  DataModelDesc model = cube.getModel();
  CubeSegment segment = cubeMgr.appendSegment(cube,
      new SegmentRange.TSRange(dateToLong("2010-01-01"), dateToLong("2013-01-01")));
  Dataset<Row> ds = initFlatTable(segment);
  ds.show(10);
  StructType schema = ds.schema();

  SegmentInfo segmentInfo = MetadataConverter.getSegmentInfo(segment.getCubeInstance(), segment.getUuid(),
      segment.getName(), segment.getStorageLocationIdentifier());
  scala.collection.immutable.Map<String, String> map = BuildUtils.getColumnIndexMap(segmentInfo);
  for (StructField field : schema.fields()) {
    Assert.assertNotNull(model.findColumn(map.apply(field.name())));
  }

  for (LayoutEntity layoutEntity : MetadataConverter.extractEntityList2JavaList(cube)) {
    Set<Integer> dims = layoutEntity.getOrderedDimensions().keySet();
    Column[] modelCols = new Column[dims.size()];
    int index = 0;
    for (int id : dims) {
      modelCols[index] = new Column(String.valueOf(id));
      index++;
    }
    ds.select(modelCols).show(10);
  }
}
Example #30
Source File: SparkMLDeriver.java From envelope with Apache License 2.0 | 5 votes |
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  if (model == null) {
    model = PipelineModel.load(modelPath);
  }

  Dataset<Row> data = getData(dependencies);

  return model.transform(data);
}