java.io.FilenameFilter Scala Examples
The following examples show how to use java.io.FilenameFilter.
Each example is drawn from an open-source project; the source file, project, and license are noted above each listing.
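FilenameFilter is a single-method Java interface: accept(dir, name) returns true for names that should be included when listing a directory with File.list or File.listFiles. Before the project examples, here is a minimal, illustrative sketch of the pattern in Scala (the current-directory path and the ".csv" predicate are just placeholders):

import java.io.{File, FilenameFilter}

object ListCsvFiles extends App {
  // accept(dir, name) decides, entry by entry, whether a name is kept in the listing.
  val csvFilter = new FilenameFilter {
    override def accept(dir: File, name: String): Boolean = name.endsWith(".csv")
  }

  // File.listFiles(filter) returns only the accepted entries. It returns null
  // (not an empty array) when the directory does not exist or cannot be read,
  // so wrapping the result in Option is a common defensive pattern.
  val csvFiles = Option(new File(".").listFiles(csvFilter)).getOrElse(Array.empty[File])
  csvFiles.foreach(file => println(file.getName))
}

Several of the examples below use exactly this null-safe Option wrapping, or accept the possibility of a NullPointerException in test-only code.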
Example 1
Source File: HadoopFsRelationSuite.scala From drizzle-spark with Apache License 2.0
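This Spark SQL test checks that a DataFrame's sizeInBytes statistic equals the combined size of the Parquet files it was read from. A FilenameFilter is used to skip hidden files (names starting with ".") when summing the on-disk file lengths.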
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSQLContext

class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      spark.range(1000).write.parquet(dir.toString)
      // ignore hidden files
      val allFiles = dir.listFiles(new FilenameFilter {
        override def accept(dir: File, name: String): Boolean = {
          !name.startsWith(".")
        }
      })
      val totalSize = allFiles.map(_.length()).sum
      val df = spark.read.parquet(dir.toString)
      assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
    }
  }
}
Example 2
Source File: ParquetProjectionTest.scala From eel-sdk with Apache License 2.0
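This eel-sdk test writes a small dataset to Parquet and verifies column projections. The FilenameFilter appears in a cleanup helper that finds and deletes leftover test files matching the test_*.pq naming pattern, along with their .crc checksum files.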
package io.eels.component.parquet

import java.io.{File, FilenameFilter}

import io.eels.datastream.DataStream
import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.scalatest.{FlatSpec, Matchers}

class ParquetProjectionTest extends FlatSpec with Matchers {

  cleanUpResidualParquetTestFiles

  private val schema = StructType(
    Field("name", StringType, nullable = false),
    Field("job", StringType, nullable = false),
    Field("location", StringType, nullable = false)
  )

  private val ds = DataStream.fromValues(
    schema,
    Seq(
      Vector("clint eastwood", "actor", "carmel"),
      Vector("elton john", "musician", "pinner")
    )
  )

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(new Configuration())

  private val file = new File(s"test_${System.currentTimeMillis()}.pq")
  file.deleteOnExit()
  private val path = new Path(file.toURI)

  if (fs.exists(path)) fs.delete(path, false)
  ds.to(ParquetSink(path).withOverwrite(true))

  "ParquetSource" should "support projections" in {
    val rows = ParquetSource(path).withProjection("name").toDataStream().collect
    rows.map(_.values) shouldBe Vector(Vector("clint eastwood"), Vector("elton john"))
  }

  it should "return all data when no projection is set" in {
    val rows = ParquetSource(path).toDataStream().collect
    rows.map(_.values) shouldBe Vector(
      Vector("clint eastwood", "actor", "carmel"),
      Vector("elton john", "musician", "pinner")
    )
  }

  private def cleanUpResidualParquetTestFiles = {
    new File(".").listFiles(new FilenameFilter {
      override def accept(dir: File, name: String): Boolean = {
        (name.startsWith("test_") && name.endsWith(".pq")) ||
          (name.startsWith(".test_") && name.endsWith(".pq.crc"))
      }
    }).foreach(_.delete())
  }
}
Example 3
Source File: OrcPredicateTest.scala From eel-sdk with Apache License 2.0
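This eel-sdk test exercises ORC predicate pushdown (equals, gt, lt) against a generated ORC file. As in the previous example, a FilenameFilter drives a cleanup helper that removes residual test ORC files and their .crc files from the working directory.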
package io.eels.component.orc

import java.io.{File, FilenameFilter}

import io.eels.Predicate
import io.eels.datastream.DataStream
import io.eels.schema.{Field, LongType, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}

class OrcPredicateTest extends FlatSpec with Matchers with BeforeAndAfterAll {

  cleanUpResidualOrcTestFiles

  val schema = StructType(
    Field("name", StringType, nullable = true),
    Field("city", StringType, nullable = true),
    Field("age", LongType.Signed, nullable = true)
  )

  val values = Vector.fill(1000) {
    Vector("sam", "middlesbrough", 37)
  } ++ Vector.fill(1000) {
    Vector("laura", "iowa city", 24)
  }

  val ds = DataStream.fromValues(schema, values)

  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(new Configuration())

  val path = new Path("test.orc")

  if (fs.exists(path)) fs.delete(path, false)
  new File(path.toString).deleteOnExit()

  ds.to(OrcSink(path).withRowIndexStride(1000))

  override protected def afterAll(): Unit = fs.delete(path, false)

  "OrcSource" should "support string equals predicates" in {
    conf.set("eel.orc.predicate.row.filter", "false")
    val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect
    rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L))
  }

  it should "support gt predicates" in {
    conf.set("eel.orc.predicate.row.filter", "false")
    val rows = OrcSource(path).withPredicate(Predicate.gt("age", 30L)).toDataStream().collect
    rows.map(_.values).toSet shouldBe Set(Vector("sam", "middlesbrough", 37L))
  }

  it should "support lt predicates" in {
    conf.set("eel.orc.predicate.row.filter", "false")
    val rows = OrcSource(path).withPredicate(Predicate.lt("age", 30)).toDataStream().collect
    rows.map(_.values).toSet shouldBe Set(Vector("laura", "iowa city", 24L))
  }

  it should "enable row level filtering with predicates by default" in {
    conf.set("eel.orc.predicate.row.filter", "true")
    val rows = OrcSource(path).withPredicate(Predicate.equals("name", "sam")).toDataStream().collect
    rows.head.schema shouldBe schema
    rows.head.values shouldBe Vector("sam", "middlesbrough", 37L)
  }

  private def cleanUpResidualOrcTestFiles = {
    new File(".").listFiles(new FilenameFilter {
      override def accept(dir: File, name: String): Boolean = {
        (name.startsWith("test_") && name.endsWith(".orc")) ||
          (name.startsWith(".test_") && name.endsWith(".orc.crc"))
      }
    }).foreach(_.delete())
  }
}
Example 4
Source File: HadoopFsRelationSuite.scala From XSQL with Apache License 2.0
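This variant of the HadoopFsRelationSuite extends the filter to also exclude files starting with "_" (such as _SUCCESS markers) and adds a SPARK-22790 test for spark.sql.sources.fileCompressionFactor, which scales the estimated file size and thereby decides whether a broadcast or sort-merge join is planned.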
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec}
import org.apache.spark.sql.test.SharedSQLContext

class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      spark.range(1000).write.parquet(dir.toString)
      // ignore hidden files
      val allFiles = dir.listFiles(new FilenameFilter {
        override def accept(dir: File, name: String): Boolean = {
          !name.startsWith(".") && !name.startsWith("_")
        }
      })
      val totalSize = allFiles.map(_.length()).sum
      val df = spark.read.parquet(dir.toString)
      assert(df.queryExecution.logical.stats.sizeInBytes === BigInt(totalSize))
    }
  }

  test("SPARK-22790: spark.sql.sources.compressionFactor takes effect") {
    import testImplicits._
    Seq(1.0, 0.5).foreach { compressionFactor =>
      withSQLConf("spark.sql.sources.fileCompressionFactor" -> compressionFactor.toString,
        "spark.sql.autoBroadcastJoinThreshold" -> "400") {
        withTempPath { workDir =>
          // the file size is 740 bytes
          val workDirPath = workDir.getAbsolutePath
          val data1 = Seq(100, 200, 300, 400).toDF("count")
          data1.write.parquet(workDirPath + "/data1")
          val df1FromFile = spark.read.parquet(workDirPath + "/data1")
          val data2 = Seq(100, 200, 300, 400).toDF("count")
          data2.write.parquet(workDirPath + "/data2")
          val df2FromFile = spark.read.parquet(workDirPath + "/data2")
          val joinedDF = df1FromFile.join(df2FromFile, Seq("count"))
          if (compressionFactor == 0.5) {
            val bJoinExec = joinedDF.queryExecution.executedPlan.collect {
              case bJoin: BroadcastHashJoinExec => bJoin
            }
            assert(bJoinExec.nonEmpty)
            val smJoinExec = joinedDF.queryExecution.executedPlan.collect {
              case smJoin: SortMergeJoinExec => smJoin
            }
            assert(smJoinExec.isEmpty)
          } else {
            // compressionFactor is 1.0
            val bJoinExec = joinedDF.queryExecution.executedPlan.collect {
              case bJoin: BroadcastHashJoinExec => bJoin
            }
            assert(bJoinExec.isEmpty)
            val smJoinExec = joinedDF.queryExecution.executedPlan.collect {
              case smJoin: SortMergeJoinExec => smJoin
            }
            assert(smJoinExec.nonEmpty)
          }
        }
      }
    }
  }
}
Example 5
Source File: RenderHtml.scala From sbt-flaky with Apache License 2.0
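This sbt-flaky utility locates a zipped test-history archive by listing a directory with a FilenameFilter that accepts names ending in "zip", then unpacks a bundled git repository and renders HTML flaky-test reports from the history.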
import java.io.{File, FilenameFilter}

import flaky._
import flaky.history.{Git, History, HistoryReport}
import org.apache.commons.vfs2.VFS

object RenderHtml extends App with Unzip {
  println("Creating report")

  private val reportsDir = new File("target/flakyreports")
  private val dirWithReports = new File("src/test/resources/history8")

  val log = new DummySbtLogger()

  private val zipFile: File = dirWithReports
    .listFiles(new FilenameFilter {
      override def accept(dir: File, name: String): Boolean = name.endsWith("zip")
    }).minBy(_.getName)

  private val projectZip = new File("src/test/resources/gitrepo.zip")
  private val unzipDir = new File("target/unzipped/")

  println(s"Unzipping ${zipFile.getPath}")
  unzip(projectZip, unzipDir)
  private val projectDir = new File(unzipDir, "gitrepo")

  private val report = Flaky.createReportFromHistory(
    VFS.getManager.resolveFile(zipFile.toURI.toString.replace("file:/", "zip:/")))
  private val historyReport: HistoryReport =
    new History("My App", dirWithReports, new File(""), projectDir).createHistoryReport()

  FlakyCommand.createHtmlReports("My App", report, Some(historyReport), reportsDir, Git(projectDir), log)
  println(s"Reports created in ${reportsDir.getAbsolutePath}")
}
Example 6
Source File: HadoopFsRelationSuite.scala From sparkoscope with Apache License 2.0
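Same test as Example 1, taken from the sparkoscope project: the FilenameFilter excludes hidden files whose names start with "." before the Parquet file sizes are summed.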
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSQLContext

class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      spark.range(1000).write.parquet(dir.toString)
      // ignore hidden files
      val allFiles = dir.listFiles(new FilenameFilter {
        override def accept(dir: File, name: String): Boolean = {
          !name.startsWith(".")
        }
      })
      val totalSize = allFiles.map(_.length()).sum
      val df = spark.read.parquet(dir.toString)
      assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
    }
  }
}
Example 7
Source File: FileUtils.scala From eidos with Apache License 2.0
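In this utility object, findFiles lists a directory with a FilenameFilter that matches a file extension. Note that the result of listFiles is wrapped in Option because it is null when the directory does not exist, in which case a FileNotFoundException is thrown instead.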
package org.clulab.wm.wmexchanger.utils

import java.io.BufferedInputStream
import java.io.BufferedOutputStream
import java.io.File
import java.io.FileInputStream
import java.io.FileOutputStream
import java.io.FilenameFilter
import java.io.ObjectInputStream
import java.io.ObjectOutputStream
import java.io.PrintWriter

import org.clulab.wm.wmexchanger.utils.Closer.AutoCloser

import scala.io.Source

object FileUtils {

  def appendingPrintWriterFromFile(file: File): PrintWriter = Sinker.printWriterFromFile(file, append = true)

  def appendingPrintWriterFromFile(path: String): PrintWriter = Sinker.printWriterFromFile(path, append = true)

  def printWriterFromFile(file: File): PrintWriter = Sinker.printWriterFromFile(file, append = false)

  def printWriterFromFile(path: String): PrintWriter = Sinker.printWriterFromFile(path, append = false)

  // Output
  def newBufferedOutputStream(file: File): BufferedOutputStream =
    new BufferedOutputStream(new FileOutputStream(file))

  def newBufferedOutputStream(filename: String): BufferedOutputStream =
    newBufferedOutputStream(new File(filename))

  def newAppendingBufferedOutputStream(file: File): BufferedOutputStream =
    new BufferedOutputStream(new FileOutputStream(file, true))

  def newAppendingBufferedOutputStream(filename: String): BufferedOutputStream =
    newAppendingBufferedOutputStream(new File(filename))

  def newObjectOutputStream(filename: String): ObjectOutputStream =
    new ObjectOutputStream(newBufferedOutputStream(filename))

  // Input
  def newBufferedInputStream(file: File): BufferedInputStream =
    new BufferedInputStream(new FileInputStream(file))

  def newBufferedInputStream(filename: String): BufferedInputStream =
    newBufferedInputStream(new File(filename))

  def newObjectInputStream(filename: String): ObjectInputStream =
    new ObjectInputStream(newBufferedInputStream(filename))

  def findFiles(collectionDir: String, extension: String): Seq[File] = {
    val dir = new File(collectionDir)
    val filter = new FilenameFilter {
      def accept(dir: File, name: String): Boolean = name.endsWith(extension)
    }
    val result = Option(dir.listFiles(filter))
      .getOrElse(throw Sourcer.newFileNotFoundException(collectionDir))
    result
  }

  protected def getTextFromSource(source: Source): String = source.mkString

  def getTextFromFile(file: File): String =
    Sourcer.sourceFromFile(file).autoClose { source =>
      getTextFromSource(source)
    }
}
Example 8
Source File: TestDataLoadWithFileName.scala From carbondata with Apache License 2.0
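This CarbonData test loads a CSV into a table, then lists the segment directory with a FilenameFilter that accepts CarbonData index files by extension and verifies the data-file names recorded inside each index header.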
package org.apache.carbondata.spark.testsuite.dataload

import scala.collection.JavaConverters._

import java.io.{File, FilenameFilter}

import org.apache.hadoop.conf.Configuration
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.reader.CarbonIndexFileReader
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll
import org.apache.carbondata.core.index.Segment
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.{CarbonMetadata, SegmentFileStore}

class TestDataLoadWithFileName extends QueryTest with BeforeAndAfterAll {
  var originVersion = ""

  override def beforeAll() {
    originVersion =
      CarbonProperties.getInstance.getProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION)
  }

  test("Check the file_name in carbonindex with v3 format") {
    CarbonProperties.getInstance.addProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION, "3")
    sql("DROP TABLE IF EXISTS test_table_v3")
    sql(
      """
        | CREATE TABLE test_table_v3(id int, name string, city string, age int)
        | STORED AS carbondata
      """.stripMargin)
    val testData = s"$resourcesPath/sample.csv"
    sql(s"LOAD DATA LOCAL INPATH '$testData' into table test_table_v3")
    val indexReader = new CarbonIndexFileReader()
    val carbonTable = CarbonMetadata.getInstance().getCarbonTable("default", "test_table_v3")
    val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, "0")

    val carbonIndexPaths = if (FileFactory.isFileExist(segmentDir)) {
      new File(segmentDir)
        .listFiles(new FilenameFilter {
          override def accept(dir: File, name: String): Boolean = {
            name.endsWith(CarbonTablePath.getCarbonIndexExtension)
          }
        })
    } else {
      val segment = Segment.getSegment("0", carbonTable.getTablePath)
      val store = new SegmentFileStore(carbonTable.getTablePath, segment.getSegmentFileName)
      store.readIndexFiles(new Configuration(false))
      store.getIndexCarbonFiles.asScala.map(f => new File(f.getAbsolutePath)).toArray
    }

    for (carbonIndexPath <- carbonIndexPaths) {
      indexReader.openThriftReader(carbonIndexPath.getCanonicalPath)
      assert(indexReader.readIndexHeader().getVersion === 3)
      while (indexReader.hasNext) {
        val readBlockIndexInfo = indexReader.readBlockIndexInfo()
        assert(readBlockIndexInfo.getFile_name.startsWith(CarbonTablePath.getCarbonDataPrefix))
        assert(readBlockIndexInfo.getFile_name.endsWith(CarbonTablePath.getCarbonDataExtension))
      }
    }
  }

  override protected def afterAll() {
    sql("DROP TABLE IF EXISTS test_table_v1")
    sql("DROP TABLE IF EXISTS test_table_v2")
    sql("DROP TABLE IF EXISTS test_table_v3")
    CarbonProperties.getInstance.addProperty(CarbonCommonConstants.CARBON_DATA_FILE_VERSION,
      originVersion)
  }
}
Example 9
Source File: HadoopFsRelationSuite.scala From multi-tenancy-spark with Apache License 2.0
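Same test as Examples 1 and 6, this time taken from the multi-tenancy-spark project.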
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSQLContext

class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      spark.range(1000).write.parquet(dir.toString)
      // ignore hidden files
      val allFiles = dir.listFiles(new FilenameFilter {
        override def accept(dir: File, name: String): Boolean = {
          !name.startsWith(".")
        }
      })
      val totalSize = allFiles.map(_.length()).sum
      val df = spark.read.parquet(dir.toString)
      assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
    }
  }
}
Example 10
Source File: GeneratorCommand.scala From json-schema-codegen with Apache License 2.0
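This code-generator entry point accepts either a single JSON schema file or a directory; when given a directory, it lists the *.json files inside it with a reusable FilenameFilter before parsing the schemas and running each generator.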
package json.schema.codegen

import java.io.{File, FilenameFilter}
import java.nio.file.Path

import json.schema.parser.JsonSchemaParser

import scalaz.std.AllInstances._
import scalaz.syntax.all._
import scalaz.syntax.std.all._

import scala.collection.convert.WrapAsScala._

import json.source.JsonSource._

abstract class GeneratorCommand(codegens: List[CodeGenerator]) {

  val jsonFilesFilter = new FilenameFilter {
    override def accept(dir: File, name: String): Boolean = name.endsWith(".json")
  }

  def main(args: Array[String]) {
    val oargs = args.lift
    val parser = new JsonSchemaParser[Double]

    val result = for {
      source <- oargs(0).map(new File(_)).toRightDisjunction("json-schema is required")
      targetDir <- oargs(1).map(new File(_)).toRightDisjunction("target folder is required")
      genRoot: Path = targetDir.toPath
      sources = if (source.isDirectory) source.listFiles(jsonFilesFilter).toSeq else Seq(source)
      schemas <- parser.parseAll(sources)
      results <- codegens.map(gen => gen(schemas)(_ => true, genRoot)).sequenceU
    } yield results

    result.fold({ e =>
      sys.error(s"Code generation failed with: $e")
      System.exit(1)
    }, _ => System.exit(0))
  }
}
Example 11
Source File: HadoopFsRelationSuite.scala From Spark-2.3.1 with Apache License 2.0
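Same suite as Example 4, taken from a Spark 2.3.1 source tree: the filter skips both "." and "_" prefixed files, and the second test covers the fileCompressionFactor setting.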
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec}
import org.apache.spark.sql.test.SharedSQLContext

class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      spark.range(1000).write.parquet(dir.toString)
      // ignore hidden files
      val allFiles = dir.listFiles(new FilenameFilter {
        override def accept(dir: File, name: String): Boolean = {
          !name.startsWith(".") && !name.startsWith("_")
        }
      })
      val totalSize = allFiles.map(_.length()).sum
      val df = spark.read.parquet(dir.toString)
      assert(df.queryExecution.logical.stats.sizeInBytes === BigInt(totalSize))
    }
  }

  test("SPARK-22790: spark.sql.sources.compressionFactor takes effect") {
    import testImplicits._
    Seq(1.0, 0.5).foreach { compressionFactor =>
      withSQLConf("spark.sql.sources.fileCompressionFactor" -> compressionFactor.toString,
        "spark.sql.autoBroadcastJoinThreshold" -> "400") {
        withTempPath { workDir =>
          // the file size is 740 bytes
          val workDirPath = workDir.getAbsolutePath
          val data1 = Seq(100, 200, 300, 400).toDF("count")
          data1.write.parquet(workDirPath + "/data1")
          val df1FromFile = spark.read.parquet(workDirPath + "/data1")
          val data2 = Seq(100, 200, 300, 400).toDF("count")
          data2.write.parquet(workDirPath + "/data2")
          val df2FromFile = spark.read.parquet(workDirPath + "/data2")
          val joinedDF = df1FromFile.join(df2FromFile, Seq("count"))
          if (compressionFactor == 0.5) {
            val bJoinExec = joinedDF.queryExecution.executedPlan.collect {
              case bJoin: BroadcastHashJoinExec => bJoin
            }
            assert(bJoinExec.nonEmpty)
            val smJoinExec = joinedDF.queryExecution.executedPlan.collect {
              case smJoin: SortMergeJoinExec => smJoin
            }
            assert(smJoinExec.isEmpty)
          } else {
            // compressionFactor is 1.0
            val bJoinExec = joinedDF.queryExecution.executedPlan.collect {
              case bJoin: BroadcastHashJoinExec => bJoin
            }
            assert(bJoinExec.isEmpty)
            val smJoinExec = joinedDF.queryExecution.executedPlan.collect {
              case smJoin: SortMergeJoinExec => smJoin
            }
            assert(smJoinExec.nonEmpty)
          }
        }
      }
    }
  }
}
Example 12
Source File: RunScriptCommand.scala From shellbase with Apache License 2.0
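This shell command resolves script files by name: findScripts lists each configured script directory with a FilenameFilter built from a caller-supplied name predicate, and the same helper also backs the tab-completion of script names.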
package com.sumologic.shellbase.commands

import java.io.{File, FilenameFilter}

import com.sumologic.shellbase.timeutil.TimedBlock
import com.sumologic.shellbase.{ScriptRenderer, ShellBase, ShellCommand}
import jline.console.completer.{ArgumentCompleter, Completer, NullCompleter, StringsCompleter}
import org.apache.commons.cli.{CommandLine, Options}

import scala.collection.JavaConversions._

class RunScriptCommand(scriptDirs: List[File], scriptExtension: String, runCommand: String => Boolean,
                       parseLine: String => List[String] = ShellBase.parseLine)
  extends ShellCommand("run-script", "Run the script from the specified file.", List("script")) {

  override def maxNumberOfArguments: Int = -1

  def execute(cmdLine: CommandLine): Boolean = {
    val continue = cmdLine.hasOption("continue")

    val args: Array[String] = cmdLine.getArgs
    if (args.length < 1) {
      println("Please specify a script to run!")
      return false
    }

    val scriptFileName = args(0)
    val scriptFiles: List[File] = findScripts {
      List(
        s"$scriptFileName.$scriptExtension",
        s"$scriptFileName.dsh", // NOTE(konstantin, 2017-04-02): "dsh" kept for compatibility reasons
        s"$scriptFileName"
      ).contains
    } ++ List(new File(scriptFileName)).filter(f => f.exists && f.isFile && f.canRead) // respect absolute paths too

    scriptFiles match {
      case scriptFile :: _ =>
        // Execute the script, line by line.
        TimedBlock(s"Executing script $scriptFileName", println(_)) {
          val scriptLines = new ScriptRenderer(scriptFile, args.tail).
            getLines.filterNot(parseLine(_).isEmpty)
          require(scriptLines.nonEmpty, s"No non-comment lines found in $scriptFileName")
          for (line <- scriptLines) {
            val success = runCommand(line)
            if (!continue && !success) {
              return false
            }
          }
        }
        true
      case _ =>
        println(s"Could not find the script $scriptFileName! Please make sure the script file exists locally.")
        false
    }
  }

  override def argCompleter: Completer = {
    val suffix = s".$scriptExtension"
    val scriptNames = findScripts(name => scriptExtension == null || name.endsWith(suffix)).map(_.getName)
    new ArgumentCompleter(List(new StringsCompleter(scriptNames: _*), new NullCompleter))
  }

  private def findScripts(fileNameFilter: String => Boolean): List[File] = for (
    scriptDir <- scriptDirs
    if scriptDir.exists();
    file <- scriptDir.listFiles(new FilenameFilter {
      override def accept(dir: File, name: String): Boolean = fileNameFilter.apply(name)
    })
    if file.isFile && file.canRead
  ) yield file

  override def addOptions(opts: Options): Unit = {
    opts.addOption("c", "continue", false, "Continue even if there was a failure in execution.")
  }
}