Python pandas.io.parsers.TextFileReader() Examples
The following are 14
code examples of pandas.io.parsers.TextFileReader().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
pandas.io.parsers
, or try the search function
.
Example #1
Source File: parquet.py From boxball with Apache License 2.0 | 6 votes |
def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]): """ Writes Parquet version of the chunked dataframe input. Arrow table creation and Parquet-writes take up around 25% of the time on this function. The CSV read takes around 75%. """ rows_processed = 0 for df in df_iterator: rows_processed += min(BUFFER_SIZE_ROWS, len(df)) for col_name in date_cols: df[col_name] = pd.to_datetime(df[col_name], unit="ms") pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema) parquet_writer.write_table(pa_table) print("Rows processed: {}".format(rows_processed), end="\r", flush=True) print()
Example #2
Source File: test_textreader.py From recruit with Apache License 2.0 | 5 votes |
def test_integer_thousands_alt(self): data = '123.456\n12.500' reader = TextFileReader(StringIO(data), delimiter=':', thousands='.', header=None) result = reader.read() expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected)
Example #3
Source File: test_textreader.py From recruit with Apache License 2.0 | 5 votes |
def test_empty_csv_input(self): # GH14867 df = read_csv(StringIO(), chunksize=20, header=None, names=['a', 'b', 'c']) assert isinstance(df, TextFileReader)
Example #4
Source File: test_textreader.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_integer_thousands_alt(self): data = '123.456\n12.500' reader = TextFileReader(StringIO(data), delimiter=':', thousands='.', header=None) result = reader.read() expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected)
Example #5
Source File: test_textreader.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_empty_csv_input(self): # GH14867 df = read_csv(StringIO(), chunksize=20, header=None, names=['a', 'b', 'c']) assert isinstance(df, TextFileReader)
Example #6
Source File: parquet.py From boxball with Apache License 2.0 | 5 votes |
def write_files(metadata: AlchemyMetadata) -> None: """ Creates a Parquet file for each table in the schema. """ tables: Iterator[AlchemyTable] = metadata.tables.values() for table in tables: name = table.name print(name) def get_path(prefix: Path, suffix: str): parent_dir = prefix.joinpath(metadata.schema) parent_dir.mkdir(exist_ok=True, parents=True) return parent_dir.joinpath(name).with_suffix(suffix) extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst") parquet_file = get_path(PARQUET_PREFIX, ".parquet") pandas_fields = get_pandas_fields(table) arrow_fields = get_arrow_fields(table) arrow_schema = pa.schema(get_arrow_fields(table)) column_names = [name for name, dtype in pandas_fields] date_cols = [name for name, dtype in arrow_fields if "timestamp" in dtype] # Using both Arrow and Pandas allows each library to cover the other's current shortcomings. # Pandas's read_csv can handle chunked/complex reads, while Arrow's WriteParquet can handle chunked writes. # Arrow's input streams are capable of handling zstd files, which Pandas hasn't implemented yet. in_buf = pa.OSFile(str(extract_file), mode="r") reader = pa.CompressedInputStream(in_buf, compression="zstd") # Have to use snappy codec for Parquet because Drill doesn't read zstd parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='snappy', version="2.0", use_dictionary=True) df_iterator: TextFileReader = pd.read_csv(reader, header=None, names=column_names, dtype=dict(pandas_fields), true_values=map_to_bytes('T'), false_values=map_to_bytes('F'), chunksize=BUFFER_SIZE_ROWS, parse_dates=date_cols) chunked_write(df_iterator, parquet_writer, date_cols)
Example #7
Source File: test_common.py From recruit with Apache License 2.0 | 4 votes |
def test_override_set_noconvert_columns(): # see gh-17351 # # Usecols needs to be sorted in _set_noconvert_columns based # on the test_usecols_with_parse_dates test from test_usecols.py class MyTextFileReader(TextFileReader): def __init__(self): self._currow = 0 self.squeeze = False class MyCParserWrapper(CParserWrapper): def _set_noconvert_columns(self): if self.usecols_dtype == "integer": # self.usecols is a set, which is documented as unordered # but in practice, a CPython set of integers is sorted. # In other implementations this assumption does not hold. # The following code simulates a different order, which # before GH 17351 would cause the wrong columns to be # converted via the parse_dates parameter self.usecols = list(self.usecols) self.usecols.reverse() return CParserWrapper._set_noconvert_columns(self) data = """a,b,c,d,e 0,1,20140101,0900,4 0,1,20140102,1000,4""" parse_dates = [[1, 2]] cols = { "a": [0, 0], "c_d": [ Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00") ] } expected = DataFrame(cols, columns=["c_d", "a"]) parser = MyTextFileReader() parser.options = {"usecols": [0, 2, 3], "parse_dates": parse_dates, "delimiter": ","} parser._engine = MyCParserWrapper(StringIO(data), **parser.options) result = parser.read() tm.assert_frame_equal(result, expected)
Example #8
Source File: common.py From vnpy_crypto with MIT License | 4 votes |
def test_iterator(self): # See gh-6607 reader = self.read_csv(StringIO(self.data1), index_col=0, iterator=True) df = self.read_csv(StringIO(self.data1), index_col=0) chunk = reader.read(3) tm.assert_frame_equal(chunk, df[:3]) last_chunk = reader.read(5) tm.assert_frame_equal(last_chunk, df[3:]) # pass list lines = list(csv.reader(StringIO(self.data1))) parser = TextParser(lines, index_col=0, chunksize=2) df = self.read_csv(StringIO(self.data1), index_col=0) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[:2]) tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) # pass skiprows parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[1:3]) treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, iterator=True) assert isinstance(treader, TextFileReader) # gh-3967: stopping iteration when chunksize is specified data = """A,B,C foo,1,2,3 bar,4,5,6 baz,7,8,9 """ reader = self.read_csv(StringIO(data), iterator=True) result = list(reader) expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ 3, 6, 9]), index=['foo', 'bar', 'baz']) tm.assert_frame_equal(result[0], expected) # chunksize = 1 reader = self.read_csv(StringIO(data), chunksize=1) result = list(reader) expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ 3, 6, 9]), index=['foo', 'bar', 'baz']) assert len(result) == 3 tm.assert_frame_equal(pd.concat(result), expected) # skipfooter is not supported with the C parser yet if self.engine == 'python': # test bad parameter (skipfooter) reader = self.read_csv(StringIO(self.data1), index_col=0, iterator=True, skipfooter=1) pytest.raises(ValueError, reader.read, 3)
Example #9
Source File: test_parsers.py From vnpy_crypto with MIT License | 4 votes |
def test_override__set_noconvert_columns(self): # GH 17351 - usecols needs to be sorted in _setnoconvert_columns # based on the test_usecols_with_parse_dates test from usecols.py from pandas.io.parsers import CParserWrapper, TextFileReader s = """a,b,c,d,e 0,1,20140101,0900,4 0,1,20140102,1000,4""" parse_dates = [[1, 2]] cols = { 'a': [0, 0], 'c_d': [ Timestamp('2014-01-01 09:00:00'), Timestamp('2014-01-02 10:00:00') ] } expected = DataFrame(cols, columns=['c_d', 'a']) class MyTextFileReader(TextFileReader): def __init__(self): self._currow = 0 self.squeeze = False class MyCParserWrapper(CParserWrapper): def _set_noconvert_columns(self): if self.usecols_dtype == 'integer': # self.usecols is a set, which is documented as unordered # but in practice, a CPython set of integers is sorted. # In other implementations this assumption does not hold. # The following code simulates a different order, which # before GH 17351 would cause the wrong columns to be # converted via the parse_dates parameter self.usecols = list(self.usecols) self.usecols.reverse() return CParserWrapper._set_noconvert_columns(self) parser = MyTextFileReader() parser.options = {'usecols': [0, 2, 3], 'parse_dates': parse_dates, 'delimiter': ','} parser._engine = MyCParserWrapper(StringIO(s), **parser.options) df = parser.read() tm.assert_frame_equal(df, expected)
Example #10
Source File: test_common.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 4 votes |
def test_override_set_noconvert_columns(): # see gh-17351 # # Usecols needs to be sorted in _set_noconvert_columns based # on the test_usecols_with_parse_dates test from test_usecols.py class MyTextFileReader(TextFileReader): def __init__(self): self._currow = 0 self.squeeze = False class MyCParserWrapper(CParserWrapper): def _set_noconvert_columns(self): if self.usecols_dtype == "integer": # self.usecols is a set, which is documented as unordered # but in practice, a CPython set of integers is sorted. # In other implementations this assumption does not hold. # The following code simulates a different order, which # before GH 17351 would cause the wrong columns to be # converted via the parse_dates parameter self.usecols = list(self.usecols) self.usecols.reverse() return CParserWrapper._set_noconvert_columns(self) data = """a,b,c,d,e 0,1,20140101,0900,4 0,1,20140102,1000,4""" parse_dates = [[1, 2]] cols = { "a": [0, 0], "c_d": [ Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00") ] } expected = DataFrame(cols, columns=["c_d", "a"]) parser = MyTextFileReader() parser.options = {"usecols": [0, 2, 3], "parse_dates": parse_dates, "delimiter": ","} parser._engine = MyCParserWrapper(StringIO(data), **parser.options) result = parser.read() tm.assert_frame_equal(result, expected)
Example #11
Source File: common.py From elasticintel with GNU General Public License v3.0 | 4 votes |
def test_iterator(self): # See gh-6607 reader = self.read_csv(StringIO(self.data1), index_col=0, iterator=True) df = self.read_csv(StringIO(self.data1), index_col=0) chunk = reader.read(3) tm.assert_frame_equal(chunk, df[:3]) last_chunk = reader.read(5) tm.assert_frame_equal(last_chunk, df[3:]) # pass list lines = list(csv.reader(StringIO(self.data1))) parser = TextParser(lines, index_col=0, chunksize=2) df = self.read_csv(StringIO(self.data1), index_col=0) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[:2]) tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) # pass skiprows parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[1:3]) treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, iterator=True) assert isinstance(treader, TextFileReader) # gh-3967: stopping iteration when chunksize is specified data = """A,B,C foo,1,2,3 bar,4,5,6 baz,7,8,9 """ reader = self.read_csv(StringIO(data), iterator=True) result = list(reader) expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ 3, 6, 9]), index=['foo', 'bar', 'baz']) tm.assert_frame_equal(result[0], expected) # chunksize = 1 reader = self.read_csv(StringIO(data), chunksize=1) result = list(reader) expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ 3, 6, 9]), index=['foo', 'bar', 'baz']) assert len(result) == 3 tm.assert_frame_equal(pd.concat(result), expected) # skipfooter is not supported with the C parser yet if self.engine == 'python': # test bad parameter (skipfooter) reader = self.read_csv(StringIO(self.data1), index_col=0, iterator=True, skipfooter=1) pytest.raises(ValueError, reader.read, 3)
Example #12
Source File: test_parsers.py From elasticintel with GNU General Public License v3.0 | 4 votes |
def test_override__set_noconvert_columns(self): # GH 17351 - usecols needs to be sorted in _setnoconvert_columns # based on the test_usecols_with_parse_dates test from usecols.py from pandas.io.parsers import CParserWrapper, TextFileReader s = """a,b,c,d,e 0,1,20140101,0900,4 0,1,20140102,1000,4""" parse_dates = [[1, 2]] cols = { 'a': [0, 0], 'c_d': [ Timestamp('2014-01-01 09:00:00'), Timestamp('2014-01-02 10:00:00') ] } expected = DataFrame(cols, columns=['c_d', 'a']) class MyTextFileReader(TextFileReader): def __init__(self): self._currow = 0 self.squeeze = False class MyCParserWrapper(CParserWrapper): def _set_noconvert_columns(self): if self.usecols_dtype == 'integer': # self.usecols is a set, which is documented as unordered # but in practice, a CPython set of integers is sorted. # In other implementations this assumption does not hold. # The following code simulates a different order, which # before GH 17351 would cause the wrong columns to be # converted via the parse_dates parameter self.usecols = list(self.usecols) self.usecols.reverse() return CParserWrapper._set_noconvert_columns(self) parser = MyTextFileReader() parser.options = {'usecols': [0, 2, 3], 'parse_dates': parse_dates, 'delimiter': ','} parser._engine = MyCParserWrapper(StringIO(s), **parser.options) df = parser.read() tm.assert_frame_equal(df, expected)
Example #13
Source File: common.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_iterator(self): # See gh-6607 reader = self.read_csv(StringIO(self.data1), index_col=0, iterator=True) df = self.read_csv(StringIO(self.data1), index_col=0) chunk = reader.read(3) tm.assert_frame_equal(chunk, df[:3]) last_chunk = reader.read(5) tm.assert_frame_equal(last_chunk, df[3:]) # pass list lines = list(csv.reader(StringIO(self.data1))) parser = TextParser(lines, index_col=0, chunksize=2) df = self.read_csv(StringIO(self.data1), index_col=0) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[:2]) tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) # pass skiprows parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[1:3]) treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, iterator=True) assert isinstance(treader, TextFileReader) # gh-3967: stopping iteration when chunksize is specified data = """A,B,C foo,1,2,3 bar,4,5,6 baz,7,8,9 """ reader = self.read_csv(StringIO(data), iterator=True) result = list(reader) expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ 3, 6, 9]), index=['foo', 'bar', 'baz']) tm.assert_frame_equal(result[0], expected) # chunksize = 1 reader = self.read_csv(StringIO(data), chunksize=1) result = list(reader) expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ 3, 6, 9]), index=['foo', 'bar', 'baz']) assert len(result) == 3 tm.assert_frame_equal(pd.concat(result), expected) # skipfooter is not supported with the C parser yet if self.engine == 'python': # test bad parameter (skipfooter) reader = self.read_csv(StringIO(self.data1), index_col=0, iterator=True, skipfooter=1) pytest.raises(ValueError, reader.read, 3)
Example #14
Source File: test_parsers.py From twitter-stock-recommendation with MIT License | 4 votes |
def test_override__set_noconvert_columns(self): # GH 17351 - usecols needs to be sorted in _setnoconvert_columns # based on the test_usecols_with_parse_dates test from usecols.py from pandas.io.parsers import CParserWrapper, TextFileReader s = """a,b,c,d,e 0,1,20140101,0900,4 0,1,20140102,1000,4""" parse_dates = [[1, 2]] cols = { 'a': [0, 0], 'c_d': [ Timestamp('2014-01-01 09:00:00'), Timestamp('2014-01-02 10:00:00') ] } expected = DataFrame(cols, columns=['c_d', 'a']) class MyTextFileReader(TextFileReader): def __init__(self): self._currow = 0 self.squeeze = False class MyCParserWrapper(CParserWrapper): def _set_noconvert_columns(self): if self.usecols_dtype == 'integer': # self.usecols is a set, which is documented as unordered # but in practice, a CPython set of integers is sorted. # In other implementations this assumption does not hold. # The following code simulates a different order, which # before GH 17351 would cause the wrong columns to be # converted via the parse_dates parameter self.usecols = list(self.usecols) self.usecols.reverse() return CParserWrapper._set_noconvert_columns(self) parser = MyTextFileReader() parser.options = {'usecols': [0, 2, 3], 'parse_dates': parse_dates, 'delimiter': ','} parser._engine = MyCParserWrapper(StringIO(s), **parser.options) df = parser.read() tm.assert_frame_equal(df, expected)