Python Examples of pandas.io.parsers.TextFileReader

Source File: parquet.py From boxball with Apache License 2.0

6 votes

def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """
    Writes  Parquet version of the chunked dataframe input.

    Arrow table creation and Parquet-writes take up around 25% of the time on this function.
    The CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")
        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)

        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    print()

Source File: test_textreader.py From recruit with Apache License 2.0

5 votes

def test_integer_thousands_alt(self):
        data = '123.456\n12.500'

        reader = TextFileReader(StringIO(data), delimiter=':',
                                thousands='.', header=None)
        result = reader.read()

        expected = DataFrame([123456, 12500])
        tm.assert_frame_equal(result, expected)

Source File: test_textreader.py From recruit with Apache License 2.0

5 votes

def test_empty_csv_input(self):
        # GH14867
        df = read_csv(StringIO(), chunksize=20, header=None,
                      names=['a', 'b', 'c'])
        assert isinstance(df, TextFileReader)

Source File: test_textreader.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_integer_thousands_alt(self):
        data = '123.456\n12.500'

        reader = TextFileReader(StringIO(data), delimiter=':',
                                thousands='.', header=None)
        result = reader.read()

        expected = DataFrame([123456, 12500])
        tm.assert_frame_equal(result, expected)

Source File: test_textreader.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_empty_csv_input(self):
        # GH14867
        df = read_csv(StringIO(), chunksize=20, header=None,
                      names=['a', 'b', 'c'])
        assert isinstance(df, TextFileReader)

Source File: parquet.py From boxball with Apache License 2.0

5 votes

def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(get_arrow_fields(table))
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [name for name, dtype in arrow_fields if "timestamp" in dtype]

        # Using both Arrow and Pandas allows each library to cover the other's current shortcomings.
        # Pandas's read_csv can handle chunked/complex reads, while Arrow's WriteParquet can handle chunked writes.
        # Arrow's input streams are capable of handling zstd files, which Pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use snappy codec for Parquet because Drill doesn't read zstd
        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='snappy',
                                          version="2.0", use_dictionary=True)
        df_iterator: TextFileReader = pd.read_csv(reader, header=None, names=column_names, dtype=dict(pandas_fields),
                                                  true_values=map_to_bytes('T'), false_values=map_to_bytes('F'),
                                                  chunksize=BUFFER_SIZE_ROWS, parse_dates=date_cols)

        chunked_write(df_iterator, parquet_writer, date_cols)

Source File: test_common.py From recruit with Apache License 2.0

4 votes

def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # Usecols needs to be sorted in _set_noconvert_columns based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        def __init__(self):
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""

    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [
            Timestamp("2014-01-01 09:00:00"),
            Timestamp("2014-01-02 10:00:00")
        ]
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    parser = MyTextFileReader()
    parser.options = {"usecols": [0, 2, 3],
                      "parse_dates": parse_dates,
                      "delimiter": ","}
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)

Source File: common.py From vnpy_crypto with MIT License

4 votes

def test_iterator(self):
        # See gh-6607
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True)
        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.read(3)
        tm.assert_frame_equal(chunk, df[:3])

        last_chunk = reader.read(5)
        tm.assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[:2])
        tm.assert_frame_equal(chunks[1], df[2:4])
        tm.assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[1:3])

        treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                                  iterator=True)
        assert isinstance(treader, TextFileReader)

        # gh-3967: stopping iteration when chunksize is specified
        data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        reader = self.read_csv(StringIO(data), iterator=True)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
            3, 6, 9]), index=['foo', 'bar', 'baz'])
        tm.assert_frame_equal(result[0], expected)

        # chunksize = 1
        reader = self.read_csv(StringIO(data), chunksize=1)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
            3, 6, 9]), index=['foo', 'bar', 'baz'])
        assert len(result) == 3
        tm.assert_frame_equal(pd.concat(result), expected)

        # skipfooter is not supported with the C parser yet
        if self.engine == 'python':
            # test bad parameter (skipfooter)
            reader = self.read_csv(StringIO(self.data1), index_col=0,
                                   iterator=True, skipfooter=1)
            pytest.raises(ValueError, reader.read, 3)

Source File: test_parsers.py From vnpy_crypto with MIT License

4 votes

def test_override__set_noconvert_columns(self):
        # GH 17351 - usecols needs to be sorted in _setnoconvert_columns
        # based on the test_usecols_with_parse_dates test from usecols.py
        from pandas.io.parsers import CParserWrapper, TextFileReader

        s = """a,b,c,d,e
        0,1,20140101,0900,4
        0,1,20140102,1000,4"""

        parse_dates = [[1, 2]]
        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        class MyTextFileReader(TextFileReader):
            def __init__(self):
                self._currow = 0
                self.squeeze = False

        class MyCParserWrapper(CParserWrapper):
            def _set_noconvert_columns(self):
                if self.usecols_dtype == 'integer':
                    # self.usecols is a set, which is documented as unordered
                    # but in practice, a CPython set of integers is sorted.
                    # In other implementations this assumption does not hold.
                    # The following code simulates a different order, which
                    # before GH 17351 would cause the wrong columns to be
                    # converted via the parse_dates parameter
                    self.usecols = list(self.usecols)
                    self.usecols.reverse()
                return CParserWrapper._set_noconvert_columns(self)

        parser = MyTextFileReader()
        parser.options = {'usecols': [0, 2, 3],
                          'parse_dates': parse_dates,
                          'delimiter': ','}
        parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
        df = parser.read()

        tm.assert_frame_equal(df, expected)

Source File: test_common.py From predictive-maintenance-using-machine-learning with Apache License 2.0

4 votes

def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # Usecols needs to be sorted in _set_noconvert_columns based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        def __init__(self):
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""

    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [
            Timestamp("2014-01-01 09:00:00"),
            Timestamp("2014-01-02 10:00:00")
        ]
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    parser = MyTextFileReader()
    parser.options = {"usecols": [0, 2, 3],
                      "parse_dates": parse_dates,
                      "delimiter": ","}
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)

Source File: common.py From elasticintel with GNU General Public License v3.0

4 votes

def test_iterator(self):
        # See gh-6607
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True)
        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.read(3)
        tm.assert_frame_equal(chunk, df[:3])

        last_chunk = reader.read(5)
        tm.assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[:2])
        tm.assert_frame_equal(chunks[1], df[2:4])
        tm.assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[1:3])

        treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                                  iterator=True)
        assert isinstance(treader, TextFileReader)

        # gh-3967: stopping iteration when chunksize is specified
        data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        reader = self.read_csv(StringIO(data), iterator=True)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
            3, 6, 9]), index=['foo', 'bar', 'baz'])
        tm.assert_frame_equal(result[0], expected)

        # chunksize = 1
        reader = self.read_csv(StringIO(data), chunksize=1)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
            3, 6, 9]), index=['foo', 'bar', 'baz'])
        assert len(result) == 3
        tm.assert_frame_equal(pd.concat(result), expected)

        # skipfooter is not supported with the C parser yet
        if self.engine == 'python':
            # test bad parameter (skipfooter)
            reader = self.read_csv(StringIO(self.data1), index_col=0,
                                   iterator=True, skipfooter=1)
            pytest.raises(ValueError, reader.read, 3)

Source File: test_parsers.py From elasticintel with GNU General Public License v3.0

4 votes

def test_override__set_noconvert_columns(self):
        # GH 17351 - usecols needs to be sorted in _setnoconvert_columns
        # based on the test_usecols_with_parse_dates test from usecols.py
        from pandas.io.parsers import CParserWrapper, TextFileReader

        s = """a,b,c,d,e
        0,1,20140101,0900,4
        0,1,20140102,1000,4"""

        parse_dates = [[1, 2]]
        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        class MyTextFileReader(TextFileReader):
            def __init__(self):
                self._currow = 0
                self.squeeze = False

        class MyCParserWrapper(CParserWrapper):
            def _set_noconvert_columns(self):
                if self.usecols_dtype == 'integer':
                    # self.usecols is a set, which is documented as unordered
                    # but in practice, a CPython set of integers is sorted.
                    # In other implementations this assumption does not hold.
                    # The following code simulates a different order, which
                    # before GH 17351 would cause the wrong columns to be
                    # converted via the parse_dates parameter
                    self.usecols = list(self.usecols)
                    self.usecols.reverse()
                return CParserWrapper._set_noconvert_columns(self)

        parser = MyTextFileReader()
        parser.options = {'usecols': [0, 2, 3],
                          'parse_dates': parse_dates,
                          'delimiter': ','}
        parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
        df = parser.read()

        tm.assert_frame_equal(df, expected)

Source File: common.py From twitter-stock-recommendation with MIT License

4 votes

def test_iterator(self):
        # See gh-6607
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True)
        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.read(3)
        tm.assert_frame_equal(chunk, df[:3])

        last_chunk = reader.read(5)
        tm.assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[:2])
        tm.assert_frame_equal(chunks[1], df[2:4])
        tm.assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[1:3])

        treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                                  iterator=True)
        assert isinstance(treader, TextFileReader)

        # gh-3967: stopping iteration when chunksize is specified
        data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        reader = self.read_csv(StringIO(data), iterator=True)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
            3, 6, 9]), index=['foo', 'bar', 'baz'])
        tm.assert_frame_equal(result[0], expected)

        # chunksize = 1
        reader = self.read_csv(StringIO(data), chunksize=1)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
            3, 6, 9]), index=['foo', 'bar', 'baz'])
        assert len(result) == 3
        tm.assert_frame_equal(pd.concat(result), expected)

        # skipfooter is not supported with the C parser yet
        if self.engine == 'python':
            # test bad parameter (skipfooter)
            reader = self.read_csv(StringIO(self.data1), index_col=0,
                                   iterator=True, skipfooter=1)
            pytest.raises(ValueError, reader.read, 3)

Source File: test_parsers.py From twitter-stock-recommendation with MIT License

4 votes

def test_override__set_noconvert_columns(self):
        # GH 17351 - usecols needs to be sorted in _setnoconvert_columns
        # based on the test_usecols_with_parse_dates test from usecols.py
        from pandas.io.parsers import CParserWrapper, TextFileReader

        s = """a,b,c,d,e
        0,1,20140101,0900,4
        0,1,20140102,1000,4"""

        parse_dates = [[1, 2]]
        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        class MyTextFileReader(TextFileReader):
            def __init__(self):
                self._currow = 0
                self.squeeze = False

        class MyCParserWrapper(CParserWrapper):
            def _set_noconvert_columns(self):
                if self.usecols_dtype == 'integer':
                    # self.usecols is a set, which is documented as unordered
                    # but in practice, a CPython set of integers is sorted.
                    # In other implementations this assumption does not hold.
                    # The following code simulates a different order, which
                    # before GH 17351 would cause the wrong columns to be
                    # converted via the parse_dates parameter
                    self.usecols = list(self.usecols)
                    self.usecols.reverse()
                return CParserWrapper._set_noconvert_columns(self)

        parser = MyTextFileReader()
        parser.options = {'usecols': [0, 2, 3],
                          'parse_dates': parse_dates,
                          'delimiter': ','}
        parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
        df = parser.read()

        tm.assert_frame_equal(df, expected)

Python pandas.io.parsers.TextFileReader() Examples