Python pyarrow.string() Examples

The following are 30 code examples of pyarrow.string(), collected from open-source projects. The source file, project, and license are noted above each example.
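Before the examples, a minimal sketch of where pyarrow.string() typically appears: as the type argument to pyarrow.array(), and as a field type in pyarrow.field() / pyarrow.schema():

import pyarrow as pa

# pa.string() denotes Arrow's variable-length UTF-8 string type.
arr = pa.array(["a", None, "c"], type=pa.string())
assert arr.type == pa.string()

# It is also used when declaring schemas.
schema = pa.schema([pa.field("name", pa.string(), nullable=True)])
assert schema.field("name").type == pa.string()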
Example #1
Source File: test_types.py    From cjworkbench with GNU Affero General Public License v3.0
def test_dataframe_all_null_category_column(self):
        assert_arrow_table_equals(
            dataframe_to_arrow_table(
                pd.DataFrame({"A": [None]}, dtype=str).astype("category"),
                [Column("A", ColumnType.TEXT())],
                self.path,
            ),
            arrow_table(
                {
                    "A": pyarrow.DictionaryArray.from_arrays(
                        pyarrow.array([None], type=pyarrow.int8()),
                        pyarrow.array([], type=pyarrow.string()),
                    )
                }
            ),
        ) 
Example #2
Source File: base.py    From fletcher with MIT License
def __eq__(self, other) -> bool:
        """Check whether 'other' is equal to self.

        By default, 'other' is considered equal if
        * it's a string matching 'self.name'.
        * it's an instance of this type.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, str):
            return other == self.name
        elif isinstance(other, type(self)):
            return self.arrow_dtype == other.arrow_dtype
        else:
            return False 
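For orientation, a small hypothetical check of that equality contract, assuming fletcher's FletcherChunkedDtype and the "fletcher_chunked[<arrow type>]" name format shown in Example #21 below:

import pyarrow as pa
import fletcher as fr

# Hypothetical sketch: equality holds against both a matching name string
# and another instance with the same Arrow type.
dtype = fr.FletcherChunkedDtype(pa.string())
assert dtype == "fletcher_chunked[string]"
assert dtype == fr.FletcherChunkedDtype(pa.string())
assert dtype != fr.FletcherChunkedDtype(pa.int64())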
Example #3
Source File: test_numba_integration.py    From fletcher with MIT License
def test_string_builder_simple(data):
    builder = NumbaStringArrayBuilder(2, 6)

    for s in data:
        if s is None:
            builder.finish_null()
            continue

        for c in s:
            builder.put_byte(ord(c))

        builder.finish_string()

    builder.finish()

    expected = pa.array(data, pa.string())
    missing, offsets, data = buffers_as_arrays(expected)

    np.testing.assert_array_equal(builder.offsets, offsets)
    np.testing.assert_array_equal(builder.data, data) 
Example #4
Source File: types.py    From LearningApacheSpark with MIT License
def __init__(self, name, dataType, nullable=True, metadata=None):
        """
        >>> (StructField("f1", StringType(), True)
        ...      == StructField("f1", StringType(), True))
        True
        >>> (StructField("f1", StringType(), True)
        ...      == StructField("f2", StringType(), True))
        False
        """
        assert isinstance(dataType, DataType),\
            "dataType %s should be an instance of %s" % (dataType, DataType)
        # 'basestring' is aliased to 'str' on Python 3 elsewhere in this module
        assert isinstance(name, basestring), "field name %s should be string" % (name)
        if not isinstance(name, str):
            name = name.encode('utf-8')
        self.name = name
        self.dataType = dataType
        self.nullable = nullable
        self.metadata = metadata or {} 
Example #5
Source File: test_text.py    From fletcher with MIT License
def test_text_zfill(data, fletcher_variant):
    if any("\x00" in x for x in data if x):
        # pytest.skip("pandas cannot handle \\x00 characters in tests")
        # Skip is not working properly with hypothesis
        return
    ser_pd = pd.Series(data, dtype=str)
    max_str_len = ser_pd.map(_optional_len).max()
    if pd.isna(max_str_len):
        max_str_len = 0
    arrow_data = pa.array(data, type=pa.string())
    if fletcher_variant == "chunked":
        fr_array = fr.FletcherChunkedArray(arrow_data)
    else:
        fr_array = fr.FletcherContinuousArray(arrow_data)
    ser_fr = pd.Series(fr_array)

    result_pd = ser_pd.str.zfill(max_str_len + 1)
    result_fr = ser_fr.fr_text.zfill(max_str_len + 1)
    result_fr = result_fr.astype(object)
    # Pandas represents missing values as np.nan; align the fletcher result for comparison
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd) 
Example #6
Source File: test__pandas_helpers.py    From python-bigquery with Apache License 2.0
def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
    dict_schema = [
        {"name": "field01", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field02", "type": "BOOL", "mode": "NULLABLE"},
    ]

    dataframe = pandas.DataFrame(
        {"field01": [u"hello", u"world"], "field02": [True, False]}
    )

    arrow_table = module_under_test.dataframe_to_arrow(dataframe, dict_schema)
    arrow_schema = arrow_table.schema

    expected_fields = [
        pyarrow.field("field01", "string", nullable=False),
        pyarrow.field("field02", "bool", nullable=True),
    ]
    assert list(arrow_schema) == expected_fields 
Example #7
Source File: string.py    From fletcher with MIT License
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    if len(a) != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)
    if len(a) > 0:
        valid = _merge_valid_bitmaps(a, b)
        result_offsets = np.empty(len(a) + 1, dtype=np.int32)
        result_offsets[0] = 0
        total_size = (offsets_a[-1] - offsets_a[0]) + (offsets_b[-1] - offsets_b[0])
        result_data = np.empty(total_size, dtype=np.uint8)
        _merge_string_data(
            len(a),
            valid,
            offsets_a,
            data_a,
            offsets_b,
            data_b,
            result_offsets,
            result_data,
        )
        buffers = [pa.py_buffer(x) for x in [valid, result_offsets, result_data]]
        return pa.Array.from_buffers(pa.string(), len(a), buffers)
    return a 
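The pa.Array.from_buffers call above relies on Arrow's physical layout for pa.string(): a validity bitmap, an int32 offsets buffer with len + 1 entries, and a UTF-8 data buffer. A minimal standalone sketch of that layout (values chosen purely for illustration):

import numpy as np
import pyarrow as pa

# pa.string() arrays are built from three buffers: validity bitmap,
# int32 offsets (len + 1 entries), and UTF-8 data bytes.
offsets = np.array([0, 2, 2, 5], dtype=np.int32)        # "ab", null, "cde"
data = np.frombuffer(b"abcde", dtype=np.uint8)
validity = np.packbits([1, 0, 1], bitorder="little")    # one bit per value, LSB first
buffers = [pa.py_buffer(validity), pa.py_buffer(offsets), pa.py_buffer(data)]
arr = pa.Array.from_buffers(pa.string(), 3, buffers)
assert arr.to_pylist() == ["ab", None, "cde"]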
Example #8
Source File: parquet_util.py    From professional-services with Apache License 2.0
def get_pa_translated_schema(self):
        """Translates a BigQuery schema to an parquet schema.

        Returns: Translated parquet schema in pyarrow.Schema format.
        """

        type_conversions = {
            'STRING': pa.string(),
            'NUMERIC': pa.int64(),
        }

        # TODO(annarudy@google.com): add support for nested fields
        pa_schema_list = [
            pa.field(
                bq_field.name,
                type_conversions[bq_field.field_type],
            ) for bq_field in self.bq_schema
        ]

        return pa.schema(pa_schema_list) 
Example #9
Source File: test_db.py    From aws-data-wrangler with Apache License 2.0
def test_redshift_spectrum_long_string(path, glue_table, glue_database, redshift_external_schema):
    df = pd.DataFrame(
        {
            "id": [1, 2],
            "col_str": [
                "".join(random.choice(string.ascii_letters) for _ in range(300)),
                "".join(random.choice(string.ascii_letters) for _ in range(300)),
            ],
        }
    )
    paths = wr.s3.to_parquet(
        df=df, path=path, database=glue_database, table=glue_table, mode="overwrite", index=False, dataset=True
    )["paths"]
    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    with engine.connect() as con:
        cursor = con.execute(f"SELECT * FROM {redshift_external_schema}.{glue_table}")
        rows = cursor.fetchall()
        assert len(rows) == len(df.index)
        for row in rows:
            assert len(row) == len(df.columns) 
Example #10
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_num_bytes_getter(self):
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        table = self._make_one(table_ref)

        # Check with no value set.
        self.assertIsNone(table.num_bytes)

        num_bytes = 1337
        # Check with integer value set.
        table._properties = {"numBytes": num_bytes}
        self.assertEqual(table.num_bytes, num_bytes)

        # Check with a string value set.
        table._properties = {"numBytes": str(num_bytes)}
        self.assertEqual(table.num_bytes, num_bytes)

        # Check with invalid int value.
        table._properties = {"numBytes": "x"}
        with self.assertRaises(ValueError):
            getattr(table, "num_bytes") 
Example #11
Source File: dataset.py    From kartothek with MIT License
def load_from_buffer(
        buf, store: KeyValueStore, format: str = "json"
    ) -> "DatasetMetadata":
        """
        Load a dataset from a (string) buffer.

        Parameters
        ----------
        buf:
            Input to be parsed.
        store:
            Object that implements the .get method for file/object loading.
        format:
            Serialization format of the buffer, either "json" or "msgpack".

        Returns
        -------
        dataset_metadata:
            Parsed metadata.
        """
        if format == "json":
            metadata = load_json(buf)
        elif format == "msgpack":
            metadata = msgpack.unpackb(buf)
        else:
            raise ValueError("Unknown format: %r" % format)
        return DatasetMetadata.load_from_dict(metadata, store)
Example #12
Source File: test_numba_integration.py    From fletcher with MIT License
def test_str_length(array, expected, offset):
    array = pa.array(array, pa.string())[offset:]
    np.testing.assert_array_equal(
        str_length(NumbaStringArray.make(array)),  # type: ignore
        np.asarray(expected[offset:], dtype=np.int32),
    ) 
Example #13
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0
def coerce(cls, value: mtypes.Message) -> I18nMessage:
        """ Convert an internationalized message as returned from modules to an object of this dataclass.
        
        Raises:
        - ValueError, if the value is a list of the wrong length or if the value is of a non-supported type
        
        """
        if isinstance(value, str):
            return cls.TODO_i18n(value)
        elif isinstance(value, tuple):
            if len(value) < 2 or len(value) > 3:
                raise ValueError(
                    "This tuple cannot be coerced to I18nMessage: %s" % value
                )
            if not isinstance(value[0], str):
                raise ValueError(
                    "Message ID must be string, got %s" % type(value[0]).__name__
                )
            if not isinstance(value[1], dict):
                raise ValueError(
                    "Message arguments must be a dict, got %s" % type(value[1]).__name__
                )
            if len(value) == 3:
                source = value[2]
                if source not in ["module", "cjwmodule", "cjwparse", None]:
                    raise ValueError("Invalid i18n message source %r" % source)
            else:
                source = None
            return cls(value[0], value[1], source)
        else:
            raise ValueError(
                "%s is of type %s, which cannot be coerced to I18nMessage"
                % (value, type(value).__name__)
            ) 
Example #14
Source File: test_types.py    From cjworkbench with GNU Affero General Public License v3.0
def test_dataframe_all_null_text_column(self):
        assert_arrow_table_equals(
            dataframe_to_arrow_table(
                pd.DataFrame({"A": [None]}, dtype=str),
                [Column("A", ColumnType.TEXT())],
                self.path,
            ),
            arrow_table({"A": pyarrow.array([None], pyarrow.string())}),
        ) 
Example #15
Source File: test_types.py    From cjworkbench with GNU Affero General Public License v3.0
def test_coerce_from_string(self):
        self.assertEqual(
            I18nMessage.coerce("some string"),
            I18nMessage("TODO_i18n", {"text": "some string"}),
        ) 
Example #16
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0
def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    if dtype == np.int8:
        return pyarrow.int8()
    elif dtype == np.int16:
        return pyarrow.int16()
    elif dtype == np.int32:
        return pyarrow.int32()
    elif dtype == np.int64:
        return pyarrow.int64()
    elif dtype == np.uint8:
        return pyarrow.uint8()
    elif dtype == np.uint16:
        return pyarrow.uint16()
    elif dtype == np.uint32:
        return pyarrow.uint32()
    elif dtype == np.uint64:
        return pyarrow.uint64()
    elif dtype == np.float16:
        return pyarrow.float16()
    elif dtype == np.float32:
        return pyarrow.float32()
    elif dtype == np.float64:
        return pyarrow.float64()
    elif dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    elif dtype == np.object_:
        return pyarrow.string()
    else:
        raise RuntimeError("Unhandled dtype %r" % dtype) 
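For context, a hypothetical use of the helper above: derive an Arrow schema from a DataFrame's dtypes, with object columns mapping to pyarrow.string(). This assumes _dtype_to_arrow_type is in scope:

import numpy as np
import pandas as pd
import pyarrow

# Hypothetical usage sketch: object-dtype columns map to pyarrow.string().
df = pd.DataFrame({"name": ["a", "b"], "count": np.array([1, 2], dtype=np.int64)})
schema = pyarrow.schema(
    [pyarrow.field(name, _dtype_to_arrow_type(dtype)) for name, dtype in df.dtypes.items()]
)
assert schema.field("name").type == pyarrow.string()
assert schema.field("count").type == pyarrow.int64()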
Example #17
Source File: parquet_pio.py    From sdc with BSD 2-Clause "Simplified" License
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
        pa.timestamp('ms'): types.NPDatetime('ns'),
        pa.timestamp('s'): types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ] 
Example #18
Source File: test_types.py    From cjworkbench with GNU Affero General Public License v3.0
def test_from_string(self):
        self.assertEqual(
            ProcessResultError.coerce("some string"),
            ProcessResultError(I18nMessage.TODO_i18n("some string")),
        ) 
Example #19
Source File: csv_ext.py    From sdc with BSD 2-Clause "Simplified" License
def to_varname(string):
    """Converts string to correct Python variable name.
    Replaces unavailable symbols with _ and insert _ if string starts with digit.
    """
    import re
    return re.sub(r'\W|^(?=\d)','_', string) 
Example #20
Source File: parquet_pio.py    From sdc with BSD 2-Clause "Simplified" License
def generic(self, args, kws):
        assert not kws
        assert len(args) == 4
        if args[2] == types.intp:  # string read call, returns string array
            return signature(string_array_type, *unliteral_all(args))
        # array_ty = types.Array(ndim=1, layout='C', dtype=args[2])
        return signature(types.int64, *unliteral_all(args)) 
Example #21
Source File: base.py    From fletcher with MIT License
def __str__(self) -> str:
        """Convert to string."""
        return f"fletcher_chunked[{self.arrow_dtype}]" 
Example #22
Source File: benchmarks.py    From fletcher with MIT License
def setup(self):
        array = generate_test_array_non_null(2 ** 17)
        self.df = pd.DataFrame({"str": array})
        self.df_ext = pd.DataFrame(
            {"str": fr.FletcherChunkedArray(pa.array(array, pa.string()))}
        ) 
Example #23
Source File: test_pandas_integration.py    From fletcher with MIT License
def test_concatenate_blocks():
    v1 = fr.FletcherChunkedArray(TEST_ARRAY)
    s = pd.Series(v1, index=pd.RangeIndex(3), fastpath=True)
    result = pd.concat([s, s], ignore_index=True)
    expected = pd.Series(
        fr.FletcherChunkedArray(
            pa.array(["Test", "string", None, "Test", "string", None])
        )
    )
    tm.assert_series_equal(result, expected)


Example #24
Source File: benchmarks.py    From fletcher with MIT License
def setup(self):
        array = generate_test_array(2 ** 17)
        self.df = pd.DataFrame({"str": array})
        self.df_ext = pd.DataFrame(
            {"str": fr.FletcherChunkedArray(pa.array(array, pa.string()))}
        ) 
Example #25
Source File: test_text.py    From fletcher with MIT License
def _fr_series_from_data(data, fletcher_variant, dtype=pa.string()):
    arrow_data = pa.array(data, type=dtype)
    if fletcher_variant == "chunked":
        fr_array = fr.FletcherChunkedArray(arrow_data)
    else:
        fr_array = fr.FletcherContinuousArray(arrow_data)
    return pd.Series(fr_array) 
Example #26
Source File: benchmarks.py    From fletcher with MIT License
def setup(self, chunked, value, indices):
        # assert np.isscalar(values) or len(values) == len(indices)
        array = generate_test_array(self.n)
        if indices == "int":
            if value == "array_value":
                raise NotImplementedError()
            self.indexer = 50
        elif indices == "int_array":
            self.indexer = list(range(0, self.n, 5))
        elif indices == "bool_array":
            self.indexer = np.zeros(self.n, dtype=bool)
            self.indexer[list(range(0, self.n, 5))] = True
        elif indices == "slice":
            self.indexer = slice(0, self.n, 5)

        if value == "scalar_value":
            self.value = "setitem"
        elif value == "array_value":
            self.value = [str(x) for x in range(self.n)]
            self.value = np.array(self.value)[self.indexer]
            if len(self.value) == 1:
                self.value = self.value[0]

        self.df = pd.DataFrame({"str": array})
        if chunked:
            array = np.array_split(array, 1000)
        else:
            array = [array]
        self.df_ext = pd.DataFrame(
            {
                "str": fr.FletcherChunkedArray(
                    pa.chunked_array([pa.array(chunk, pa.string()) for chunk in array])
                )
            }
        ) 
Example #27
Source File: test_pandas_integration.py    From fletcher with MIT License
def test_nbytes():
    array = fr.FletcherChunkedArray(pa.array(["A", None, "CC"]))
    # Minimal storage usage:
    # 1 byte for the validity bitmap
    # 16 bytes for the offset array (four int32 entries)
    # 3 bytes for the actual string content
    # The assert below is only a loose lower bound on that total.
    assert array.nbytes >= 8 
Example #28
Source File: test_pandas_integration.py    From fletcher with MIT License
def test_getitem_scalar():
    ser = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    result = ser[1]
    assert result == "string" 
Example #29
Source File: test_io.py    From fletcher with MIT License
def test_read_parquet(tmpdir, continuous):
    str_arr = pa.array(["a", None, "c"], pa.string())
    int_arr = pa.array([1, None, -2], pa.int32())
    bool_arr = pa.array([True, None, False], pa.bool_())
    table = pa.Table.from_arrays([str_arr, int_arr, bool_arr], ["str", "int", "bool"])

    fn = str(tmpdir.join("df.parquet"))
    pq.write_table(table, fn)
    result = fr.read_parquet(fn, continuous=continuous)
    expected = fr.pandas_from_arrow(table, continuous=continuous)
    tm.assert_frame_equal(result, expected) 
Example #30
Source File: test_base.py    From fletcher with MIT License
def array_inhom_chunks():
    chunk1 = pa.array(list("abc"), pa.string())
    chunk2 = pa.array(list("12345"), pa.string())
    chunk3 = pa.array(list("Z"), pa.string())
    chunked_array = pa.chunked_array([chunk1, chunk2, chunk3])
    return fr.FletcherChunkedArray(chunked_array)
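As a closing note, a small sketch of how such inhomogeneous chunks behave: the chunked array presents one logical string column regardless of chunk boundaries (values here are illustrative):

import pyarrow as pa

# Chunk boundaries are an implementation detail of the logical column.
chunks = [pa.array(list("abc"), pa.string()), pa.array(list("12345"), pa.string())]
ca = pa.chunked_array(chunks)
assert ca.type == pa.string()
assert ca.num_chunks == 2
assert ca.to_pylist() == ["a", "b", "c", "1", "2", "3", "4", "5"]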