Python pyarrow.string() Examples

The following are 30 code examples of pyarrow.string(), collected from open-source projects. The source file, project, and license are noted above each example.
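Before the examples, a minimal sketch of where pyarrow.string() typically appears: as the type argument to pyarrow.array(), and as a field type in pyarrow.field() / pyarrow.schema():

import pyarrow as pa

# pa.string() denotes Arrow's variable-length UTF-8 string type.
arr = pa.array(["a", None, "c"], type=pa.string())
assert arr.type == pa.string()

# It is also used when declaring schemas.
schema = pa.schema([pa.field("name", pa.string(), nullable=True)])
assert schema.field("name").type == pa.string()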
Example #1
Source File: test_types.py    From cjworkbench with GNU Affero General Public License v3.0
def test_dataframe_all_null_category_column(self):
        assert_arrow_table_equals(
            dataframe_to_arrow_table(
                pd.DataFrame({"A": [None]}, dtype=str).astype("category"),
                [Column("A", ColumnType.TEXT())],
                self.path,
            ),
            arrow_table(
                {
                    "A": pyarrow.DictionaryArray.from_arrays(
                        pyarrow.array([None], type=pyarrow.int8()),
                        pyarrow.array([], type=pyarrow.string()),
                    )
                }
            ),
        ) 
Example #2
Source File: base.py    From fletcher with MIT License
def __eq__(self, other) -> bool:
        """Check whether 'other' is equal to self.

        By default, 'other' is considered equal if
        * it's a string matching 'self.name'.
        * it's an instance of this type.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, str):
            return other == self.name
        elif isinstance(other, type(self)):
            return self.arrow_dtype == other.arrow_dtype
        else:
            return False 
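For orientation, a small hypothetical check of that equality contract, assuming fletcher's FletcherChunkedDtype and the "fletcher_chunked[<arrow type>]" name format shown in Example #21 below:

import pyarrow as pa
import fletcher as fr

# Hypothetical sketch: equality holds against both a matching name string
# and another instance with the same Arrow type.
dtype = fr.FletcherChunkedDtype(pa.string())
assert dtype == "fletcher_chunked[string]"
assert dtype == fr.FletcherChunkedDtype(pa.string())
assert dtype != fr.FletcherChunkedDtype(pa.int64())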
Example #3
Source File: test_numba_integration.py    From fletcher with MIT License
def test_string_builder_simple(data):
    builder = NumbaStringArrayBuilder(2, 6)

    for s in data:
        if s is None:
            builder.finish_null()
            continue

        for c in s:
            builder.put_byte(ord(c))

        builder.finish_string()

    builder.finish()

    expected = pa.array(data, pa.string())
    missing, offsets, data = buffers_as_arrays(expected)

    np.testing.assert_array_equal(builder.offsets, offsets)
    np.testing.assert_array_equal(builder.data, data) 
Example #4
Source File: types.py    From LearningApacheSpark with MIT License
def __init__(self, name, dataType, nullable=True, metadata=None):
        """
        >>> (StructField("f1", StringType(), True)
        ...      == StructField("f1", StringType(), True))
        True
        >>> (StructField("f1", StringType(), True)
        ...      == StructField("f2", StringType(), True))
        False
        """
        assert isinstance(dataType, DataType),\
            "dataType %s should be an instance of %s" % (dataType, DataType)
        # 'basestring' is aliased to 'str' on Python 3 elsewhere in this module
        assert isinstance(name, basestring), "field name %s should be string" % (name)
        if not isinstance(name, str):
            name = name.encode('utf-8')
        self.name = name
        self.dataType = dataType
        self.nullable = nullable
        self.metadata = metadata or {} 
Example #5
Source File: test_text.py    From fletcher with MIT License
def test_text_zfill(data, fletcher_variant):
    if any("\x00" in x for x in data if x):
        # pytest.skip("pandas cannot handle \\x00 characters in tests")
        # Skip is not working properly with hypothesis
        return
    ser_pd = pd.Series(data, dtype=str)
    max_str_len = ser_pd.map(_optional_len).max()
    if pd.isna(max_str_len):
        max_str_len = 0
    arrow_data = pa.array(data, type=pa.string())
    if fletcher_variant == "chunked":
        fr_array = fr.FletcherChunkedArray(arrow_data)
    else:
        fr_array = fr.FletcherContinuousArray(arrow_data)
    ser_fr = pd.Series(fr_array)

    result_pd = ser_pd.str.zfill(max_str_len + 1)
    result_fr = ser_fr.fr_text.zfill(max_str_len + 1)
    result_fr = result_fr.astype(object)
    # Pandas represents missing values as np.nan; align the fletcher result for comparison
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd) 
Example #6
Source File: test__pandas_helpers.py    From python-bigquery with Apache License 2.0
def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
    dict_schema = [
        {"name": "field01", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field02", "type": "BOOL", "mode": "NULLABLE"},
    ]

    dataframe = pandas.DataFrame(
        {"field01": [u"hello", u"world"], "field02": [True, False]}
    )

    arrow_table = module_under_test.dataframe_to_arrow(dataframe, dict_schema)
    arrow_schema = arrow_table.schema

    expected_fields = [
        pyarrow.field("field01", "string", nullable=False),
        pyarrow.field("field02", "bool", nullable=True),
    ]
    assert list(arrow_schema) == expected_fields 
Example #7
Source File: string.py    From fletcher with MIT License
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    if len(a) != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)
    if len(a) > 0:
        valid = _merge_valid_bitmaps(a, b)
        result_offsets = np.empty(len(a) + 1, dtype=np.int32)
        result_offsets[0] = 0
        total_size = (offsets_a[-1] - offsets_a[0]) + (offsets_b[-1] - offsets_b[0])
        result_data = np.empty(total_size, dtype=np.uint8)
        _merge_string_data(
            len(a),
            valid,
            offsets_a,
            data_a,
            offsets_b,
            data_b,
            result_offsets,
            result_data,
        )
        buffers = [pa.py_buffer(x) for x in [valid, result_offsets, result_data]]
        return pa.Array.from_buffers(pa.string(), len(a), buffers)
    return a 
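The pa.Array.from_buffers call above relies on Arrow's physical layout for pa.string(): a validity bitmap, an int32 offsets buffer with len + 1 entries, and a UTF-8 data buffer. A minimal standalone sketch of that layout (values chosen purely for illustration):

import numpy as np
import pyarrow as pa

# pa.string() arrays are built from three buffers: validity bitmap,
# int32 offsets (len + 1 entries), and UTF-8 data bytes.
offsets = np.array([0, 2, 2, 5], dtype=np.int32)        # "ab", null, "cde"
data = np.frombuffer(b"abcde", dtype=np.uint8)
validity = np.packbits([1, 0, 1], bitorder="little")    # one bit per value, LSB first
buffers = [pa.py_buffer(validity), pa.py_buffer(offsets), pa.py_buffer(data)]
arr = pa.Array.from_buffers(pa.string(), 3, buffers)
assert arr.to_pylist() == ["ab", None, "cde"]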
Example #8
Source File: parquet_util.py    From professional-services with Apache License 2.0
def get_pa_translated_schema(self):
        """Translates a BigQuery schema to an parquet schema.

        Returns: Translated parquet schema in pyarrow.Schema format.
        """

        type_conversions = {
            'STRING': pa.string(),
            'NUMERIC': pa.int64(),
        }

        # TODO(annarudy@google.com): add support for nested fields
        pa_schema_list = [
            pa.field(
                bq_field.name,
                type_conversions[bq_field.field_type],
            ) for bq_field in self.bq_schema
        ]

        return pa.schema(pa_schema_list) 
Example #9
Source File: test_db.py    From aws-data-wrangler with Apache License 2.0
def test_redshift_spectrum_long_string(path, glue_table, glue_database, redshift_external_schema):
    df = pd.DataFrame(
        {
            "id": [1, 2],
            "col_str": [
                "".join(random.choice(string.ascii_letters) for _ in range(300)),
                "".join(random.choice(string.ascii_letters) for _ in range(300)),
            ],
        }
    )
    paths = wr.s3.to_parquet(
        df=df, path=path, database=glue_database, table=glue_table, mode="overwrite", index=False, dataset=True
    )["paths"]
    wr.s3.wait_objects_exist(paths=paths, use_threads=False)
    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    with engine.connect() as con:
        cursor = con.execute(f"SELECT * FROM {redshift_external_schema}.{glue_table}")
        rows = cursor.fetchall()
        assert len(rows) == len(df.index)
        for row in rows:
            assert len(row) == len(df.columns) 
Example #10
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_num_bytes_getter(self):
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        table = self._make_one(table_ref)

        # Check with no value set.
        self.assertIsNone(table.num_bytes)

        num_bytes = 1337
        # Check with integer value set.
        table._properties = {"numBytes": num_bytes}
        self.assertEqual(table.num_bytes, num_bytes)

        # Check with a string value set.
        table._properties = {"numBytes": str(num_bytes)}
        self.assertEqual(table.num_bytes, num_bytes)

        # Check with invalid int value.
        table._properties = {"numBytes": "x"}
        with self.assertRaises(ValueError):
            getattr(table, "num_bytes") 
Example #11
Source File: dataset.py    From kartothek with MIT License
def load_from_buffer(
        buf, store: KeyValueStore, format: str = "json"
    ) -> "DatasetMetadata":
        """
        Load a dataset from a (string) buffer.

        Parameters
        ----------
        buf:
            Input to be parsed.
        store:
            Object that implements the .get method for file/object loading.
        format:
            Serialization format of the buffer, either "json" or "msgpack".

        Returns
        -------
        dataset_metadata:
            Parsed metadata.
        """
        if format == "json":
            metadata = load_json(buf)
        elif format == "msgpack":
            metadata = msgpack.unpackb(buf)
        else:
            raise ValueError("Unknown format: %r" % format)
        return DatasetMetadata.load_from_dict(metadata, store)
Example #12
Source File: test_numba_integration.py    From fletcher with MIT License
def test_str_length(array, expected, offset):
    array = pa.array(array, pa.string())[offset:]
    np.testing.assert_array_equal(
        str_length(NumbaStringArray.make(array)),  # type: ignore
        np.asarray(expected[offset:], dtype=np.int32),
    ) 
Example #13
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0
def coerce(cls, value: mtypes.Message) -> I18nMessage:
        """ Convert an internationalized message as returned from modules to an object of this dataclass.
        
        Raises:
        - ValueError, if the value is a list of the wrong length or if the value is of a non-supported type
        
        """
        if isinstance(value, str):
            return cls.TODO_i18n(value)
        elif isinstance(value, tuple):
            if len(value) < 2 or len(value) > 3:
                raise ValueError(
                    "This tuple cannot be coerced to I18nMessage: %s" % value
                )
            if not isinstance(value[0], str):
                raise ValueError(
                    "Message ID must be string, got %s" % type(value[0]).__name__
                )
            if not isinstance(value[1], dict):
                raise ValueError(
                    "Message arguments must be a dict, got %s" % type(value[1]).__name__
                )
            if len(value) == 3:
                source = value[2]
                if source not in ["module", "cjwmodule", "cjwparse", None]:
                    raise ValueError("Invalid i18n message source %r" % source)
            else:
                source = None
            return cls(value[0], value[1], source)
        else:
            raise ValueError(
                "%s is of type %s, which cannot be coerced to I18nMessage"
                % (value, type(value).__name__)
            ) 
Example #14
Source File: test_types.py    From cjworkbench with GNU Affero General Public License v3.0
def test_dataframe_all_null_text_column(self):
        assert_arrow_table_equals(
            dataframe_to_arrow_table(
                pd.DataFrame({"A": [None]}, dtype=str),
                [Column("A", ColumnType.TEXT())],
                self.path,
            ),
            arrow_table({"A": pyarrow.array([None], pyarrow.string())}),
        ) 
Example #15
Source File: test_types.py    From cjworkbench with GNU Affero General Public License v3.0
def test_coerce_from_string(self):
        self.assertEqual(
            I18nMessage.coerce("some string"),
            I18nMessage("TODO_i18n", {"text": "some string"}),
        ) 
Example #16
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0
def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    if dtype == np.int8:
        return pyarrow.int8()
    elif dtype == np.int16:
        return pyarrow.int16()
    elif dtype == np.int32:
        return pyarrow.int32()
    elif dtype == np.int64:
        return pyarrow.int64()
    elif dtype == np.uint8:
        return pyarrow.uint8()
    elif dtype == np.uint16:
        return pyarrow.uint16()
    elif dtype == np.uint32:
        return pyarrow.uint32()
    elif dtype == np.uint64:
        return pyarrow.uint64()
    elif dtype == np.float16:
        return pyarrow.float16()
    elif dtype == np.float32:
        return pyarrow.float32()
    elif dtype == np.float64:
        return pyarrow.float64()
    elif dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    elif dtype == np.object_:
        return pyarrow.string()
    else:
        raise RuntimeError("Unhandled dtype %r" % dtype) 
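For context, a hypothetical use of the helper above: derive an Arrow schema from a DataFrame's dtypes, with object columns mapping to pyarrow.string(). This assumes _dtype_to_arrow_type is in scope:

import numpy as np
import pandas as pd
import pyarrow

# Hypothetical usage sketch: object-dtype columns map to pyarrow.string().
df = pd.DataFrame({"name": ["a", "b"], "count": np.array([1, 2], dtype=np.int64)})
schema = pyarrow.schema(
    [pyarrow.field(name, _dtype_to_arrow_type(dtype)) for name, dtype in df.dtypes.items()]
)
assert schema.field("name").type == pyarrow.string()
assert schema.field("count").type == pyarrow.int64()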
Example #17
Source File: parquet_pio.py    From sdc with BSD 2-Clause "Simplified" License
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
        pa.timestamp('ms'): types.NPDatetime('ns'),
        pa.timestamp('s'): types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ] 
Example #18
Source File: test_types.py    From cjworkbench with GNU Affero General Public License v3.0
def test_from_string(self):
        self.assertEqual(
            ProcessResultError.coerce("some string"),
            ProcessResultError(I18nMessage.TODO_i18n("some string")),
        ) 
Example #19
Source File: csv_ext.py    From sdc with BSD 2-Clause "Simplified" License
def to_varname(string):
    """Converts string to correct Python variable name.
    Replaces unavailable symbols with _ and insert _ if string starts with digit.
    """
    import re
    return re.sub(r'\W|^(?=\d)','_', string) 
Example #20
Source File: parquet_pio.py    From sdc with BSD 2-Clause "Simplified" License
def generic(self, args, kws):
        assert not kws
        assert len(args) == 4
        if args[2] == types.intp:  # string read call, returns string array
            return signature(string_array_type, *unliteral_all(args))
        # array_ty = types.Array(ndim=1, layout='C', dtype=args[2])
        return signature(types.int64, *unliteral_all(args)) 
Example #21
Source File: base.py    From fletcher with MIT License
def __str__(self) -> str:
        """Convert to string."""
        return f"fletcher_chunked[{self.arrow_dtype}]" 
Example #22
Source File: benchmarks.py    From fletcher with MIT License
def setup(self):
        array = generate_test_array_non_null(2 ** 17)
        self.df = pd.DataFrame({"str": array})
        self.df_ext = pd.DataFrame(
            {"str": fr.FletcherChunkedArray(pa.array(array, pa.string()))}
        ) 
Example #23
Source File: test_pandas_integration.py    From fletcher with MIT License
def test_concatenate_blocks():
    v1 = fr.FletcherChunkedArray(TEST_ARRAY)
    s = pd.Series(v1, index=pd.RangeIndex(3), fastpath=True)
    result = pd.concat([s, s], ignore_index=True)
    expected = pd.Series(
        fr.FletcherChunkedArray(
            pa.array(["Test", "string", None, "Test", "string", None])
        )
    )
    tm.assert_series_equal(result, expected)


Example #24
Source File: benchmarks.py    From fletcher with MIT License
def setup(self):
        array = generate_test_array(2 ** 17)
        self.df = pd.DataFrame({"str": array})
        self.df_ext = pd.DataFrame(
            {"str": fr.FletcherChunkedArray(pa.array(array, pa.string()))}
        ) 
Example #25
Source File: test_text.py    From fletcher with MIT License
def _fr_series_from_data(data, fletcher_variant, dtype=pa.string()):
    arrow_data = pa.array(data, type=dtype)
    if fletcher_variant == "chunked":
        fr_array = fr.FletcherChunkedArray(arrow_data)
    else:
        fr_array = fr.FletcherContinuousArray(arrow_data)
    return pd.Series(fr_array) 
Example #26
Source File: benchmarks.py    From fletcher with MIT License
def setup(self, chunked, value, indices):
        # assert np.isscalar(values) or len(values) == len(indices)
        array = generate_test_array(self.n)
        if indices == "int":
            if value == "array_value":
                raise NotImplementedError()
            self.indexer = 50
        elif indices == "int_array":
            self.indexer = list(range(0, self.n, 5))
        elif indices == "bool_array":
            self.indexer = np.zeros(self.n, dtype=bool)
            self.indexer[list(range(0, self.n, 5))] = True
        elif indices == "slice":
            self.indexer = slice(0, self.n, 5)

        if value == "scalar_value":
            self.value = "setitem"
        elif value == "array_value":
            self.value = [str(x) for x in range(self.n)]
            self.value = np.array(self.value)[self.indexer]
            if len(self.value) == 1:
                self.value = self.value[0]

        self.df = pd.DataFrame({"str": array})
        if chunked:
            array = np.array_split(array, 1000)
        else:
            array = [array]
        self.df_ext = pd.DataFrame(
            {
                "str": fr.FletcherChunkedArray(
                    pa.chunked_array([pa.array(chunk, pa.string()) for chunk in array])
                )
            }
        ) 
Example #27
Source File: test_pandas_integration.py    From fletcher with MIT License
def test_nbytes():
    array = fr.FletcherChunkedArray(pa.array(["A", None, "CC"]))
    # Minimal storage usage:
    # 1 byte for the validity bitmap
    # 16 bytes for the offset array (four int32 entries)
    # 3 bytes for the actual string content
    # The assert below is only a loose lower bound on that total.
    assert array.nbytes >= 8 
Example #28
Source File: test_pandas_integration.py    From fletcher with MIT License
def test_getitem_scalar():
    ser = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    result = ser[1]
    assert result == "string" 
Example #29
Source File: test_io.py    From fletcher with MIT License
def test_read_parquet(tmpdir, continuous):
    str_arr = pa.array(["a", None, "c"], pa.string())
    int_arr = pa.array([1, None, -2], pa.int32())
    bool_arr = pa.array([True, None, False], pa.bool_())
    table = pa.Table.from_arrays([str_arr, int_arr, bool_arr], ["str", "int", "bool"])

    fn = str(tmpdir.join("df.parquet"))
    pq.write_table(table, fn)
    result = fr.read_parquet(fn, continuous=continuous)
    expected = fr.pandas_from_arrow(table, continuous=continuous)
    tm.assert_frame_equal(result, expected) 
Example #30
Source File: test_base.py    From fletcher with MIT License
def array_inhom_chunks():
    chunk1 = pa.array(list("abc"), pa.string())
    chunk2 = pa.array(list("12345"), pa.string())
    chunk3 = pa.array(list("Z"), pa.string())
    chunked_array = pa.chunked_array([chunk1, chunk2, chunk3])
    return fr.FletcherChunkedArray(chunked_array)
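As a closing note, a small sketch of how such inhomogeneous chunks behave: the chunked array presents one logical string column regardless of chunk boundaries (values here are illustrative):

import pyarrow as pa

# Chunk boundaries are an implementation detail of the logical column.
chunks = [pa.array(list("abc"), pa.string()), pa.array(list("12345"), pa.string())]
ca = pa.chunked_array(chunks)
assert ca.type == pa.string()
assert ca.num_chunks == 2
assert ca.to_pylist() == ["a", "b", "c", "1", "2", "3", "4", "5"]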