Python pyarrow.string() Examples
The following are 30 code examples of pyarrow.string().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyarrow, or try the search function.
Example #1
Source File: test_types.py From cjworkbench with GNU Affero General Public License v3.0 | 6 votes |
def test_dataframe_all_null_category_column(self):
    """An all-null category column becomes a dictionary array with no values."""
    null_categories = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    actual = dataframe_to_arrow_table(
        null_categories,
        [Column("A", ColumnType.TEXT())],
        self.path,
    )
    # One null index (int8) pointing into an empty string dictionary.
    expected_column = pyarrow.DictionaryArray.from_arrays(
        pyarrow.array([None], type=pyarrow.int8()),
        pyarrow.array([], type=pyarrow.string()),
    )
    assert_arrow_table_equals(actual, arrow_table({"A": expected_column}))
Example #2
Source File: base.py From fletcher with MIT License | 6 votes |
def __eq__(self, other) -> bool:
    """Check whether 'other' is equal to self.

    'other' is considered equal if

    * it's a string matching 'self.name', or
    * it's an instance of this type whose 'arrow_dtype' equals ours.

    Parameters
    ----------
    other : Any

    Returns
    -------
    bool
    """
    # Strings are compared against the dtype's name.
    if isinstance(other, str):
        return other == self.name
    # Same-typed objects are compared by their underlying arrow dtype.
    if isinstance(other, type(self)):
        return self.arrow_dtype == other.arrow_dtype
    return False
Example #3
Source File: test_numba_integration.py From fletcher with MIT License | 6 votes |
def test_string_builder_simple(data):
    """Feed strings byte by byte into the builder and compare buffers with pyarrow's."""
    builder = NumbaStringArrayBuilder(2, 6)
    for value in data:
        if value is not None:
            for char in value:
                builder.put_byte(ord(char))
            builder.finish_string()
        else:
            builder.finish_null()
    builder.finish()

    # Reference buffers come from a pyarrow array built from the same input.
    expected = pa.array(data, pa.string())
    _missing, expected_offsets, expected_data = buffers_as_arrays(expected)

    np.testing.assert_array_equal(builder.offsets, expected_offsets)
    np.testing.assert_array_equal(builder.data, expected_data)
Example #4
Source File: types.py From LearningApacheSpark with MIT License | 6 votes |
def __init__(self, name, dataType, nullable=True, metadata=None):
    """
    Store the field's name, data type, nullability and metadata.

    >>> (StructField("f1", StringType(), True)
    ...      == StructField("f1", StringType(), True))
    True
    >>> (StructField("f1", StringType(), True)
    ...      == StructField("f2", StringType(), True))
    False
    """
    assert isinstance(dataType, DataType), (
        "dataType %s should be an instance of %s" % (dataType, DataType)
    )
    assert isinstance(name, basestring), "field name %s should be string" % (name)
    # Names that are not `str` are encoded to UTF-8 before being stored.
    if isinstance(name, str):
        self.name = name
    else:
        self.name = name.encode('utf-8')
    self.dataType = dataType
    self.nullable = nullable
    # A falsy metadata argument (None, empty dict) is replaced by a fresh dict.
    self.metadata = metadata if metadata else {}
Example #5
Source File: test_text.py From fletcher with MIT License | 6 votes |
def test_text_zfill(data, fletcher_variant):
    """Check that ``fr_text.zfill`` gives the same result as ``Series.str.zfill``."""
    contains_nul = any("\x00" in x for x in data if x)
    if contains_nul:
        # pandas cannot handle \x00 characters in tests. pytest.skip is not
        # working properly with hypothesis, so simply return instead.
        return

    ser_pd = pd.Series(data, dtype=str)
    max_str_len = ser_pd.map(_optional_len).max()
    if pd.isna(max_str_len):
        max_str_len = 0

    arrow_data = pa.array(data, type=pa.string())
    array_cls = (
        fr.FletcherChunkedArray
        if fletcher_variant == "chunked"
        else fr.FletcherContinuousArray
    )
    ser_fr = pd.Series(array_cls(arrow_data))

    result_pd = ser_pd.str.zfill(max_str_len + 1)
    result_fr = ser_fr.fr_text.zfill(max_str_len + 1).astype(object)
    # Pandas returns np.nan for NA values in cat, keep this in line
    result_fr[result_fr.isna()] = np.nan
    tm.assert_series_equal(result_fr, result_pd)
Example #6
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):
    """A schema passed as a sequence of dicts yields the matching arrow fields."""
    dict_schema = [
        {"name": "field01", "type": "STRING", "mode": "REQUIRED"},
        {"name": "field02", "type": "BOOL", "mode": "NULLABLE"},
    ]
    columns = {"field01": [u"hello", u"world"], "field02": [True, False]}
    dataframe = pandas.DataFrame(columns)

    arrow_schema = module_under_test.dataframe_to_arrow(dataframe, dict_schema).schema

    # REQUIRED maps to a non-nullable field, NULLABLE to a nullable one.
    expected_fields = [
        pyarrow.field("field01", "string", nullable=False),
        pyarrow.field("field02", "bool", nullable=True),
    ]
    assert list(arrow_schema) == expected_fields
Example #7
Source File: string.py From fletcher with MIT License | 6 votes |
def _text_cat(a: pa.Array, b: pa.Array) -> pa.Array:
    """Merge the string data of two equally long arrays into a new string array.

    Raises ValueError when the lengths differ. For empty input, ``a`` itself is
    returned. Otherwise the validity bitmap, offsets and data buffers are filled
    in by ``_merge_valid_bitmaps`` / ``_merge_string_data`` and wrapped in a
    ``pa.string()`` array.
    """
    length = len(a)
    if length != len(b):
        raise ValueError("Lengths of arrays don't match")

    offsets_a, data_a = _extract_string_buffers(a)
    offsets_b, data_b = _extract_string_buffers(b)

    if not length > 0:
        return a

    valid = _merge_valid_bitmaps(a, b)

    # One offset per element plus the leading zero.
    result_offsets = np.empty(length + 1, dtype=np.int32)
    result_offsets[0] = 0

    # The output needs room for the referenced bytes of both inputs.
    size_a = offsets_a[-1] - offsets_a[0]
    size_b = offsets_b[-1] - offsets_b[0]
    result_data = np.empty(size_a + size_b, dtype=np.uint8)

    _merge_string_data(
        length,
        valid,
        offsets_a,
        data_a,
        offsets_b,
        data_b,
        result_offsets,
        result_data,
    )

    buffers = [pa.py_buffer(x) for x in (valid, result_offsets, result_data)]
    return pa.Array.from_buffers(pa.string(), length, buffers)
Example #8
Source File: parquet_util.py From professional-services with Apache License 2.0 | 6 votes |
def get_pa_translated_schema(self):
    """Translates a BigQuery schema to a parquet schema.

    Every field in ``self.bq_schema`` is looked up by its ``field_type`` in a
    fixed conversion table; a type missing from the table raises ``KeyError``.

    Returns:
        Translated parquet schema in pyarrow.Schema format.
    """
    type_conversions = {
        'STRING': pa.string(),
        'NUMERIC': pa.int64(),
    }

    # TODO(annarudy@google.com): add support for nested fields
    pa_fields = []
    for bq_field in self.bq_schema:
        pa_type = type_conversions[bq_field.field_type]
        pa_fields.append(pa.field(bq_field.name, pa_type))

    return pa.schema(pa_fields)
Example #9
Source File: test_db.py From aws-data-wrangler with Apache License 2.0 | 6 votes |
def test_redshift_spectrum_long_string(path, glue_table, glue_database, redshift_external_schema):
    """Write two 300-character strings to a dataset and read them back via Spectrum."""
    long_strings = [
        "".join(random.choice(string.ascii_letters) for _ in range(300))
        for _ in range(2)
    ]
    df = pd.DataFrame({"id": [1, 2], "col_str": long_strings})

    write_result = wr.s3.to_parquet(
        df=df,
        path=path,
        database=glue_database,
        table=glue_table,
        mode="overwrite",
        index=False,
        dataset=True,
    )
    paths = write_result["paths"]
    wr.s3.wait_objects_exist(paths=paths, use_threads=False)

    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    with engine.connect() as con:
        cursor = con.execute(f"SELECT * FROM {redshift_external_schema}.{glue_table}")
        rows = cursor.fetchall()
        # Row and column counts must match the frame that was written.
        assert len(rows) == len(df.index)
        for row in rows:
            assert len(row) == len(df.columns)
Example #10
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_num_bytes_getter(self):
    """``num_bytes`` is None when unset, parses ints and numeric strings, rejects junk."""
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table = self._make_one(dataset.table(self.TABLE_NAME))

    # Check with no value set.
    self.assertIsNone(table.num_bytes)

    num_bytes = 1337
    # Check with integer value set, then with a string value set.
    for raw_value in (num_bytes, str(num_bytes)):
        table._properties = {"numBytes": raw_value}
        self.assertEqual(table.num_bytes, num_bytes)

    # Check with invalid int value.
    table._properties = {"numBytes": "x"}
    with self.assertRaises(ValueError):
        getattr(table, "num_bytes")
Example #11
Source File: dataset.py From kartothek with MIT License | 6 votes |
def load_from_buffer(
    buf, store: KeyValueStore, format: str = "json"
) -> "DatasetMetadata":
    """
    Load a dataset from a (string) buffer.

    Parameters
    ----------
    buf:
        Input to be parsed.
    store:
        Object that implements the .get method for file/object loading.
    format:
        Serialization format of ``buf``; either ``"json"`` or ``"msgpack"``.

    Returns
    -------
    dataset_metadata:
        Parsed metadata.

    Raises
    ------
    ValueError
        If ``format`` is neither ``"json"`` nor ``"msgpack"``.
    """
    if format == "json":
        metadata = load_json(buf)
    elif format == "msgpack":
        metadata = msgpack.unpackb(buf)
    else:
        # Without this branch an unknown format fell through and failed with
        # an UnboundLocalError on `metadata` below.
        raise ValueError(
            "Unknown format {!r}; expected 'json' or 'msgpack'".format(format)
        )
    return DatasetMetadata.load_from_dict(metadata, store)
Example #12
Source File: test_numba_integration.py From fletcher with MIT License | 5 votes |
def test_str_length(array, expected, offset):
    """``str_length`` on an array sliced at ``offset`` matches the sliced expectation."""
    sliced = pa.array(array, pa.string())[offset:]
    actual = str_length(NumbaStringArray.make(sliced))  # type: ignore
    wanted = np.asarray(expected[offset:], dtype=np.int32)
    np.testing.assert_array_equal(actual, wanted)
Example #13
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def coerce(cls, value: mtypes.Message) -> I18nMessage:
    """
    Convert an internationalized message as returned from modules to an object of this dataclass.

    Accepted inputs:
    - a ``str``, which is passed to ``cls.TODO_i18n``;
    - a tuple ``(id, arguments)`` or ``(id, arguments, source)``, where ``id``
      is a ``str``, ``arguments`` is a ``dict`` and ``source`` is one of
      ``"module"``, ``"cjwmodule"``, ``"cjwparse"`` or ``None``.

    Raises:
    - ValueError, if the value is a tuple of the wrong length, has elements of
      the wrong type, has an invalid source, or is of a non-supported type
    """
    if isinstance(value, str):
        return cls.TODO_i18n(value)
    elif isinstance(value, tuple):
        if len(value) < 2 or len(value) > 3:
            # `value` is itself a tuple: wrap it in a 1-tuple so that `%`
            # formats it as one argument instead of unpacking it (which
            # raised TypeError for tuples of length 0 or 4+).
            raise ValueError(
                "This tuple cannot be coerced to I18nMessage: %s" % (value,)
            )
        if not isinstance(value[0], str):
            raise ValueError(
                "Message ID must be string, got %s" % type(value[0]).__name__
            )
        if not isinstance(value[1], dict):
            raise ValueError(
                "Message arguments must be a dict, got %s" % type(value[1]).__name__
            )
        if len(value) == 3:
            source = value[2]
            if source not in ["module", "cjwmodule", "cjwparse", None]:
                raise ValueError("Invalid i18n message source %r" % source)
        else:
            source = None
        return cls(value[0], value[1], source)
    else:
        raise ValueError(
            "%s is of type %s, which cannot be coerced to I18nMessage"
            % (value, type(value).__name__)
        )
Example #14
Source File: test_types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def test_dataframe_all_null_text_column(self):
    """An all-null text column becomes an arrow string array holding one null."""
    actual = dataframe_to_arrow_table(
        pd.DataFrame({"A": [None]}, dtype=str),
        [Column("A", ColumnType.TEXT())],
        self.path,
    )
    expected = arrow_table({"A": pyarrow.array([None], pyarrow.string())})
    assert_arrow_table_equals(actual, expected)
Example #15
Source File: test_types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def test_coerce_from_string(self):
    """Coercing a plain string yields a ``TODO_i18n`` message wrapping the text."""
    coerced = I18nMessage.coerce("some string")
    expected = I18nMessage("TODO_i18n", {"text": "some string"})
    self.assertEqual(coerced, expected)
Example #16
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    """Return the pyarrow type used for a numpy dtype.

    Integer and float dtypes map to the pyarrow type of the same name,
    datetime dtypes (kind "M") map to a tz-less nanosecond timestamp and the
    object dtype maps to string. Anything else raises RuntimeError.
    """
    # numpy scalar types and pyarrow factory functions share these names;
    # they are tried in the same order as a chain of equality checks would.
    same_named_types = (
        "int8",
        "int16",
        "int32",
        "int64",
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "float16",
        "float32",
        "float64",
    )
    for type_name in same_named_types:
        if dtype == getattr(np, type_name):
            return getattr(pyarrow, type_name)()

    if dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)

    if dtype == np.object_:
        return pyarrow.string()

    raise RuntimeError("Unhandled dtype %r" % dtype)
Example #17
Source File: parquet_pio.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def _get_numba_typ_from_pa_typ(pa_typ):
    """Look up the numba type that corresponds to an Arrow data type.

    Raises ValueError for Arrow types missing from the lookup table.
    """
    import pyarrow as pa

    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
    }
    # time (TODO: time32, time64, ...)
    # Every timestamp unit is read as a nanosecond datetime.
    for unit in ('ns', 'us', 'ms', 's'):
        _typ_map[pa.timestamp(unit)] = types.NPDatetime('ns')

    if pa_typ in _typ_map:
        return _typ_map[pa_typ]
    raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
Example #18
Source File: test_types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def test_from_string(self):
    """Coercing a plain string wraps it in a ``TODO_i18n`` message."""
    coerced = ProcessResultError.coerce("some string")
    expected = ProcessResultError(I18nMessage.TODO_i18n("some string"))
    self.assertEqual(coerced, expected)
Example #19
Source File: csv_ext.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def to_varname(string):
    """Turn ``string`` into a valid Python variable name.

    Each non-word character is replaced by ``_``, and a ``_`` is inserted in
    front of the string when it starts with a digit.
    """
    import re

    # `\W` matches a non-word character; `^(?=\d)` matches the empty position
    # right before a leading digit.
    invalid_spot = re.compile(r'\W|^(?=\d)')
    return invalid_spot.sub('_', string)
Example #20
Source File: parquet_pio.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def generic(self, args, kws):
    """Build the call signature: exactly four positional args, no keywords.

    The return type is ``string_array_type`` when the third argument is
    ``types.intp`` and ``types.int64`` otherwise.
    """
    assert not kws
    assert len(args) == 4
    # string read call, returns string array
    # array_ty = types.Array(ndim=1, layout='C', dtype=args[2])
    return_type = string_array_type if args[2] == types.intp else types.int64
    return signature(return_type, *unliteral_all(args))
Example #21
Source File: base.py From fletcher with MIT License | 5 votes |
def __str__(self) -> str:
    """Return ``fletcher_chunked[<arrow_dtype>]`` for this dtype."""
    return "fletcher_chunked[{}]".format(self.arrow_dtype)
Example #22
Source File: benchmarks.py From fletcher with MIT License | 5 votes |
def setup(self):
    """Create a plain and a fletcher-backed DataFrame from the same null-free strings."""
    strings = generate_test_array_non_null(2 ** 17)
    self.df = pd.DataFrame({"str": strings})

    arrow_strings = pa.array(strings, pa.string())
    self.df_ext = pd.DataFrame({"str": fr.FletcherChunkedArray(arrow_strings)})
Example #23
Source File: test_pandas_integration.py From fletcher with MIT License | 5 votes |
def test_concatenate_blocks():
    """Concatenating a fletcher-backed Series with itself repeats its values."""
    chunked = fr.FletcherChunkedArray(TEST_ARRAY)
    series = pd.Series(chunked, index=pd.RangeIndex(3), fastpath=True)

    result = pd.concat([series, series], ignore_index=True)

    doubled = pa.array(["Test", "string", None, "Test", "string", None])
    expected = pd.Series(fr.FletcherChunkedArray(doubled))
    tm.assert_series_equal(result, expected)


# ----------------------------------------------------------------------------
# Public Constructors
# ----------------------------------------------------------------------------
Example #24
Source File: benchmarks.py From fletcher with MIT License | 5 votes |
def setup(self):
    """Create a plain and a fletcher-backed DataFrame from the same test strings."""
    strings = generate_test_array(2 ** 17)
    self.df = pd.DataFrame({"str": strings})

    arrow_strings = pa.array(strings, pa.string())
    self.df_ext = pd.DataFrame({"str": fr.FletcherChunkedArray(arrow_strings)})
Example #25
Source File: test_text.py From fletcher with MIT License | 5 votes |
def _fr_series_from_data(data, fletcher_variant, dtype=pa.string()):
    """Wrap ``data`` in a pandas Series backed by the requested fletcher array.

    ``fletcher_variant == "chunked"`` selects ``FletcherChunkedArray``; any
    other value selects ``FletcherContinuousArray``.
    """
    arrow_data = pa.array(data, type=dtype)
    array_cls = (
        fr.FletcherChunkedArray
        if fletcher_variant == "chunked"
        else fr.FletcherContinuousArray
    )
    return pd.Series(array_cls(arrow_data))
Example #26
Source File: benchmarks.py From fletcher with MIT License | 5 votes |
def setup(self, chunked, value, indices):
    """Prepare the indexer, the value to assign and both DataFrames for a setitem run.

    NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
    source line; confirm against the original file.
    """
    # assert np.isscalar(values) or len(values) == len(indices)
    array = generate_test_array(self.n)

    # Pick the indexer kind under benchmark.
    if indices == "int":
        if value == "array_value":
            raise NotImplementedError()
        self.indexer = 50
    elif indices == "int_array":
        self.indexer = list(range(0, self.n, 5))
    elif indices == "bool_array":
        self.indexer = np.zeros(self.n, dtype=bool)
        self.indexer[list(range(0, self.n, 5))] = True
    elif indices == "slice":
        self.indexer = slice(0, self.n, 5)

    # Pick the value that will be assigned.
    if value == "scalar_value":
        self.value = "setitem"
    elif value == "array_value":
        self.value = [str(x) for x in range(self.n)]
        self.value = np.array(self.value)[self.indexer]
        # A single selected element is unwrapped to a scalar.
        if len(self.value) == 1:
            self.value = self.value[0]

    self.df = pd.DataFrame({"str": array})

    # Either split the data into 1000 chunks or keep it as a single chunk.
    chunks = np.array_split(array, 1000) if chunked else [array]
    arrow_chunks = [pa.array(chunk, pa.string()) for chunk in chunks]
    self.df_ext = pd.DataFrame(
        {"str": fr.FletcherChunkedArray(pa.chunked_array(arrow_chunks))}
    )
Example #27
Source File: test_pandas_integration.py From fletcher with MIT License | 5 votes |
def test_nbytes():
    """``nbytes`` reports at least the minimal storage needed for the array."""
    array = fr.FletcherChunkedArray(pa.array(["A", None, "CC"]))

    # Minimal storage usage:
    #   1 byte for the valid bitmap
    #   4 bytes for the offset array
    #   3 bytes for the actual string content
    minimal_nbytes = 1 + 4 + 3
    assert array.nbytes >= minimal_nbytes
Example #28
Source File: test_pandas_integration.py From fletcher with MIT License | 5 votes |
def test_getitem_scalar():
    """Indexing a fletcher-backed Series with an integer returns the plain value."""
    series = pd.Series(fr.FletcherChunkedArray(TEST_ARRAY))
    second = series[1]
    assert second == "string"
Example #29
Source File: test_io.py From fletcher with MIT License | 5 votes |
def test_read_parquet(tmpdir, continuous):
    """Round-trip a small table through parquet and compare with the direct conversion.

    The table has a string, an int32 and a bool column, each containing a
    null. The result of ``fr.read_parquet`` must equal what
    ``fr.pandas_from_arrow`` produces from the in-memory table.
    """
    str_arr = pa.array(["a", None, "c"], pa.string())
    int_arr = pa.array([1, None, -2], pa.int32())
    bool_arr = pa.array([True, None, False], pa.bool_())
    table = pa.Table.from_arrays([str_arr, int_arr, bool_arr], ["str", "int", "bool"])

    # Write into pytest's per-test temporary directory; the file used to be
    # created in the current working directory and left behind.
    parquet_path = str(tmpdir.join("df.parquet"))
    pq.write_table(table, parquet_path)

    result = fr.read_parquet(parquet_path, continuous=continuous)
    expected = fr.pandas_from_arrow(table, continuous=continuous)
    tm.assert_frame_equal(result, expected)
Example #30
Source File: test_base.py From fletcher with MIT License | 5 votes |
def array_inhom_chunks():
    """Return a FletcherChunkedArray built from three string chunks of lengths 3, 5 and 1."""
    chunks = [
        pa.array(list(text), pa.string())
        for text in ("abc", "12345", "Z")
    ]
    return fr.FletcherChunkedArray(pa.chunked_array(chunks))