From 54e29f65e755b60bf1187524baee558c4865192d Mon Sep 17 00:00:00 2001 From: Alexander Oberegger Date: Wed, 3 Nov 2021 19:52:55 +0100 Subject: [PATCH 001/616] add static partition assignment possibilities --- faust/transport/drivers/aiokafka.py | 2 +- requirements/requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/transport/drivers/aiokafka.py b/faust/transport/drivers/aiokafka.py index f2f244db9..b38505854 100644 --- a/faust/transport/drivers/aiokafka.py +++ b/faust/transport/drivers/aiokafka.py @@ -499,7 +499,7 @@ def _create_worker_consumer( api_version=conf.consumer_api_version, client_id=conf.broker_client_id, group_id=conf.id, - # group_instance_id=conf.consumer_group_instance_id, + group_instance_id=conf.consumer_group_instance_id, bootstrap_servers=server_list(transport.url, transport.default_port), partition_assignment_strategy=[self._assignor], enable_auto_commit=False, diff --git a/requirements/requirements.txt b/requirements/requirements.txt index dfe8382e5..2676ca3b8 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,6 +1,7 @@ aiohttp>=3.5.2,<4.0 aiohttp_cors>=0.7,<2.0 -aiokafka>=0.7.1,<0.8.0 +aiokafka @ git+https://github.com/smaxtec/aiokafka-1@master#egg=aiokafka +#aiokafka>=0.7.1,<0.8.0 click>=6.7,<8.0 colorclass>=2.2,<3.0 mode-streaming==0.1.0 From ae479f4d82c63ba66ed778faa245d74b28fd5c77 Mon Sep 17 00:00:00 2001 From: Alexander Oberegger Date: Wed, 3 Nov 2021 20:12:20 +0100 Subject: [PATCH 002/616] add global partitions as actives as well --- faust/assignor/partition_assignor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/faust/assignor/partition_assignor.py b/faust/assignor/partition_assignor.py index f5e6922f7..9e00af477 100644 --- a/faust/assignor/partition_assignor.py +++ b/faust/assignor/partition_assignor.py @@ -306,6 +306,9 @@ def _global_table_standby_assignments( # Only add those partitions as standby which aren't active standby_partitions = all_partitions - active_partitions assignment.standbys[changelog_topic_name] = list(standby_partitions) + # We add all_partitions as active so they are recovered + # in the beginning. + assignment.actives[changelog_topic_name] = list(all_partitions) return assignments def _protocol_assignments( From da5f06cda8069e6dadf421454ec65381b7fedb6b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 8 Sep 2022 15:07:10 +0200 Subject: [PATCH 003/616] first version of bigtable store --- faust/stores/__init__.py | 1 + faust/stores/bigtable.py | 219 +++++++++++++++++++++++++++++++ requirements/extras/bigtable.txt | 1 + 3 files changed, 221 insertions(+) create mode 100644 faust/stores/bigtable.py create mode 100644 requirements/extras/bigtable.txt diff --git a/faust/stores/__init__.py b/faust/stores/__init__.py index 3f536427b..2bc207034 100644 --- a/faust/stores/__init__.py +++ b/faust/stores/__init__.py @@ -11,6 +11,7 @@ memory="faust.stores.memory:Store", rocksdb="faust.stores.rocksdb:Store", aerospike="faust.stores.aerospike:AeroSpikeStore", + bigtable="faust.stores.bigtable:BigTableStore", ) STORES.include_setuptools_namespace("faust.stores") by_name = STORES.by_name diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py new file mode 100644 index 000000000..7675b69e5 --- /dev/null +++ b/faust/stores/bigtable.py @@ -0,0 +1,219 @@ +"""BigTable storage.""" +import typing +from typing import Any, Iterator, Optional, Tuple, Union + +from google.cloud.bigtable import column_family +from google.cloud.bigtable.client import Client +from google.cloud.bigtable.instance import Instance +from google.cloud.bigtable.row_filters import CellsColumnLimitFilter +from google.cloud.bigtable.table import Table +from yarl import URL + +from faust.stores import base +from faust.types import TP, AppT, CollectionT + +bigtable = None + +class BigTableStore(base.SerializedStore): + """Bigtable table storage.""" + + client: Client + instance: Instance + table: Table + PROJECT_KEY = "project_key" + INSTANCE_KEY = "instance_key" + + def __init__( + self, + url: Union[str, URL], + app: AppT, + table: CollectionT, + options: Optional[typing.Dict[str, Any]] = None, + **kwargs: Any, + ) -> None: + try: + self.client = Client( + options.get(BigTableStore.PROJECT_KEY), + admin=True, + ) + self.instance = self.client.instance( + options.get(BigTableStore.INSTANCE_KEY) + ) + + existing_tables = [t.table_id for t in self.instance.list_tables()] + table_exists = self.table_name in existing_tables + self.table = self.instance.table(self.table_name) + if not table_exists: + self.table.create() + else: + self.table = self.instance.table(self.table_name) + existing_cf = [ + cf.column_family_id for cf in self.table.list_column_families() + ] + + column_family_id = "FaustColumnFamily" + cf_exists = column_family_id in existing_cf + self.column_family = self.table.column_family( + column_family_id, + gc_rule=column_family.MaxVersionsGCRule(1), + ) + if not cf_exists: + self.column_family.create() + self.column_name = "DATA" + + table.use_partitioner = True + except Exception as ex: + self.logger.error(f"Error configuring bigtable client {ex}") + raise ex + super().__init__(url, app, table, **kwargs) + + def bigtable_extract_row_data(self, row_data): + return list(row_data.to_dict().values())[0][0].value + + def _get(self, key: bytes) -> Optional[bytes]: + filter = CellsColumnLimitFilter(1) + try: + res = self.table.read_row(key, filter_=filter) + if res is None: + raise KeyError(f"row {key} not found in bigtable {self.table=}") + return self.bigtable_extract_row_data(res) + except ValueError as ex: + self.log.debug(f"key not found {key} exception {ex}") + raise KeyError(f"key not found {key}") + except Exception as ex: + self.log.error( + f"Error in get for table {self.table_name} exception {ex} key {key}" + ) + raise ex + + def _set(self, key: bytes, value: Optional[bytes]) -> None: + try: + row = self.table.direct_row(key) + row.set_cell(self.column_family.column_family_id, self.column_name, value) + row.commit() + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in set for " + f"table {self.table_name} exception {ex} key {key}" + ) + raise ex + + def _del(self, key: bytes) -> None: + try: + row = self.table.direct_row(key) + row.delete() + row.commit() + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in delete for " + f"table {self.table_name} exception {ex} key {key}" + ) + raise ex + + def _iterkeys(self) -> Iterator[bytes]: + try: + for key, val in self.table.scan(): + yield key + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in _iterkeys " + f"for table {self.table_name} exception {ex}" + ) + raise ex + + def _itervalues(self) -> Iterator[bytes]: + try: + for key, val in self.table.scan(): + yield self.bigtable_extract_row_data(val) + except Exception as ex: + self.log.error( + f"FaustBigtableException Error " + f"in _itervalues for table {self.table_name}" + f" exception {ex}" + ) + raise ex + + def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: + try: + for key, val in self.table.scan(): + yield key, self.bigtable_extract_row_data(val) + except Exception as ex: + self.log.error( + f"FaustBigtableException Error " + f"in _iteritems for table {self.table_name}" + f" exception {ex}" + ) + raise ex + + def _size(self) -> int: + """Always returns 0 for Bigtable.""" + return 0 + + def _contains(self, key: bytes) -> bool: + try: + for k in self._iterkeys(): + if k == key: + return True + return False + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in _contains for table " + f"{self.table_name} exception " + f"{ex} key {key}" + ) + raise ex + + def _clear(self) -> None: + """This is typically used to clear data. + + This does nothing when using the Bigtable store. + + """ + ... + + def reset_state(self) -> None: + """Remove system state. + + This does nothing when using the Bigtable store. + + """ + ... + + def persisted_offset(self, tp: TP) -> Optional[int]: + """Return the persisted offset. + + This always returns :const:`None` when using the bigtable store. + """ + return None + + async def backup_partition( + self, + tp: Union[TP, int], + flush: bool = True, + purge: bool = False, + keep: int = 1, + ) -> None: + """Backup partition from this store. + + Not yet implemented for Bigtable. + + """ + raise NotImplementedError("Not yet implemented for Bigtable.") + + def restore_backup( + self, tp: Union[TP, int], latest: bool = True, backup_id: int = 0 + ) -> None: + """Restore partition backup from this store. + + Not yet implemented for Bigtable. + + """ + raise NotImplementedError("Not yet implemented for Bigtable.") + + +if __name__ == "__main__": + options = { + BigTableStoreTest.PROJECT_KEY: "dev-smaxtec-system", + BigTableStoreTest.INSTANCE_KEY: "faust-store-test", + } + store = BigTableStoreTest(options) diff --git a/requirements/extras/bigtable.txt b/requirements/extras/bigtable.txt new file mode 100644 index 000000000..47acfcb87 --- /dev/null +++ b/requirements/extras/bigtable.txt @@ -0,0 +1 @@ +google-cloud-bigtable From 207dff5e3cf6d07be2a76cdc0b349c0bbe0afb0f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 8 Sep 2022 15:36:50 +0200 Subject: [PATCH 004/616] added bigtable bundle --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 0e17eeed9..1758be272 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ BUNDLES = { "aiodns", "aiomonitor", + "bigtable", "cchardet", "ciso8601", "cython", From 67b9299ad905f3aacc9181c96160d2d7dfcf8a91 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 8 Sep 2022 16:10:32 +0200 Subject: [PATCH 005/616] added new line --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7675b69e5..a022d7d25 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -14,6 +14,7 @@ bigtable = None + class BigTableStore(base.SerializedStore): """Bigtable table storage.""" From 80c9619bb48125487894fb142c5de54cf5822aa1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 12 Sep 2022 13:45:18 +0200 Subject: [PATCH 006/616] make get and set with static table and add prefix for all keys --- faust/stores/bigtable.py | 60 ++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a022d7d25..419ebe68c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,4 +1,5 @@ """BigTable storage.""" +import logging import typing from typing import Any, Iterator, Optional, Tuple, Union @@ -23,16 +24,20 @@ class BigTableStore(base.SerializedStore): table: Table PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" + TABLE_NAME_KEY = "table_name_key" def __init__( self, url: Union[str, URL], app: AppT, table: CollectionT, - options: Optional[typing.Dict[str, Any]] = None, + options: typing.Dict[str, Any], **kwargs: Any, ) -> None: try: + logging.getLogger(__name__).error( + f"BigTableStore: Making bigtable with {self.table_name=}" + ) self.client = Client( options.get(BigTableStore.PROJECT_KEY), admin=True, @@ -43,11 +48,12 @@ def __init__( existing_tables = [t.table_id for t in self.instance.list_tables()] table_exists = self.table_name in existing_tables - self.table = self.instance.table(self.table_name) + self.bt_table_prefix = options.get(BigTableStore.TABLE_NAME_KEY) + self.table = self.instance.table(self.bt_table_prefix) if not table_exists: self.table.create() else: - self.table = self.instance.table(self.table_name) + self.table = self.instance.table(self.bt_table_prefix) existing_cf = [ cf.column_family_id for cf in self.table.list_column_families() ] @@ -71,10 +77,18 @@ def __init__( def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value + def get_bigtable_key(self, key: bytes) -> bytes: + + return bytes(f"{self.table_name}_{key.decode("utf-8")}") + + def get_access_key(self, bt_key: bytes) -> bytes: + return bytes(bt_key.decode("utf-8").removeprefix(f"{self.table_name}_")) + def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) try: - res = self.table.read_row(key, filter_=filter) + bt_key = self.get_bigtable_key(key) + res = self.table.read_row(bt_key, filter_=filter) if res is None: raise KeyError(f"row {key} not found in bigtable {self.table=}") return self.bigtable_extract_row_data(res) @@ -89,7 +103,8 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - row = self.table.direct_row(key) + bt_key = self.get_bigtable_key(key) + row = self.table.direct_row(bt_key) row.set_cell(self.column_family.column_family_id, self.column_name, value) row.commit() except Exception as ex: @@ -101,6 +116,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: + bt_key = self.get_bigtable_key(key) row = self.table.direct_row(key) row.delete() row.commit() @@ -114,7 +130,7 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: for key, val in self.table.scan(): - yield key + yield self.get_access_(key) except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -137,7 +153,7 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: for key, val in self.table.scan(): - yield key, self.bigtable_extract_row_data(val) + yield self.get_access_(key), self.bigtable_extract_row_data(val) except Exception as ex: self.log.error( f"FaustBigtableException Error " @@ -212,9 +228,35 @@ def restore_backup( raise NotImplementedError("Not yet implemented for Bigtable.") +class BigTableStoreTest(BigTableStore): + def __init__( + self, + options: Optional[typing.Dict[str, Any]] = None, + ) -> None: + try: + self.client = Client(options.get(BigTableStore.PROJECT_KEY)) + self.instance = self.client.instance( + options.get(BigTableStore.INSTANCE_KEY) + ) + + self.table = self.instance.table(self.bt_table_prefix) + column_family_id = "FaustColumnFamily" + self.column_family = self.table.column_family( + column_family_id, + gc_rule=column_family.MaxVersionsGCRule(1), + ) + self.column_name = "DATA" + + except Exception as ex: + self.logger.error(f"Error configuring bigtable client {ex}") + raise ex + + if __name__ == "__main__": options = { - BigTableStoreTest.PROJECT_KEY: "dev-smaxtec-system", - BigTableStoreTest.INSTANCE_KEY: "faust-store-test", + BigTableStoreTest.PROJECT_KEY: "smaxtec-system", + BigTableStoreTest.INSTANCE_KEY: "faust-cache-test", + BigTableStoreTest.TABLE_NAME_KEY: "sxfaust_cache", } store = BigTableStoreTest(options) + pass From fba8c4f562daa46921c9da2e9791dfd72d40f6c6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 12 Sep 2022 13:49:47 +0200 Subject: [PATCH 007/616] changed variable name --- faust/stores/bigtable.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 419ebe68c..98ac9e51d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -36,7 +36,7 @@ def __init__( ) -> None: try: logging.getLogger(__name__).error( - f"BigTableStore: Making bigtable with {self.table_name=}" + f"BigTableStore: Making bigtablestore with {self.table_name=}" ) self.client = Client( options.get(BigTableStore.PROJECT_KEY), @@ -48,12 +48,12 @@ def __init__( existing_tables = [t.table_id for t in self.instance.list_tables()] table_exists = self.table_name in existing_tables - self.bt_table_prefix = options.get(BigTableStore.TABLE_NAME_KEY) - self.table = self.instance.table(self.bt_table_prefix) + self.bt_table_name = options.get(BigTableStore.TABLE_NAME_KEY) + self.table = self.instance.table(self.bt_table_name) if not table_exists: self.table.create() else: - self.table = self.instance.table(self.bt_table_prefix) + self.table = self.instance.table(self.bt_table_name) existing_cf = [ cf.column_family_id for cf in self.table.list_column_families() ] @@ -239,7 +239,7 @@ def __init__( options.get(BigTableStore.INSTANCE_KEY) ) - self.table = self.instance.table(self.bt_table_prefix) + self.table = self.instance.table(self.bt_table_name) column_family_id = "FaustColumnFamily" self.column_family = self.table.column_family( column_family_id, From 667c15e3a9512126545ca7abe3b46aa5a8e5fe8c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 12 Sep 2022 16:10:20 +0200 Subject: [PATCH 008/616] fixed wrong syntax --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 98ac9e51d..741aa1708 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -78,8 +78,8 @@ def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value def get_bigtable_key(self, key: bytes) -> bytes: - - return bytes(f"{self.table_name}_{key.decode("utf-8")}") + decoded_key = key.decode('utf-8') + return bytes(f"{self.table_name}_{decoded_key}") def get_access_key(self, bt_key: bytes) -> bytes: return bytes(bt_key.decode("utf-8").removeprefix(f"{self.table_name}_")) From 3404e64bfb20eef819346b234aa9c9cef861564e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 06:31:15 +0200 Subject: [PATCH 009/616] added table_name variabel --- faust/stores/bigtable.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 741aa1708..1f17ba5e9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -34,38 +34,25 @@ def __init__( options: typing.Dict[str, Any], **kwargs: Any, ) -> None: + self.table_name = table.name try: logging.getLogger(__name__).error( f"BigTableStore: Making bigtablestore with {self.table_name=}" ) self.client = Client( options.get(BigTableStore.PROJECT_KEY), - admin=True, ) self.instance = self.client.instance( options.get(BigTableStore.INSTANCE_KEY) ) - existing_tables = [t.table_id for t in self.instance.list_tables()] - table_exists = self.table_name in existing_tables self.bt_table_name = options.get(BigTableStore.TABLE_NAME_KEY) self.table = self.instance.table(self.bt_table_name) - if not table_exists: - self.table.create() - else: - self.table = self.instance.table(self.bt_table_name) - existing_cf = [ - cf.column_family_id for cf in self.table.list_column_families() - ] - column_family_id = "FaustColumnFamily" - cf_exists = column_family_id in existing_cf self.column_family = self.table.column_family( column_family_id, gc_rule=column_family.MaxVersionsGCRule(1), ) - if not cf_exists: - self.column_family.create() self.column_name = "DATA" table.use_partitioner = True From 14ee6fb272426afb8a5147f6614ef7b0fed33650 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 06:59:11 +0200 Subject: [PATCH 010/616] added encoding --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1f17ba5e9..499828a14 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -66,10 +66,10 @@ def bigtable_extract_row_data(self, row_data): def get_bigtable_key(self, key: bytes) -> bytes: decoded_key = key.decode('utf-8') - return bytes(f"{self.table_name}_{decoded_key}") + return bytes(f"{self.table_name}_{decoded_key}", encoding="utf-8") def get_access_key(self, bt_key: bytes) -> bytes: - return bytes(bt_key.decode("utf-8").removeprefix(f"{self.table_name}_")) + return bytes(bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8") def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) From 77fb397fef14cb25df00939a4f1b84699c34c709 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 06:59:39 +0200 Subject: [PATCH 011/616] added right key to delete --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 499828a14..ab9d40167 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -104,7 +104,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: bt_key = self.get_bigtable_key(key) - row = self.table.direct_row(key) + row = self.table.direct_row(bt_key) row.delete() row.commit() except Exception as ex: From 41896c8406b3e458272fb36657b5de166b5fb958 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 07:18:02 +0200 Subject: [PATCH 012/616] rename overwritten variable --- faust/stores/bigtable.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ab9d40167..e6322f306 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -47,9 +47,9 @@ def __init__( ) self.bt_table_name = options.get(BigTableStore.TABLE_NAME_KEY) - self.table = self.instance.table(self.bt_table_name) + self.bt_table = self.instance.table(self.bt_table_name) column_family_id = "FaustColumnFamily" - self.column_family = self.table.column_family( + self.column_family = self.bt_table.column_family( column_family_id, gc_rule=column_family.MaxVersionsGCRule(1), ) @@ -65,17 +65,19 @@ def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value def get_bigtable_key(self, key: bytes) -> bytes: - decoded_key = key.decode('utf-8') + decoded_key = key.decode("utf-8") return bytes(f"{self.table_name}_{decoded_key}", encoding="utf-8") def get_access_key(self, bt_key: bytes) -> bytes: - return bytes(bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8") + return bytes( + bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8" + ) def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) try: bt_key = self.get_bigtable_key(key) - res = self.table.read_row(bt_key, filter_=filter) + res = self.bt_table.read_row(bt_key, filter_=filter) if res is None: raise KeyError(f"row {key} not found in bigtable {self.table=}") return self.bigtable_extract_row_data(res) @@ -91,7 +93,7 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: bt_key = self.get_bigtable_key(key) - row = self.table.direct_row(bt_key) + row = self.bt_table.direct_row(bt_key) row.set_cell(self.column_family.column_family_id, self.column_name, value) row.commit() except Exception as ex: @@ -104,7 +106,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: bt_key = self.get_bigtable_key(key) - row = self.table.direct_row(bt_key) + row = self.bt_table.direct_row(bt_key) row.delete() row.commit() except Exception as ex: @@ -116,7 +118,7 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - for key, val in self.table.scan(): + for key, val in self.bt_table.scan(): yield self.get_access_(key) except Exception as ex: self.log.error( @@ -127,7 +129,7 @@ def _iterkeys(self) -> Iterator[bytes]: def _itervalues(self) -> Iterator[bytes]: try: - for key, val in self.table.scan(): + for key, val in self.bt_table.scan(): yield self.bigtable_extract_row_data(val) except Exception as ex: self.log.error( @@ -139,7 +141,7 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - for key, val in self.table.scan(): + for key, val in self.bt_table.scan(): yield self.get_access_(key), self.bigtable_extract_row_data(val) except Exception as ex: self.log.error( @@ -226,9 +228,9 @@ def __init__( options.get(BigTableStore.INSTANCE_KEY) ) - self.table = self.instance.table(self.bt_table_name) + self.bt_table = self.instance.table(self.bt_table_name) column_family_id = "FaustColumnFamily" - self.column_family = self.table.column_family( + self.column_family = self.bt_table.column_family( column_family_id, gc_rule=column_family.MaxVersionsGCRule(1), ) From 730499a6dcab4c921aacdba702a069a7004322e1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 12:30:21 +0200 Subject: [PATCH 013/616] WIP. scan --- faust/stores/bigtable.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e6322f306..016b6807d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -73,11 +73,27 @@ def get_access_key(self, bt_key: bytes) -> bytes: bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8" ) + def get_scan_range_for_sxkeys(self, key: bytes) -> Tuple[bytes, bytes]: + key_id = key.decode("utf-8") + key_id = key_id.split("_")[1] + key_start_str = key_id + key_end_str = key_id + last_char = key_end_str[-1] + key_end_str = key_end_str[:-1] + key_end_str += chr(ord(last_char) + 1) + + key_start: bytes = f"{self.table_name}_{key_start_str}".encode("utf-8") + key_end: bytes = f"{self.table_name}_{key_end_str}".encode("utf-8") + return key_start, key_end + def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) try: bt_key = self.get_bigtable_key(key) - res = self.bt_table.read_row(bt_key, filter_=filter) + start_key, end_key = self.get_scan_range_for_sxkeys(bt_key) + res = self.bt_table.read_row( + bt_key, start_key=start_key, end_key=end_key, filter_=filter + ) if res is None: raise KeyError(f"row {key} not found in bigtable {self.table=}") return self.bigtable_extract_row_data(res) @@ -118,8 +134,8 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - for key, val in self.bt_table.scan(): - yield self.get_access_(key) + for row in self._iteritems(): + yield row[0] except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -129,8 +145,8 @@ def _iterkeys(self) -> Iterator[bytes]: def _itervalues(self) -> Iterator[bytes]: try: - for key, val in self.bt_table.scan(): - yield self.bigtable_extract_row_data(val) + for row in self._iteritems(): + yield row[1] except Exception as ex: self.log.error( f"FaustBigtableException Error " @@ -141,8 +157,9 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - for key, val in self.bt_table.scan(): - yield self.get_access_(key), self.bigtable_extract_row_data(val) + table_prefix = f"{self.table_name}".encode("utf-8") + for row in self.bt_table.read_rows(start_key=table_prefix, end_key=table_prefix): + yield self.get_access_key(row.row_key), self.bigtable_extract_row_data(row) except Exception as ex: self.log.error( f"FaustBigtableException Error " From b358bb2df3eb3482525e01eb766bf8c1db052558 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 13:53:47 +0200 Subject: [PATCH 014/616] added faster scanning --- faust/stores/bigtable.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 016b6807d..8a6a0bb82 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -73,26 +73,21 @@ def get_access_key(self, bt_key: bytes) -> bytes: bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8" ) - def get_scan_range_for_sxkeys(self, key: bytes) -> Tuple[bytes, bytes]: - key_id = key.decode("utf-8") - key_id = key_id.split("_")[1] - key_start_str = key_id - key_end_str = key_id - last_char = key_end_str[-1] - key_end_str = key_end_str[:-1] - key_end_str += chr(ord(last_char) + 1) - - key_start: bytes = f"{self.table_name}_{key_start_str}".encode("utf-8") - key_end: bytes = f"{self.table_name}_{key_end_str}".encode("utf-8") - return key_start, key_end + def sx_get_key_prefix(self, key: bytes) -> bytes: + key_id = key.decode("utf-8").split("_")[1] + return f"{self.table_name}_{key_id}".encode("utf-8") def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) try: bt_key = self.get_bigtable_key(key) - start_key, end_key = self.get_scan_range_for_sxkeys(bt_key) + key_prefix = self.sx_get_key_prefix(bt_key) res = self.bt_table.read_row( - bt_key, start_key=start_key, end_key=end_key, filter_=filter + bt_key, + start_key=key_prefix, + end_key=key_prefix, + end_inclusive=True, + filter_=filter, ) if res is None: raise KeyError(f"row {key} not found in bigtable {self.table=}") @@ -158,8 +153,12 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: table_prefix = f"{self.table_name}".encode("utf-8") - for row in self.bt_table.read_rows(start_key=table_prefix, end_key=table_prefix): - yield self.get_access_key(row.row_key), self.bigtable_extract_row_data(row) + for row in self.bt_table.read_rows( + start_key=table_prefix, end_key=table_prefix, end_inclusive=True + ): + yield self.get_access_key(row.row_key), self.bigtable_extract_row_data( + row + ) except Exception as ex: self.log.error( f"FaustBigtableException Error " From 31f290a21d5ff6bde30c7a6d6a14697921fac431 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 13:54:55 +0200 Subject: [PATCH 015/616] formatting --- faust/stores/bigtable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8a6a0bb82..523ca93fc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -156,8 +156,9 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: for row in self.bt_table.read_rows( start_key=table_prefix, end_key=table_prefix, end_inclusive=True ): - yield self.get_access_key(row.row_key), self.bigtable_extract_row_data( - row + yield ( + self.get_access_key(row.row_key), + self.bigtable_extract_row_data(row), ) except Exception as ex: self.log.error( From 06ff1aead055494d5df9f8bdd8baf4c0a6bb3586 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 14:04:06 +0200 Subject: [PATCH 016/616] corrected typing --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 523ca93fc..5dbde66a3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -21,7 +21,7 @@ class BigTableStore(base.SerializedStore): client: Client instance: Instance - table: Table + bt_table: Table PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" TABLE_NAME_KEY = "table_name_key" From 8ac8c49d81b111a3df5ff8d3e3bea4fc888ce3ce Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 16:04:25 +0200 Subject: [PATCH 017/616] added logging and removed some stuff --- faust/stores/bigtable.py | 90 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5dbde66a3..4fd70087c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -13,8 +13,6 @@ from faust.stores import base from faust.types import TP, AppT, CollectionT -bigtable = None - class BigTableStore(base.SerializedStore): """Bigtable table storage.""" @@ -57,7 +55,7 @@ def __init__( table.use_partitioner = True except Exception as ex: - self.logger.error(f"Error configuring bigtable client {ex}") + self.log.error(f"Error configuring bigtable client {ex}") raise ex super().__init__(url, app, table, **kwargs) @@ -89,7 +87,13 @@ def _get(self, key: bytes) -> Optional[bytes]: end_inclusive=True, filter_=filter, ) + self.log.info( + f"[Bigtable]: _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) if res is None: + self.log.warning( + f"[Bigtable] KeyError in _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) raise KeyError(f"row {key} not found in bigtable {self.table=}") return self.bigtable_extract_row_data(res) except ValueError as ex: @@ -107,6 +111,9 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: row = self.bt_table.direct_row(bt_key) row.set_cell(self.column_family.column_family_id, self.column_name, value) row.commit() + self.log.info( + f"[Bigtable]: _set with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -120,6 +127,9 @@ def _del(self, key: bytes) -> None: row = self.bt_table.direct_row(bt_key) row.delete() row.commit() + self.log.info( + f"[Bigtable]: _del with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " @@ -254,7 +264,79 @@ def __init__( self.column_name = "DATA" except Exception as ex: - self.logger.error(f"Error configuring bigtable client {ex}") + logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") + raise ex + + def bigtable_extract_row_data(self, row_data): + return list(row_data.to_dict().values())[0][0].value + + def get_bigtable_key(self, key: bytes) -> bytes: + decoded_key = key.decode("utf-8") + return bytes(f"{self.table_name}_{decoded_key}", encoding="utf-8") + + def get_access_key(self, bt_key: bytes) -> bytes: + return bytes( + bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8" + ) + + def sx_get_key_prefix(self, key: bytes) -> bytes: + key_id = key.decode("utf-8").split("_")[1] + return f"{self.table_name}_{key_id}".encode("utf-8") + + def _get(self, key: bytes) -> Optional[bytes]: + filter = CellsColumnLimitFilter(1) + try: + bt_key = self.get_bigtable_key(key) + key_prefix = self.sx_get_key_prefix(bt_key) + res = self.bt_table.read_row( + bt_key, + start_key=key_prefix, + end_key=key_prefix, + end_inclusive=True, + filter_=filter, + ) + if res is None: + raise KeyError(f"row {key} not found in bigtable {self.table=}") + return self.bigtable_extract_row_data(res) + except ValueError as ex: + self.log.debug(f"key not found {key} exception {ex}") + raise KeyError(f"key not found {key}") + except Exception as ex: + self.log.error( + f"Error in get for table {self.table_name} exception {ex} key {key}" + ) + raise ex + + def _set(self, key: bytes, value: Optional[bytes]) -> None: + try: + bt_key = self.get_bigtable_key(key) + row = self.bt_table.direct_row(bt_key) + self.log.info( + f"[Bigtable]: _set with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) + row.set_cell(self.column_family.column_family_id, self.column_name, value) + row.commit() + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in set for " + f"table {self.table_name} exception {ex} key {key}" + ) + raise ex + + def _del(self, key: bytes) -> None: + try: + bt_key = self.get_bigtable_key(key) + row = self.bt_table.direct_row(bt_key) + self.log.info( + f"[Bigtable]: _del with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) + row.delete() + row.commit() + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in delete for " + f"table {self.table_name} exception {ex} key {key}" + ) raise ex From 374cffc9e4f3b136530d6ad461160685bc8e6379 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 16:32:26 +0200 Subject: [PATCH 018/616] fixed wrong reads --- faust/stores/bigtable.py | 94 +++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4fd70087c..f8565daac 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -64,27 +64,17 @@ def bigtable_extract_row_data(self, row_data): def get_bigtable_key(self, key: bytes) -> bytes: decoded_key = key.decode("utf-8") - return bytes(f"{self.table_name}_{decoded_key}", encoding="utf-8") + return f"{self.table_name}_{decoded_key}".encode("utf-8") def get_access_key(self, bt_key: bytes) -> bytes: - return bytes( - bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8" - ) - - def sx_get_key_prefix(self, key: bytes) -> bytes: - key_id = key.decode("utf-8").split("_")[1] - return f"{self.table_name}_{key_id}".encode("utf-8") + return bt_key.decode("utf-8").removeprefix(f"{self.table_name}_").encode("utf-8") def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) try: bt_key = self.get_bigtable_key(key) - key_prefix = self.sx_get_key_prefix(bt_key) res = self.bt_table.read_row( bt_key, - start_key=key_prefix, - end_key=key_prefix, - end_inclusive=True, filter_=filter, ) self.log.info( @@ -162,9 +152,11 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - table_prefix = f"{self.table_name}".encode("utf-8") + end_key_str = self.table_name[:-1] + chr(ord(self.table_name[-1]) + 1) + end_key = end_key_str.encode("utf-8") + start_key = self.table_name.encode("utf-8") for row in self.bt_table.read_rows( - start_key=table_prefix, end_key=table_prefix, end_inclusive=True + start_key=start_key, end_key=end_key ): yield ( self.get_access_key(row.row_key), @@ -250,11 +242,13 @@ def __init__( options: Optional[typing.Dict[str, Any]] = None, ) -> None: try: + self.table_name = "TESTMEPLS" self.client = Client(options.get(BigTableStore.PROJECT_KEY)) self.instance = self.client.instance( options.get(BigTableStore.INSTANCE_KEY) ) + self.bt_table_name = options.get(BigTableStore.TABLE_NAME_KEY) self.bt_table = self.instance.table(self.bt_table_name) column_family_id = "FaustColumnFamily" self.column_family = self.bt_table.column_family( @@ -279,30 +273,28 @@ def get_access_key(self, bt_key: bytes) -> bytes: bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8" ) - def sx_get_key_prefix(self, key: bytes) -> bytes: - key_id = key.decode("utf-8").split("_")[1] - return f"{self.table_name}_{key_id}".encode("utf-8") - def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) try: bt_key = self.get_bigtable_key(key) - key_prefix = self.sx_get_key_prefix(bt_key) res = self.bt_table.read_row( bt_key, - start_key=key_prefix, - end_key=key_prefix, - end_inclusive=True, filter_=filter, ) + print( + f"[Bigtable]: _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) if res is None: + print( + f"[Bigtable] KeyError in _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) raise KeyError(f"row {key} not found in bigtable {self.table=}") return self.bigtable_extract_row_data(res) except ValueError as ex: - self.log.debug(f"key not found {key} exception {ex}") + print(f"key not found {key} exception {ex}") raise KeyError(f"key not found {key}") except Exception as ex: - self.log.error( + print( f"Error in get for table {self.table_name} exception {ex} key {key}" ) raise ex @@ -311,11 +303,11 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: try: bt_key = self.get_bigtable_key(key) row = self.bt_table.direct_row(bt_key) - self.log.info( - f"[Bigtable]: _set with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) row.set_cell(self.column_family.column_family_id, self.column_name, value) row.commit() + print( + f"[Bigtable]: _set with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + ) except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -327,18 +319,61 @@ def _del(self, key: bytes) -> None: try: bt_key = self.get_bigtable_key(key) row = self.bt_table.direct_row(bt_key) - self.log.info( + print( f"[Bigtable]: _del with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" ) row.delete() row.commit() except Exception as ex: - self.log.error( + print( f"FaustBigtableException Error in delete for " f"table {self.table_name} exception {ex} key {key}" ) raise ex + def _iterkeys(self) -> Iterator[bytes]: + try: + for row in self._iteritems(): + yield row[0] + except Exception as ex: + print( + f"FaustBigtableException Error in _iterkeys " + f"for table {self.table_name} exception {ex}" + ) + raise ex + + def _itervalues(self) -> Iterator[bytes]: + try: + for row in self._iteritems(): + yield row[1] + except Exception as ex: + print( + f"FaustBigtableException Error " + f"in _itervalues for table {self.table_name}" + f" exception {ex}" + ) + raise ex + + def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: + try: + end_key_str = self.table_name[:-1] + chr(ord(self.table_name[-1]) + 1) + end_key = end_key_str.encode("utf-8") + start_key = self.table_name.encode("utf-8") + for row in self.bt_table.read_rows( + start_key=start_key, end_key=end_key + ): + yield ( + self.get_access_key(row.row_key), + self.bigtable_extract_row_data(row), + ) + except Exception as ex: + print( + f"FaustBigtableException Error " + f"in _iteritems for table {self.table_name}" + f" exception {ex}" + ) + raise ex + if __name__ == "__main__": options = { @@ -346,5 +381,6 @@ def _del(self, key: bytes) -> None: BigTableStoreTest.INSTANCE_KEY: "faust-cache-test", BigTableStoreTest.TABLE_NAME_KEY: "sxfaust_cache", } + key = "aaaa_123_bbbb".encode("utf-8") store = BigTableStoreTest(options) pass From d9d96c7b9debd186d96056d52c22e327bc5b0397 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Sep 2022 16:57:57 +0200 Subject: [PATCH 019/616] fixed wrong decoded key --- faust/stores/bigtable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f8565daac..69dc39888 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -78,7 +78,7 @@ def _get(self, key: bytes) -> Optional[bytes]: filter_=filter, ) self.log.info( - f"[Bigtable]: _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + f"[Bigtable]: _get with {key=} (={key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" ) if res is None: self.log.warning( @@ -102,7 +102,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: row.set_cell(self.column_family.column_family_id, self.column_name, value) row.commit() self.log.info( - f"[Bigtable]: _set with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + f"[Bigtable]: _set with {key=} (={key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" ) except Exception as ex: self.log.error( @@ -118,7 +118,7 @@ def _del(self, key: bytes) -> None: row.delete() row.commit() self.log.info( - f"[Bigtable]: _del with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" + f"[Bigtable]: _del with {key=} (={key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" ) except Exception as ex: self.log.error( From d9b55f63188f628e9c7b0018d70aa9ffac671443 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Sep 2022 07:40:03 +0200 Subject: [PATCH 020/616] removed remove prefix as it is only available in python 3.9 --- faust/stores/bigtable.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 69dc39888..b551be723 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -269,9 +269,11 @@ def get_bigtable_key(self, key: bytes) -> bytes: return bytes(f"{self.table_name}_{decoded_key}", encoding="utf-8") def get_access_key(self, bt_key: bytes) -> bytes: - return bytes( - bt_key.decode("utf-8").removeprefix(f"{self.table_name}_"), encoding="utf-8" - ) + prefix = f"{self.table_name}_" + bt_key_str = bt_key.decode("utf-8") + if bt_key_str.startswith(prefix): + bt_key_str = bt_key_str[len(prefix)]:] + return bt_key_str.encode("utf-8") def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) From 258bda8aabdbac8953de5ecf1b83e8911fc97e65 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Sep 2022 07:41:26 +0200 Subject: [PATCH 021/616] changed encoding of get_bigtable_key --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b551be723..aec5f5da0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -266,7 +266,7 @@ def bigtable_extract_row_data(self, row_data): def get_bigtable_key(self, key: bytes) -> bytes: decoded_key = key.decode("utf-8") - return bytes(f"{self.table_name}_{decoded_key}", encoding="utf-8") + return f"{self.table_name}_{decoded_key}".encode("utf-8") def get_access_key(self, bt_key: bytes) -> bytes: prefix = f"{self.table_name}_" From 94aab0e893be172120c9ce59a3bcb2ae254ad6cd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Sep 2022 08:06:23 +0200 Subject: [PATCH 022/616] fixed wrong syntax --- faust/stores/bigtable.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index aec5f5da0..7b25b708c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -67,7 +67,9 @@ def get_bigtable_key(self, key: bytes) -> bytes: return f"{self.table_name}_{decoded_key}".encode("utf-8") def get_access_key(self, bt_key: bytes) -> bytes: - return bt_key.decode("utf-8").removeprefix(f"{self.table_name}_").encode("utf-8") + return ( + bt_key.decode("utf-8").removeprefix(f"{self.table_name}_").encode("utf-8") + ) def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) @@ -155,9 +157,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: end_key_str = self.table_name[:-1] + chr(ord(self.table_name[-1]) + 1) end_key = end_key_str.encode("utf-8") start_key = self.table_name.encode("utf-8") - for row in self.bt_table.read_rows( - start_key=start_key, end_key=end_key - ): + for row in self.bt_table.read_rows(start_key=start_key, end_key=end_key): yield ( self.get_access_key(row.row_key), self.bigtable_extract_row_data(row), @@ -272,7 +272,7 @@ def get_access_key(self, bt_key: bytes) -> bytes: prefix = f"{self.table_name}_" bt_key_str = bt_key.decode("utf-8") if bt_key_str.startswith(prefix): - bt_key_str = bt_key_str[len(prefix)]:] + bt_key_str = bt_key_str[len(prefix) :] return bt_key_str.encode("utf-8") def _get(self, key: bytes) -> Optional[bytes]: @@ -296,9 +296,7 @@ def _get(self, key: bytes) -> Optional[bytes]: print(f"key not found {key} exception {ex}") raise KeyError(f"key not found {key}") except Exception as ex: - print( - f"Error in get for table {self.table_name} exception {ex} key {key}" - ) + print(f"Error in get for table {self.table_name} exception {ex} key {key}") raise ex def _set(self, key: bytes, value: Optional[bytes]) -> None: @@ -361,9 +359,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: end_key_str = self.table_name[:-1] + chr(ord(self.table_name[-1]) + 1) end_key = end_key_str.encode("utf-8") start_key = self.table_name.encode("utf-8") - for row in self.bt_table.read_rows( - start_key=start_key, end_key=end_key - ): + for row in self.bt_table.read_rows(start_key=start_key, end_key=end_key): yield ( self.get_access_key(row.row_key), self.bigtable_extract_row_data(row), From 9a68744dfa990dcac10f507ae090990dc8da0611 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Sep 2022 11:40:02 +0200 Subject: [PATCH 023/616] added faster reading of changelogs --- faust/stores/bigtable.py | 113 ++++++++++++++++++++++++++++++++------- 1 file changed, 93 insertions(+), 20 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7b25b708c..ad5bc7e7f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,13 +1,14 @@ """BigTable storage.""" import logging import typing -from typing import Any, Iterator, Optional, Tuple, Union +from typing import Any, Iterator, Optional, Tuple, Union, List, Dict from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.table import Table + from yarl import URL from faust.stores import base @@ -33,6 +34,7 @@ def __init__( **kwargs: Any, ) -> None: self.table_name = table.name + self.offset_key_prefix = f"{self.table_name}_offsets:".encode() try: logging.getLogger(__name__).error( f"BigTableStore: Making bigtablestore with {self.table_name=}" @@ -63,13 +65,15 @@ def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value def get_bigtable_key(self, key: bytes) -> bytes: - decoded_key = key.decode("utf-8") - return f"{self.table_name}_{decoded_key}".encode("utf-8") + decoded_key = key.decode() + return f"{self.table_name}_{decoded_key}".encode() def get_access_key(self, bt_key: bytes) -> bytes: - return ( - bt_key.decode("utf-8").removeprefix(f"{self.table_name}_").encode("utf-8") - ) + prefix = f"{self.table_name}_" + bt_key_str = bt_key.decode() + if bt_key_str.startswith(prefix): + bt_key_str = bt_key_str[len(prefix) :] + return bt_key_str.encode() def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) @@ -155,8 +159,8 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: end_key_str = self.table_name[:-1] + chr(ord(self.table_name[-1]) + 1) - end_key = end_key_str.encode("utf-8") - start_key = self.table_name.encode("utf-8") + end_key = end_key_str.encode() + start_key = self.table_name.encode() for row in self.bt_table.read_rows(start_key=start_key, end_key=end_key): yield ( self.get_access_key(row.row_key), @@ -204,12 +208,35 @@ def reset_state(self) -> None: """ ... - def persisted_offset(self, tp: TP) -> Optional[int]: - """Return the persisted offset. - - This always returns :const:`None` when using the bigtable store. + def get_offset_key(self, tp: TP): + return self.offset_key_prefix + str(tp.partition).encode() + + # def persisted_offset(self, tp: TP) -> Optional[int]: + # """Return the last persisted offset. +# + # See :meth:`set_persisted_offset`. + # """ + # offset_key = self.offset_key_prefix + str(tp.partition).encode() + # try: + # offset = self._get(offset_key) + # except KeyError: + # offset = None + # pass + # if offset is not None: + # return int(offset) + # return None + + def set_persisted_offset(self, tp: TP, offset: int) -> None: + """Set the last persisted offset for this table. + + This will remember the last offset that we wrote to RocksDB, + so that on rebalance/recovery we can seek past this point + to only read the events that occurred recently while + we were not an active replica. """ - return None + offset_key = self.get_offset_key(tp) + pass + # self._set(offset_key, str(offset).encode()) async def backup_partition( self, @@ -235,6 +262,49 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") + def apply_changelog_batch( + self, + batch: Iterable[EventT], + to_key: Callable[[Any], Any], + to_value: Callable[[Any], Any], + ) -> None: + """Write batch of changelog events to local RocksDB storage. + + Arguments: + batch: Iterable of changelog events (:class:`faust.Event`) + to_key: A callable you can use to deserialize the key + of a changelog event. + to_value: A callable you can use to deserialize the value + of a changelog event. + """ + tp_offsets: Dict[TP, int] = {} + row_mutations = [] + for event in batch: + tp, offset = event.message.tp, event.message.offset + tp_offsets[tp] = ( + offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) + ) + msg = event.message + + bt_key = self.get_bigtable_key(msg.key) + row = self.bt_table.direct_row(bt_key) + if msg.value is None: + row.delete() + else: + row.set_cell( + self.column_family.column_family_id, + self.column_name, + msg.value, + ) + row_mutations.append(row) + response = self.bt_table.mutate_rows(row_mutations) + for i, status in enumerate(response): + if status.code != 0: + self.log.error("Row number {} failed to write".format(i)) + + for tp, offset in tp_offsets.items(): + self.set_persisted_offset(tp, offset) + class BigTableStoreTest(BigTableStore): def __init__( @@ -265,15 +335,15 @@ def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value def get_bigtable_key(self, key: bytes) -> bytes: - decoded_key = key.decode("utf-8") - return f"{self.table_name}_{decoded_key}".encode("utf-8") + decoded_key = key.decode() + return f"{self.table_name}_{decoded_key}".encode() def get_access_key(self, bt_key: bytes) -> bytes: prefix = f"{self.table_name}_" - bt_key_str = bt_key.decode("utf-8") + bt_key_str = bt_key.decode() if bt_key_str.startswith(prefix): bt_key_str = bt_key_str[len(prefix) :] - return bt_key_str.encode("utf-8") + return bt_key_str.encode() def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) @@ -357,8 +427,8 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: end_key_str = self.table_name[:-1] + chr(ord(self.table_name[-1]) + 1) - end_key = end_key_str.encode("utf-8") - start_key = self.table_name.encode("utf-8") + end_key = end_key_str.encode() + start_key = self.table_name.encode() for row in self.bt_table.read_rows(start_key=start_key, end_key=end_key): yield ( self.get_access_key(row.row_key), @@ -379,6 +449,9 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: BigTableStoreTest.INSTANCE_KEY: "faust-cache-test", BigTableStoreTest.TABLE_NAME_KEY: "sxfaust_cache", } - key = "aaaa_123_bbbb".encode("utf-8") + key = "aaaa_123_bbbb".encode() store = BigTableStoreTest(options) + bt_key = store.get_bigtable_key(key) + key_later = store.get_access_key(bt_key) + assert key == key_later pass From 2d855419f6e51f6516181316ebba2d7a84cdb137 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Sep 2022 13:41:49 +0200 Subject: [PATCH 024/616] fixed imports --- faust/stores/bigtable.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ad5bc7e7f..d1c97e0dd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,7 +1,7 @@ """BigTable storage.""" import logging import typing -from typing import Any, Iterator, Optional, Tuple, Union, List, Dict +from typing import Any, Callable, Iterable, Iterator, Optional, Tuple, Union, List, Dict from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -12,7 +12,7 @@ from yarl import URL from faust.stores import base -from faust.types import TP, AppT, CollectionT +from faust.types import TP, AppT, CollectionT, EventT class BigTableStore(base.SerializedStore): @@ -212,19 +212,19 @@ def get_offset_key(self, tp: TP): return self.offset_key_prefix + str(tp.partition).encode() # def persisted_offset(self, tp: TP) -> Optional[int]: - # """Return the last persisted offset. -# - # See :meth:`set_persisted_offset`. - # """ - # offset_key = self.offset_key_prefix + str(tp.partition).encode() - # try: - # offset = self._get(offset_key) - # except KeyError: - # offset = None - # pass - # if offset is not None: - # return int(offset) - # return None + # """Return the last persisted offset. + # + # See :meth:`set_persisted_offset`. + # """ + # offset_key = self.offset_key_prefix + str(tp.partition).encode() + # try: + # offset = self._get(offset_key) + # except KeyError: + # offset = None + # pass + # if offset is not None: + # return int(offset) + # return None def set_persisted_offset(self, tp: TP, offset: int) -> None: """Set the last persisted offset for this table. From 21d10eecfd23135c0e8927e7f02ba28fa9dd3f5a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Sep 2022 13:57:30 +0200 Subject: [PATCH 025/616] added persisted offset method again --- faust/stores/bigtable.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d1c97e0dd..6fbc66181 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -211,20 +211,19 @@ def reset_state(self) -> None: def get_offset_key(self, tp: TP): return self.offset_key_prefix + str(tp.partition).encode() - # def persisted_offset(self, tp: TP) -> Optional[int]: - # """Return the last persisted offset. - # - # See :meth:`set_persisted_offset`. - # """ - # offset_key = self.offset_key_prefix + str(tp.partition).encode() - # try: - # offset = self._get(offset_key) - # except KeyError: - # offset = None - # pass - # if offset is not None: - # return int(offset) - # return None + def persisted_offset(self, tp: TP) -> Optional[int]: + """Return the last persisted offset. + See :meth:`set_persisted_offset`. + """ + offset_key = self.offset_key_prefix + str(tp.partition).encode() + try: + offset = self._get(offset_key) + except KeyError: + offset = None + pass + if offset is not None: + return int(offset) + return None def set_persisted_offset(self, tp: TP, offset: int) -> None: """Set the last persisted offset for this table. From c2ae3982f36f35907c3b67f1073a90c6575c48c7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Sep 2022 14:59:25 +0200 Subject: [PATCH 026/616] moved changelog persistation to its own function --- faust/stores/bigtable.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6fbc66181..cb892a1d7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -220,7 +220,6 @@ def persisted_offset(self, tp: TP) -> Optional[int]: offset = self._get(offset_key) except KeyError: offset = None - pass if offset is not None: return int(offset) return None @@ -234,8 +233,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: we were not an active replica. """ offset_key = self.get_offset_key(tp) - pass - # self._set(offset_key, str(offset).encode()) + self._set(offset_key, str(offset).encode()) async def backup_partition( self, @@ -261,6 +259,15 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") + def _persist_changelog_batch(self, row_mutations, tp_offsets): + response = self.bt_table.mutate_rows(row_mutations) + for i, status in enumerate(response): + if status.code != 0: + self.log.error("Row number {} failed to write".format(i)) + + for tp, offset in tp_offsets.items(): + self.set_persisted_offset(tp, offset) + def apply_changelog_batch( self, batch: Iterable[EventT], @@ -296,14 +303,10 @@ def apply_changelog_batch( msg.value, ) row_mutations.append(row) - response = self.bt_table.mutate_rows(row_mutations) - for i, status in enumerate(response): - if status.code != 0: - self.log.error("Row number {} failed to write".format(i)) - - for tp, offset in tp_offsets.items(): - self.set_persisted_offset(tp, offset) - + self._persist_changelog_batch( + row_mutations, + tp_offsets, + ) class BigTableStoreTest(BigTableStore): def __init__( From cb5e6b6b952121589466ae322f6d925b713a5214 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Sep 2022 15:42:50 +0200 Subject: [PATCH 027/616] removed logs --- faust/stores/bigtable.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index cb892a1d7..517ccdebc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -83,9 +83,6 @@ def _get(self, key: bytes) -> Optional[bytes]: bt_key, filter_=filter, ) - self.log.info( - f"[Bigtable]: _get with {key=} (={key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) if res is None: self.log.warning( f"[Bigtable] KeyError in _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" @@ -107,9 +104,6 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: row = self.bt_table.direct_row(bt_key) row.set_cell(self.column_family.column_family_id, self.column_name, value) row.commit() - self.log.info( - f"[Bigtable]: _set with {key=} (={key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -123,9 +117,6 @@ def _del(self, key: bytes) -> None: row = self.bt_table.direct_row(bt_key) row.delete() row.commit() - self.log.info( - f"[Bigtable]: _del with {key=} (={key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " From 86291d536f825ecaadf3716aea3b29f823cf96e1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 10:51:38 +0200 Subject: [PATCH 028/616] new version with more tables and better naming --- faust/stores/bigtable.py | 231 ++++++--------------------------------- 1 file changed, 36 insertions(+), 195 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 517ccdebc..734dbe342 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,14 +1,13 @@ """BigTable storage.""" import logging import typing -from typing import Any, Callable, Iterable, Iterator, Optional, Tuple, Union, List, Dict +from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.table import Table - from yarl import URL from faust.stores import base @@ -23,7 +22,6 @@ class BigTableStore(base.SerializedStore): bt_table: Table PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" - TABLE_NAME_KEY = "table_name_key" def __init__( self, @@ -33,26 +31,35 @@ def __init__( options: typing.Dict[str, Any], **kwargs: Any, ) -> None: - self.table_name = table.name - self.offset_key_prefix = f"{self.table_name}_offsets:".encode() + self.offset_key_prefix = "offsets:".encode() try: logging.getLogger(__name__).error( f"BigTableStore: Making bigtablestore with {self.table_name=}" ) - self.client = Client( + self.client: Client = Client( options.get(BigTableStore.PROJECT_KEY), + admin=True, ) - self.instance = self.client.instance( + self.instance: Instance = self.client.instance( options.get(BigTableStore.INSTANCE_KEY) ) - self.bt_table_name = options.get(BigTableStore.TABLE_NAME_KEY) - self.bt_table = self.instance.table(self.bt_table_name) + self.bt_table: Table = self.instance.table( + table.changelog_topic.get_topic_name() + ) + if not self.bt_table.exists(): + self.bt_table.create() + column_family_id = "FaustColumnFamily" - self.column_family = self.bt_table.column_family( - column_family_id, - gc_rule=column_family.MaxVersionsGCRule(1), + self.column_family: column_family.ColumnFamily = ( + self.bt_table.column_family( + column_family_id, + gc_rule=column_family.MaxVersionsGCRule(1), + ) ) + column_families = list(self.bt_table.list_column_families().keys()) + if self.column_family not in column_families: + self.column_family.create() self.column_name = "DATA" table.use_partitioner = True @@ -64,29 +71,15 @@ def __init__( def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value - def get_bigtable_key(self, key: bytes) -> bytes: - decoded_key = key.decode() - return f"{self.table_name}_{decoded_key}".encode() - - def get_access_key(self, bt_key: bytes) -> bytes: - prefix = f"{self.table_name}_" - bt_key_str = bt_key.decode() - if bt_key_str.startswith(prefix): - bt_key_str = bt_key_str[len(prefix) :] - return bt_key_str.encode() - def _get(self, key: bytes) -> Optional[bytes]: filter = CellsColumnLimitFilter(1) try: - bt_key = self.get_bigtable_key(key) res = self.bt_table.read_row( - bt_key, + key, filter_=filter, ) if res is None: - self.log.warning( - f"[Bigtable] KeyError in _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) + self.log.warning(f"[Bigtable] KeyError in _get with {key=}") raise KeyError(f"row {key} not found in bigtable {self.table=}") return self.bigtable_extract_row_data(res) except ValueError as ex: @@ -100,9 +93,12 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - bt_key = self.get_bigtable_key(key) - row = self.bt_table.direct_row(bt_key) - row.set_cell(self.column_family.column_family_id, self.column_name, value) + row = self.bt_table.direct_row(key) + row.set_cell( + self.column_family.column_family_id, + self.column_name, + value, + ) row.commit() except Exception as ex: self.log.error( @@ -113,8 +109,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: - bt_key = self.get_bigtable_key(key) - row = self.bt_table.direct_row(bt_key) + row = self.bt_table.direct_row(key) row.delete() row.commit() except Exception as ex: @@ -149,12 +144,9 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - end_key_str = self.table_name[:-1] + chr(ord(self.table_name[-1]) + 1) - end_key = end_key_str.encode() - start_key = self.table_name.encode() - for row in self.bt_table.read_rows(start_key=start_key, end_key=end_key): + for row in self.bt_table.read_rows(): yield ( - self.get_access_key(row.row_key), + row.row_key, self.bigtable_extract_row_data(row), ) except Exception as ex: @@ -171,10 +163,11 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - for k in self._iterkeys(): - if k == key: - return True - return False + res = self.bt_table.read_row( + key, + filter_=filter, + ) + return res is not None except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " @@ -282,9 +275,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - - bt_key = self.get_bigtable_key(msg.key) - row = self.bt_table.direct_row(bt_key) + row = self.bt_table.direct_row(msg.key) if msg.value is None: row.delete() else: @@ -298,153 +289,3 @@ def apply_changelog_batch( row_mutations, tp_offsets, ) - -class BigTableStoreTest(BigTableStore): - def __init__( - self, - options: Optional[typing.Dict[str, Any]] = None, - ) -> None: - try: - self.table_name = "TESTMEPLS" - self.client = Client(options.get(BigTableStore.PROJECT_KEY)) - self.instance = self.client.instance( - options.get(BigTableStore.INSTANCE_KEY) - ) - - self.bt_table_name = options.get(BigTableStore.TABLE_NAME_KEY) - self.bt_table = self.instance.table(self.bt_table_name) - column_family_id = "FaustColumnFamily" - self.column_family = self.bt_table.column_family( - column_family_id, - gc_rule=column_family.MaxVersionsGCRule(1), - ) - self.column_name = "DATA" - - except Exception as ex: - logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") - raise ex - - def bigtable_extract_row_data(self, row_data): - return list(row_data.to_dict().values())[0][0].value - - def get_bigtable_key(self, key: bytes) -> bytes: - decoded_key = key.decode() - return f"{self.table_name}_{decoded_key}".encode() - - def get_access_key(self, bt_key: bytes) -> bytes: - prefix = f"{self.table_name}_" - bt_key_str = bt_key.decode() - if bt_key_str.startswith(prefix): - bt_key_str = bt_key_str[len(prefix) :] - return bt_key_str.encode() - - def _get(self, key: bytes) -> Optional[bytes]: - filter = CellsColumnLimitFilter(1) - try: - bt_key = self.get_bigtable_key(key) - res = self.bt_table.read_row( - bt_key, - filter_=filter, - ) - print( - f"[Bigtable]: _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) - if res is None: - print( - f"[Bigtable] KeyError in _get with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) - raise KeyError(f"row {key} not found in bigtable {self.table=}") - return self.bigtable_extract_row_data(res) - except ValueError as ex: - print(f"key not found {key} exception {ex}") - raise KeyError(f"key not found {key}") - except Exception as ex: - print(f"Error in get for table {self.table_name} exception {ex} key {key}") - raise ex - - def _set(self, key: bytes, value: Optional[bytes]) -> None: - try: - bt_key = self.get_bigtable_key(key) - row = self.bt_table.direct_row(bt_key) - row.set_cell(self.column_family.column_family_id, self.column_name, value) - row.commit() - print( - f"[Bigtable]: _set with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) - except Exception as ex: - self.log.error( - f"FaustBigtableException Error in set for " - f"table {self.table_name} exception {ex} key {key}" - ) - raise ex - - def _del(self, key: bytes) -> None: - try: - bt_key = self.get_bigtable_key(key) - row = self.bt_table.direct_row(bt_key) - print( - f"[Bigtable]: _del with {key=} (={bt_key.decode('utf-8')}) -> {bt_key=} (={bt_key.decode('utf-8')})" - ) - row.delete() - row.commit() - except Exception as ex: - print( - f"FaustBigtableException Error in delete for " - f"table {self.table_name} exception {ex} key {key}" - ) - raise ex - - def _iterkeys(self) -> Iterator[bytes]: - try: - for row in self._iteritems(): - yield row[0] - except Exception as ex: - print( - f"FaustBigtableException Error in _iterkeys " - f"for table {self.table_name} exception {ex}" - ) - raise ex - - def _itervalues(self) -> Iterator[bytes]: - try: - for row in self._iteritems(): - yield row[1] - except Exception as ex: - print( - f"FaustBigtableException Error " - f"in _itervalues for table {self.table_name}" - f" exception {ex}" - ) - raise ex - - def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: - try: - end_key_str = self.table_name[:-1] + chr(ord(self.table_name[-1]) + 1) - end_key = end_key_str.encode() - start_key = self.table_name.encode() - for row in self.bt_table.read_rows(start_key=start_key, end_key=end_key): - yield ( - self.get_access_key(row.row_key), - self.bigtable_extract_row_data(row), - ) - except Exception as ex: - print( - f"FaustBigtableException Error " - f"in _iteritems for table {self.table_name}" - f" exception {ex}" - ) - raise ex - - -if __name__ == "__main__": - options = { - BigTableStoreTest.PROJECT_KEY: "smaxtec-system", - BigTableStoreTest.INSTANCE_KEY: "faust-cache-test", - BigTableStoreTest.TABLE_NAME_KEY: "sxfaust_cache", - } - key = "aaaa_123_bbbb".encode() - store = BigTableStoreTest(options) - bt_key = store.get_bigtable_key(key) - key_later = store.get_access_key(bt_key) - assert key == key_later - pass From cf584c80ca8c7076faf381d6c50717d4ba495405 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 10:55:50 +0200 Subject: [PATCH 029/616] better table-names --- faust/stores/bigtable.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 734dbe342..c4aa5227c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -32,6 +32,9 @@ def __init__( **kwargs: Any, ) -> None: self.offset_key_prefix = "offsets:".encode() + self.table_name = table.changelog_topic.get_topic_name().replace( + "-changelog", "" + ) try: logging.getLogger(__name__).error( f"BigTableStore: Making bigtablestore with {self.table_name=}" @@ -44,9 +47,7 @@ def __init__( options.get(BigTableStore.INSTANCE_KEY) ) - self.bt_table: Table = self.instance.table( - table.changelog_topic.get_topic_name() - ) + self.bt_table: Table = self.instance.table(self.table_name) if not self.bt_table.exists(): self.bt_table.create() From 1d724c578f8823de5b4059edca7d17777a80a503 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 11:17:50 +0200 Subject: [PATCH 030/616] changed naming for offsets --- faust/stores/bigtable.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c4aa5227c..4a88af1a4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -31,7 +31,7 @@ def __init__( options: typing.Dict[str, Any], **kwargs: Any, ) -> None: - self.offset_key_prefix = "offsets:".encode() + self.offset_key_prefix = "changelog_offset:".encode() self.table_name = table.changelog_topic.get_topic_name().replace( "-changelog", "" ) @@ -48,18 +48,12 @@ def __init__( ) self.bt_table: Table = self.instance.table(self.table_name) + column_families = list(self.bt_table.list_column_families().keys()) + if self.column_family not in column_families: if not self.bt_table.exists(): self.bt_table.create() column_family_id = "FaustColumnFamily" - self.column_family: column_family.ColumnFamily = ( - self.bt_table.column_family( - column_family_id, - gc_rule=column_family.MaxVersionsGCRule(1), - ) - ) - column_families = list(self.bt_table.list_column_families().keys()) - if self.column_family not in column_families: self.column_family.create() self.column_name = "DATA" From 15ecc54c872b69aa3b24e1fdcc14fb14d0d46d80 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 11:40:38 +0200 Subject: [PATCH 031/616] fixed indentation --- faust/stores/bigtable.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4a88af1a4..4f14964e0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -48,12 +48,18 @@ def __init__( ) self.bt_table: Table = self.instance.table(self.table_name) - column_families = list(self.bt_table.list_column_families().keys()) - if self.column_family not in column_families: if not self.bt_table.exists(): self.bt_table.create() column_family_id = "FaustColumnFamily" + self.column_family: column_family.ColumnFamily = ( + self.bt_table.column_family( + column_family_id, + gc_rule=column_family.MaxVersionsGCRule(1), + ) + ) + column_families = list(self.bt_table.list_column_families().keys()) + if self.column_family not in column_families: self.column_family.create() self.column_name = "DATA" From 8d12b7551ed56d396ff7153604d5c7ce4f30caa6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 12:24:22 +0200 Subject: [PATCH 032/616] added table name generator option --- faust/stores/bigtable.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4f14964e0..15df0b79e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -22,6 +22,7 @@ class BigTableStore(base.SerializedStore): bt_table: Table PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" + BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" def __init__( self, @@ -31,6 +32,10 @@ def __init__( options: typing.Dict[str, Any], **kwargs: Any, ) -> None: + table_name_generator = options.get( + BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name + ) + self.table_name = table_name_generator(table) self.offset_key_prefix = "changelog_offset:".encode() self.table_name = table.changelog_topic.get_topic_name().replace( "-changelog", "" From 77f822516813f109b9812cd3f63a9e12b4e1184e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 16:22:29 +0200 Subject: [PATCH 033/616] fixed table_name --- faust/stores/bigtable.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 15df0b79e..705bd8f52 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -37,9 +37,6 @@ def __init__( ) self.table_name = table_name_generator(table) self.offset_key_prefix = "changelog_offset:".encode() - self.table_name = table.changelog_topic.get_topic_name().replace( - "-changelog", "" - ) try: logging.getLogger(__name__).error( f"BigTableStore: Making bigtablestore with {self.table_name=}" From dc675ccf76c822e0f8d88273b79cbe547e170f6f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 16:52:58 +0200 Subject: [PATCH 034/616] fixed column family --- faust/stores/bigtable.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 705bd8f52..58ba0ac45 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -51,18 +51,12 @@ def __init__( self.bt_table: Table = self.instance.table(self.table_name) if not self.bt_table.exists(): - self.bt_table.create() - - column_family_id = "FaustColumnFamily" - self.column_family: column_family.ColumnFamily = ( - self.bt_table.column_family( - column_family_id, - gc_rule=column_family.MaxVersionsGCRule(1), + self.bt_table.create( + column_families={ + "FaustColumnFamily": column_family.MaxVersionsGCRule(1) + } ) - ) - column_families = list(self.bt_table.list_column_families().keys()) - if self.column_family not in column_families: - self.column_family.create() + self.column_family_id = self.bt_table.column_family.column_family_id self.column_name = "DATA" table.use_partitioner = True @@ -98,7 +92,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: try: row = self.bt_table.direct_row(key) row.set_cell( - self.column_family.column_family_id, + self.column_family_id, self.column_name, value, ) From b3df94162e1d74460e948938d14ddc7fd9c37f65 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 17:06:03 +0200 Subject: [PATCH 035/616] added logging configuration for start --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 58ba0ac45..9581e5483 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -61,7 +61,7 @@ def __init__( table.use_partitioner = True except Exception as ex: - self.log.error(f"Error configuring bigtable client {ex}") + logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") raise ex super().__init__(url, app, table, **kwargs) From be694e9931aa55849345819fc0ccc85e1aedf896 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 17:25:42 +0200 Subject: [PATCH 036/616] added column family name hardcoded will change to option later --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9581e5483..c644d3119 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -50,13 +50,13 @@ def __init__( ) self.bt_table: Table = self.instance.table(self.table_name) + self.column_family_id = "FaustColumnFamily" if not self.bt_table.exists(): self.bt_table.create( column_families={ - "FaustColumnFamily": column_family.MaxVersionsGCRule(1) + self.column_family_id: column_family.MaxVersionsGCRule(1) } ) - self.column_family_id = self.bt_table.column_family.column_family_id self.column_name = "DATA" table.use_partitioner = True From 15dc3a86e4642eb0f94c677bd77714700504a1ff Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 17:42:38 +0200 Subject: [PATCH 037/616] fixed wrong access to data --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c644d3119..c01847043 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -277,7 +277,7 @@ def apply_changelog_batch( row.delete() else: row.set_cell( - self.column_family.column_family_id, + self.column_family_id, self.column_name, msg.value, ) From 4e3f6b8d3922ac759e87211a1736a19a37397fb6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 18:22:00 +0200 Subject: [PATCH 038/616] fixed filter in get --- faust/stores/bigtable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c01847043..e2c9280e8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -51,6 +51,7 @@ def __init__( self.bt_table: Table = self.instance.table(self.table_name) self.column_family_id = "FaustColumnFamily" + self.row_filter = CellsColumnLimitFilter(1) if not self.bt_table.exists(): self.bt_table.create( column_families={ @@ -69,11 +70,10 @@ def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value def _get(self, key: bytes) -> Optional[bytes]: - filter = CellsColumnLimitFilter(1) try: res = self.bt_table.read_row( key, - filter_=filter, + filter_=self.row_filter ) if res is None: self.log.warning(f"[Bigtable] KeyError in _get with {key=}") @@ -162,7 +162,7 @@ def _contains(self, key: bytes) -> bool: try: res = self.bt_table.read_row( key, - filter_=filter, + filter_=self.row_filter ) return res is not None except Exception as ex: From d694f5dce93dd5e232573b401c2144064074050d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Sep 2022 18:58:07 +0200 Subject: [PATCH 039/616] fixed iter method --- faust/stores/bigtable.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e2c9280e8..017730f3d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -12,6 +12,7 @@ from faust.stores import base from faust.types import TP, AppT, CollectionT, EventT +from faust.types.tables import KT class BigTableStore(base.SerializedStore): @@ -173,6 +174,10 @@ def _contains(self, key: bytes) -> bool: ) raise ex + def __iter__(self) -> Iterator[KT]: + for k in self._iterkeys(): + yield k.decode() + def _clear(self) -> None: """This is typically used to clear data. From f315dd5eaf62812c96c70b8374a09ef9440b606f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 16 Sep 2022 07:01:49 +0200 Subject: [PATCH 040/616] removed iter again --- faust/stores/bigtable.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 017730f3d..f9cc06d66 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -12,7 +12,6 @@ from faust.stores import base from faust.types import TP, AppT, CollectionT, EventT -from faust.types.tables import KT class BigTableStore(base.SerializedStore): @@ -72,10 +71,7 @@ def bigtable_extract_row_data(self, row_data): def _get(self, key: bytes) -> Optional[bytes]: try: - res = self.bt_table.read_row( - key, - filter_=self.row_filter - ) + res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: self.log.warning(f"[Bigtable] KeyError in _get with {key=}") raise KeyError(f"row {key} not found in bigtable {self.table=}") @@ -161,10 +157,7 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - res = self.bt_table.read_row( - key, - filter_=self.row_filter - ) + res = self.bt_table.read_row(key, filter_=self.row_filter) return res is not None except Exception as ex: self.log.error( @@ -174,10 +167,6 @@ def _contains(self, key: bytes) -> bool: ) raise ex - def __iter__(self) -> Iterator[KT]: - for k in self._iterkeys(): - yield k.decode() - def _clear(self) -> None: """This is typically used to clear data. From 3adc0f15856eabc49d4df3df93a18457a99e4e49 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 16 Sep 2022 07:25:29 +0200 Subject: [PATCH 041/616] fixed wrong magic byte in offset key store --- faust/stores/bigtable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f9cc06d66..7b8cd403b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -36,7 +36,6 @@ def __init__( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) self.table_name = table_name_generator(table) - self.offset_key_prefix = "changelog_offset:".encode() try: logging.getLogger(__name__).error( f"BigTableStore: Making bigtablestore with {self.table_name=}" @@ -65,6 +64,7 @@ def __init__( logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") raise ex super().__init__(url, app, table, **kwargs) + self.offset_key_prefix = "changelog_offset:" def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value @@ -184,13 +184,13 @@ def reset_state(self) -> None: ... def get_offset_key(self, tp: TP): - return self.offset_key_prefix + str(tp.partition).encode() + return self._encode_key(self.offset_key_prefix + str(tp.partition)) def persisted_offset(self, tp: TP) -> Optional[int]: """Return the last persisted offset. See :meth:`set_persisted_offset`. """ - offset_key = self.offset_key_prefix + str(tp.partition).encode() + offset_key = self.get_offset_key(tp) try: offset = self._get(offset_key) except KeyError: From 6cda4e2ba5116aa5471e09a342fed156fe2500bc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 16 Sep 2022 07:56:47 +0200 Subject: [PATCH 042/616] added offset table --- faust/stores/bigtable.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7b8cd403b..9bb5eef7f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -49,8 +49,16 @@ def __init__( ) self.bt_table: Table = self.instance.table(self.table_name) + self.bt_offset_table: Table = self.instance.tale("offsets") + self.column_family_id = "FaustColumnFamily" self.row_filter = CellsColumnLimitFilter(1) + if not self.bt_offset_table.exists(): + self.bt_offset_table.create( + column_families={ + self.column_family_id: column_family.MaxVersionsGCRule(1) + } + ) if not self.bt_table.exists(): self.bt_table.create( column_families={ @@ -64,7 +72,7 @@ def __init__( logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") raise ex super().__init__(url, app, table, **kwargs) - self.offset_key_prefix = "changelog_offset:" + self.offset_key_prefix = f"{self.table_name}_" def bigtable_extract_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value @@ -184,19 +192,17 @@ def reset_state(self) -> None: ... def get_offset_key(self, tp: TP): - return self._encode_key(self.offset_key_prefix + str(tp.partition)) + return (self.offset_key_prefix + str(tp.partition)) def persisted_offset(self, tp: TP) -> Optional[int]: """Return the last persisted offset. See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp) - try: - offset = self._get(offset_key) - except KeyError: - offset = None - if offset is not None: - return int(offset) + row_res = self.bt_offset_table.read_row(offset_key, filter_=self.row_filter) + if row_res is not None: + offset = int(self.bigtable_extract_row_data(row_res)) + return offset return None def set_persisted_offset(self, tp: TP, offset: int) -> None: @@ -208,7 +214,13 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: we were not an active replica. """ offset_key = self.get_offset_key(tp) - self._set(offset_key, str(offset).encode()) + row = self.bt_offset_table.direct_row(offset_key) + row.set_cell( + self.column_family_id, + self.column_name, + str(offset).encode(), + ) + row.commit() async def backup_partition( self, From b8069c693161ad55f3388050259b887192150bec Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 16 Sep 2022 08:17:32 +0200 Subject: [PATCH 043/616] fixed typo --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9bb5eef7f..a5920f061 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -49,7 +49,7 @@ def __init__( ) self.bt_table: Table = self.instance.table(self.table_name) - self.bt_offset_table: Table = self.instance.tale("offsets") + self.bt_offset_table: Table = self.instance.table("offsets") self.column_family_id = "FaustColumnFamily" self.row_filter = CellsColumnLimitFilter(1) From 7005069ac7db8fc6e2eeef299e5cb0976df08e89 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Sep 2022 10:07:45 +0200 Subject: [PATCH 044/616] added a table per partition --- faust/stores/bigtable.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a5920f061..79f8b2bc0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -2,6 +2,7 @@ import logging import typing from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union +from faust.streams import current_event from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -14,6 +15,12 @@ from faust.types import TP, AppT, CollectionT, EventT +def get_current_partition(): + event = current_event() + assert event is not None + return event.message.partition + + class BigTableStore(base.SerializedStore): """Bigtable table storage.""" @@ -23,6 +30,7 @@ class BigTableStore(base.SerializedStore): PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" + BT_READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" def __init__( self, @@ -35,7 +43,11 @@ def __init__( table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) + self.bt_start_key, self.bt_end_key = options.get( + BigTableStore.BT_READ_ROWS_BORDERS_KEY, [None, None] + ) self.table_name = table_name_generator(table) + self.table_name += f":{get_current_partition()}" try: logging.getLogger(__name__).error( f"BigTableStore: Making bigtablestore with {self.table_name=}" @@ -49,16 +61,8 @@ def __init__( ) self.bt_table: Table = self.instance.table(self.table_name) - self.bt_offset_table: Table = self.instance.table("offsets") self.column_family_id = "FaustColumnFamily" - self.row_filter = CellsColumnLimitFilter(1) - if not self.bt_offset_table.exists(): - self.bt_offset_table.create( - column_families={ - self.column_family_id: column_family.MaxVersionsGCRule(1) - } - ) if not self.bt_table.exists(): self.bt_table.create( column_families={ @@ -146,7 +150,10 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - for row in self.bt_table.read_rows(): + for row in self.bt_table.read_rows( + start_key=self.bt_start_key, + end_key=self.bt_end_key, + ): yield ( row.row_key, self.bigtable_extract_row_data(row), @@ -192,14 +199,14 @@ def reset_state(self) -> None: ... def get_offset_key(self, tp: TP): - return (self.offset_key_prefix + str(tp.partition)) + return self.offset_key_prefix + str(tp.partition) def persisted_offset(self, tp: TP) -> Optional[int]: """Return the last persisted offset. See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp) - row_res = self.bt_offset_table.read_row(offset_key, filter_=self.row_filter) + row_res = self.bt_table.read_row(offset_key, filter_=self.row_filter) if row_res is not None: offset = int(self.bigtable_extract_row_data(row_res)) return offset @@ -214,7 +221,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: we were not an active replica. """ offset_key = self.get_offset_key(tp) - row = self.bt_offset_table.direct_row(offset_key) + row = self.bt_table.direct_row(offset_key) row.set_cell( self.column_family_id, self.column_name, From 1b51297364013c539cdc54e20cff7316551b5045 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Sep 2022 11:36:43 +0200 Subject: [PATCH 045/616] added partitioning --- faust/stores/bigtable.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 79f8b2bc0..eaf6ae6f7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -43,11 +43,11 @@ def __init__( table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) + self.table_name = table_name_generator(table) + self.bt_start_key, self.bt_end_key = options.get( - BigTableStore.BT_READ_ROWS_BORDERS_KEY, [None, None] + BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) - self.table_name = table_name_generator(table) - self.table_name += f":{get_current_partition()}" try: logging.getLogger(__name__).error( f"BigTableStore: Making bigtablestore with {self.table_name=}" @@ -61,7 +61,7 @@ def __init__( ) self.bt_table: Table = self.instance.table(self.table_name) - + self.row_filter = CellsColumnLimitFilter(1) self.column_family_id = "FaustColumnFamily" if not self.bt_table.exists(): self.bt_table.create( @@ -83,6 +83,7 @@ def bigtable_extract_row_data(self, row_data): def _get(self, key: bytes) -> Optional[bytes]: try: + res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: self.log.warning(f"[Bigtable] KeyError in _get with {key=}") @@ -99,6 +100,8 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: + partition_prefix = get_current_partition().to_bytes(1, "little") + key = b"".join([partition_prefix, key]) row = self.bt_table.direct_row(key) row.set_cell( self.column_family_id, @@ -115,6 +118,8 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: + partition_prefix = get_current_partition().to_bytes(1, "little") + key = b"".join([partition_prefix, key]) row = self.bt_table.direct_row(key) row.delete() row.commit() @@ -150,12 +155,17 @@ def _itervalues(self) -> Iterator[bytes]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: + partition_prefix = get_current_partition().to_bytes(1, "little") + start_key = b"".join([partition_prefix, self.bt_start_key]) + end_key = b"".join([partition_prefix, self.bt_end_key]) + end_key = partition_prefix + self.bt_end_key + for row in self.bt_table.read_rows( - start_key=self.bt_start_key, - end_key=self.bt_end_key, + start_key=start_key, + end_key=end_key, ): yield ( - row.row_key, + row.row_key.replace(partition_prefix, b""), self.bigtable_extract_row_data(row), ) except Exception as ex: @@ -172,6 +182,8 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: + partition_prefix = get_current_partition().to_bytes(1, "little") + key = partition_prefix + key res = self.bt_table.read_row(key, filter_=self.row_filter) return res is not None except Exception as ex: @@ -285,7 +297,9 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - row = self.bt_table.direct_row(msg.key) + partition_bytes: bytes = tp.partition.to_bytes(1, "little") + offset_key = b"".join([partition_bytes, msg.key]) + row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() else: From 5f6fe475d9067aa4d3a44869f8c655caaaa4a982 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Sep 2022 11:40:45 +0200 Subject: [PATCH 046/616] fixed wrong naming --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index eaf6ae6f7..394c8ea5d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -227,7 +227,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: def set_persisted_offset(self, tp: TP, offset: int) -> None: """Set the last persisted offset for this table. - This will remember the last offset that we wrote to RocksDB, + This will remember the last offset that we wrote to BigTableStore, so that on rebalance/recovery we can seek past this point to only read the events that occurred recently while we were not an active replica. @@ -280,7 +280,7 @@ def apply_changelog_batch( to_key: Callable[[Any], Any], to_value: Callable[[Any], Any], ) -> None: - """Write batch of changelog events to local RocksDB storage. + """Write batch of changelog events to local BigTableStore storage. Arguments: batch: Iterable of changelog events (:class:`faust.Event`) From 87ce9409dff7e8790f0bc89de7f9462b5c11e5c8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Sep 2022 13:27:05 +0200 Subject: [PATCH 047/616] fixed wrong get --- faust/stores/bigtable.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 394c8ea5d..4fdf77a22 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -2,6 +2,7 @@ import logging import typing from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union + from faust.streams import current_event from google.cloud.bigtable import column_family @@ -83,7 +84,8 @@ def bigtable_extract_row_data(self, row_data): def _get(self, key: bytes) -> Optional[bytes]: try: - + partition_prefix = get_current_partition().to_bytes(1, "little") + key = b"".join([partition_prefix, key]) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: self.log.warning(f"[Bigtable] KeyError in _get with {key=}") @@ -232,14 +234,21 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: to only read the events that occurred recently while we were not an active replica. """ - offset_key = self.get_offset_key(tp) - row = self.bt_table.direct_row(offset_key) - row.set_cell( - self.column_family_id, - self.column_name, - str(offset).encode(), - ) - row.commit() + try: + offset_key = self.get_offset_key(tp) + row = self.bt_table.direct_row(offset_key) + row.set_cell( + self.column_family_id, + self.column_name, + str(offset).encode(), + ) + row.commit() + except Exception as e: + self.log.error( + f"Failed to commit offset for {self.table.name}" + " -> will crash faust app!" + ) + self.app._crash(e) async def backup_partition( self, From c299926b742881d3071848afbb98d11282ac1ba9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Sep 2022 09:46:56 +0200 Subject: [PATCH 048/616] clean up --- faust/stores/bigtable.py | 156 +++++++++++++++++++++------------------ 1 file changed, 83 insertions(+), 73 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4fdf77a22..feb629025 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -31,7 +31,7 @@ class BigTableStore(base.SerializedStore): PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" - BT_READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" + READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" def __init__( self, @@ -45,14 +45,12 @@ def __init__( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) self.table_name = table_name_generator(table) + self.offset_key_prefix = f"{self.table_name}_" self.bt_start_key, self.bt_end_key = options.get( - BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] + BigTableStore.READ_ROWS_BORDERS_KEY, [b"", b""] ) try: - logging.getLogger(__name__).error( - f"BigTableStore: Making bigtablestore with {self.table_name=}" - ) self.client: Client = Client( options.get(BigTableStore.PROJECT_KEY), admin=True, @@ -60,37 +58,66 @@ def __init__( self.instance: Instance = self.client.instance( options.get(BigTableStore.INSTANCE_KEY) ) - - self.bt_table: Table = self.instance.table(self.table_name) self.row_filter = CellsColumnLimitFilter(1) - self.column_family_id = "FaustColumnFamily" - if not self.bt_table.exists(): - self.bt_table.create( - column_families={ - self.column_family_id: column_family.MaxVersionsGCRule(1) - } - ) self.column_name = "DATA" + self = self._bigtable_setup_table() table.use_partitioner = True except Exception as ex: logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") raise ex super().__init__(url, app, table, **kwargs) - self.offset_key_prefix = f"{self.table_name}_" - def bigtable_extract_row_data(self, row_data): + def _bigtable_setup_table(self): + self.bt_table: Table = self.instance.table(self.table_name) + self.column_family_id = "FaustColumnFamily" + if not self.bt_table.exists(): + logging.getLogger(__name__).info( + f"BigTableStore: Making new bigtablestore with {self.table_name=}" + ) + self.bt_table.create( + column_families={ + self.column_family_id: column_family.MaxVersionsGCRule(1) + } + ) + else: + logging.getLogger(__name__).info( + "BigTableStore: Using existing" f"bigtablestore with {self.table_name=}" + ) + + def _bigtable_exrtact_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value + def _get_key_with_partition(self, key: bytes): + partition_prefix = get_current_partition().to_bytes(1, "little") + key = b"".join([partition_prefix, key]) + return key + + def _bigtbale_get(self, key: bytes): + res = self.bt_table.read_row(key, filter_=self.row_filter) + if res is None: + self.log.warning(f"[Bigtable] KeyError in _get with {key=}") + raise KeyError(f"row {key} not found in bigtable {self.table=}") + return self._bigtable_exrtact_row_data(res) + + def _bigtbale_set(self, key: bytes, value: Optional[bytes]): + row = self.bt_table.direct_row(key) + row.set_cell( + self.column_family_id, + self.column_name, + value, + ) + row.commit() + + def _bigtbale_del(self, key: bytes): + row = self.bt_table.direct_row(key) + row.delete() + row.commit() + def _get(self, key: bytes) -> Optional[bytes]: try: - partition_prefix = get_current_partition().to_bytes(1, "little") - key = b"".join([partition_prefix, key]) - res = self.bt_table.read_row(key, filter_=self.row_filter) - if res is None: - self.log.warning(f"[Bigtable] KeyError in _get with {key=}") - raise KeyError(f"row {key} not found in bigtable {self.table=}") - return self.bigtable_extract_row_data(res) + key = self._get_key_with_partition(key) + return self._bigtbale_get(key) except ValueError as ex: self.log.debug(f"key not found {key} exception {ex}") raise KeyError(f"key not found {key}") @@ -102,15 +129,8 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - partition_prefix = get_current_partition().to_bytes(1, "little") - key = b"".join([partition_prefix, key]) - row = self.bt_table.direct_row(key) - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) - row.commit() + key = self._get_key_with_partition(key) + self._bigtbale_set(key, value) except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -120,11 +140,8 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: - partition_prefix = get_current_partition().to_bytes(1, "little") - key = b"".join([partition_prefix, key]) - row = self.bt_table.direct_row(key) - row.delete() - row.commit() + key = self._get_key_with_partition(key) + self._bigtbale_del(key) except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " @@ -160,7 +177,6 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: partition_prefix = get_current_partition().to_bytes(1, "little") start_key = b"".join([partition_prefix, self.bt_start_key]) end_key = b"".join([partition_prefix, self.bt_end_key]) - end_key = partition_prefix + self.bt_end_key for row in self.bt_table.read_rows( start_key=start_key, @@ -168,7 +184,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: ): yield ( row.row_key.replace(partition_prefix, b""), - self.bigtable_extract_row_data(row), + self._bigtable_exrtact_row_data(row), ) except Exception as ex: self.log.error( @@ -222,7 +238,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: offset_key = self.get_offset_key(tp) row_res = self.bt_table.read_row(offset_key, filter_=self.row_filter) if row_res is not None: - offset = int(self.bigtable_extract_row_data(row_res)) + offset = int(self._bigtable_exrtact_row_data(row_res)) return offset return None @@ -235,14 +251,8 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: we were not an active replica. """ try: - offset_key = self.get_offset_key(tp) - row = self.bt_table.direct_row(offset_key) - row.set_cell( - self.column_family_id, - self.column_name, - str(offset).encode(), - ) - row.commit() + offset_key = self.get_offset_key(tp).encode() + self._bigtbale_set(offset_key, str(offset).encode()) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" @@ -250,30 +260,6 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: ) self.app._crash(e) - async def backup_partition( - self, - tp: Union[TP, int], - flush: bool = True, - purge: bool = False, - keep: int = 1, - ) -> None: - """Backup partition from this store. - - Not yet implemented for Bigtable. - - """ - raise NotImplementedError("Not yet implemented for Bigtable.") - - def restore_backup( - self, tp: Union[TP, int], latest: bool = True, backup_id: int = 0 - ) -> None: - """Restore partition backup from this store. - - Not yet implemented for Bigtable. - - """ - raise NotImplementedError("Not yet implemented for Bigtable.") - def _persist_changelog_batch(self, row_mutations, tp_offsets): response = self.bt_table.mutate_rows(row_mutations) for i, status in enumerate(response): @@ -306,7 +292,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - partition_bytes: bytes = tp.partition.to_bytes(1, "little") + partition_bytes = tp.partition.to_bytes(1, "little") offset_key = b"".join([partition_bytes, msg.key]) row = self.bt_table.direct_row(offset_key) if msg.value is None: @@ -322,3 +308,27 @@ def apply_changelog_batch( row_mutations, tp_offsets, ) + + async def backup_partition( + self, + tp: Union[TP, int], + flush: bool = True, + purge: bool = False, + keep: int = 1, + ) -> None: + """Backup partition from this store. + + Not yet implemented for Bigtable. + + """ + raise NotImplementedError("Not yet implemented for Bigtable.") + + def restore_backup( + self, tp: Union[TP, int], latest: bool = True, backup_id: int = 0 + ) -> None: + """Restore partition backup from this store. + + Not yet implemented for Bigtable. + + """ + raise NotImplementedError("Not yet implemented for Bigtable.") From 356c12cbaa5d5c5f98ad0edf2b45a2b1e4f6c737 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Sep 2022 13:18:06 +0200 Subject: [PATCH 049/616] added logging temp --- faust/stores/bigtable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index feb629025..56ad36cb6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -66,6 +66,7 @@ def __init__( except Exception as ex: logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") raise ex + logging.getLogger(__name__).error(f"Kwargs for super are: {kwargs}") super().__init__(url, app, table, **kwargs) def _bigtable_setup_table(self): @@ -292,8 +293,9 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message + key: bytes = msg.key partition_bytes = tp.partition.to_bytes(1, "little") - offset_key = b"".join([partition_bytes, msg.key]) + offset_key = b"".join([partition_bytes, key]) row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() From 59d68c41ad7d1c37d10797a06e0a2d49479f60d9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Sep 2022 14:49:57 +0200 Subject: [PATCH 050/616] removed logging again --- faust/stores/bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 56ad36cb6..bd1e670b8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -66,7 +66,6 @@ def __init__( except Exception as ex: logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") raise ex - logging.getLogger(__name__).error(f"Kwargs for super are: {kwargs}") super().__init__(url, app, table, **kwargs) def _bigtable_setup_table(self): From df6b5f1f9b286bc69436831db72bba4c4bec6dbb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Sep 2022 15:10:16 +0200 Subject: [PATCH 051/616] fixed wrong parameter --- faust/stores/bigtable.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bd1e670b8..ef200c773 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -44,8 +44,8 @@ def __init__( table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) - self.table_name = table_name_generator(table) - self.offset_key_prefix = f"{self.table_name}_" + self.bt_table_name = table_name_generator(table) + self.offset_key_prefix = f"{self.bt_table_name}_" self.bt_start_key, self.bt_end_key = options.get( BigTableStore.READ_ROWS_BORDERS_KEY, [b"", b""] @@ -64,16 +64,15 @@ def __init__( table.use_partitioner = True except Exception as ex: - logging.getLogger(__name__).error(f"Error configuring bigtable client {ex}") raise ex super().__init__(url, app, table, **kwargs) def _bigtable_setup_table(self): - self.bt_table: Table = self.instance.table(self.table_name) + self.bt_table: Table = self.instance.table(self.bt_table_name) self.column_family_id = "FaustColumnFamily" if not self.bt_table.exists(): logging.getLogger(__name__).info( - f"BigTableStore: Making new bigtablestore with {self.table_name=}" + f"BigTableStore: Making new bigtablestore with {self.bt_table_name=}" ) self.bt_table.create( column_families={ @@ -82,7 +81,7 @@ def _bigtable_setup_table(self): ) else: logging.getLogger(__name__).info( - "BigTableStore: Using existing" f"bigtablestore with {self.table_name=}" + "BigTableStore: Using existing" f"bigtablestore with {self.bt_table_name=}" ) def _bigtable_exrtact_row_data(self, row_data): From 052172e2189e193943558e4436393ea448f27a8d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Sep 2022 15:16:23 +0200 Subject: [PATCH 052/616] formatting --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ef200c773..328f7e971 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -81,7 +81,8 @@ def _bigtable_setup_table(self): ) else: logging.getLogger(__name__).info( - "BigTableStore: Using existing" f"bigtablestore with {self.bt_table_name=}" + "BigTableStore: Using existing" + f"bigtablestore with {self.bt_table_name=}" ) def _bigtable_exrtact_row_data(self, row_data): From a188986f71760f9770088124be8ddc91e79a279d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Sep 2022 15:42:38 +0200 Subject: [PATCH 053/616] added logging --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 328f7e971..9a7f40cc9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -64,6 +64,7 @@ def __init__( table.use_partitioner = True except Exception as ex: + logger.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) From 80ac87e67b3a9f2ab2a1fadfa85ee738305706f4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Sep 2022 16:00:32 +0200 Subject: [PATCH 054/616] removed assignment to self --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9a7f40cc9..cf4fdd6c1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -60,11 +60,11 @@ def __init__( ) self.row_filter = CellsColumnLimitFilter(1) self.column_name = "DATA" - self = self._bigtable_setup_table() + self._bigtable_setup_table() table.use_partitioner = True except Exception as ex: - logger.getLogger(__name__).error(f"Error in Bigtable init {ex}") + logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) From ccaf339222ec0934662a921f8d523eda26d75075 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Sep 2022 16:32:27 +0200 Subject: [PATCH 055/616] made offset key prefix static --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index cf4fdd6c1..186c06c90 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -45,7 +45,6 @@ def __init__( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) self.bt_table_name = table_name_generator(table) - self.offset_key_prefix = f"{self.bt_table_name}_" self.bt_start_key, self.bt_end_key = options.get( BigTableStore.READ_ROWS_BORDERS_KEY, [b"", b""] @@ -66,6 +65,7 @@ def __init__( except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex + self.offset_key_prefix = f"offset_partitiion:" super().__init__(url, app, table, **kwargs) def _bigtable_setup_table(self): From 3a32f8769462254a1aa52205b8376e9ec9286679 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 08:30:19 +0200 Subject: [PATCH 056/616] added key_cache for delete and garbage collection --- faust/stores/bigtable.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 186c06c90..5b5b917db 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -3,6 +3,8 @@ import typing from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union +from mode.utils.collections import LRUCache + from faust.streams import current_event from google.cloud.bigtable import column_family @@ -28,6 +30,8 @@ class BigTableStore(base.SerializedStore): client: Client instance: Instance bt_table: Table + + _key_index: LRUCache[bytes, int] PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" @@ -41,6 +45,7 @@ def __init__( options: typing.Dict[str, Any], **kwargs: Any, ) -> None: + self._key_index = LRUCache(limit=app.conf.table_key_index_size) table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) @@ -89,8 +94,11 @@ def _bigtable_setup_table(self): def _bigtable_exrtact_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value - def _get_key_with_partition(self, key: bytes): - partition_prefix = get_current_partition().to_bytes(1, "little") + def _get_key_with_partition(self, key: bytes, partition: Optional[int] = None): + if partition: + partition_prefix = partition.to_bytes(1, "little") + else: + partition_prefix = get_current_partition().to_bytes(1, "little") key = b"".join([partition_prefix, key]) return key @@ -110,6 +118,14 @@ def _bigtbale_set(self, key: bytes, value: Optional[bytes]): ) row.commit() + def _partitions_for_key(self, key: bytes) -> Iterable[int]: + # Returns cached db if key is in index, otherwise all dbs + # for linear search. + try: + return self._key_index[key] + except KeyError: + return range(self.table.changelog_topic.partitions) + def _bigtbale_del(self, key: bytes): row = self.bt_table.direct_row(key) row.delete() @@ -130,7 +146,10 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - key = self._get_key_with_partition(key) + partition = get_current_partition() + self._key_index[key] = partition + key = self._get_key_with_partition(key, partition=partition) + self._bigtbale_set(key, value) except Exception as ex: self.log.error( @@ -141,8 +160,9 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: - key = self._get_key_with_partition(key) - self._bigtbale_del(key) + for partition in self._partitions_for_key(key): + key = self._get_key_with_partition(key, partition=partition) + self._bigtbale_del(key) except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " @@ -201,10 +221,12 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - partition_prefix = get_current_partition().to_bytes(1, "little") - key = partition_prefix + key - res = self.bt_table.read_row(key, filter_=self.row_filter) - return res is not None + for partition in self._partitions_for_key(key): + key = self._get_key_with_partition(key, partition=partition) + res = self.bt_table.read_row(key, filter_=self.row_filter) + if res is not None: + return True + return False except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " From 6405405299dc536683e1941e41181d56c521299f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 09:44:33 +0200 Subject: [PATCH 057/616] better partitioning for methods that don't have a current event --- faust/stores/bigtable.py | 58 +++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5b5b917db..bd63b6373 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -70,7 +70,7 @@ def __init__( except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex - self.offset_key_prefix = f"offset_partitiion:" + self.offset_key_prefix = "offset_partitiion:" super().__init__(url, app, table, **kwargs) def _bigtable_setup_table(self): @@ -105,8 +105,7 @@ def _get_key_with_partition(self, key: bytes, partition: Optional[int] = None): def _bigtbale_get(self, key: bytes): res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: - self.log.warning(f"[Bigtable] KeyError in _get with {key=}") - raise KeyError(f"row {key} not found in bigtable {self.table=}") + return None return self._bigtable_exrtact_row_data(res) def _bigtbale_set(self, key: bytes, value: Optional[bytes]): @@ -122,7 +121,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: # Returns cached db if key is in index, otherwise all dbs # for linear search. try: - return self._key_index[key] + return [self._key_index[key]] except KeyError: return range(self.table.changelog_topic.partitions) @@ -133,11 +132,17 @@ def _bigtbale_del(self, key: bytes): def _get(self, key: bytes) -> Optional[bytes]: try: - key = self._get_key_with_partition(key) - return self._bigtbale_get(key) - except ValueError as ex: - self.log.debug(f"key not found {key} exception {ex}") - raise KeyError(f"key not found {key}") + for partition in self._partitions_for_key(key): + key = self._get_key_with_partition(key, partition=partition) + key_with_partition = self._get_key_with_partition(key) + value = self._bigtbale_get(key_with_partition) + if value is not None: + self._key_index[key] = partition + return value + raise KeyError + except KeyError as ke: + self.log.error(f"KeyError in get for table {self.table_name} for {key=}") + raise ke except Exception as ex: self.log.error( f"Error in get for table {self.table_name} exception {ex} key {key}" @@ -193,20 +198,31 @@ def _itervalues(self) -> Iterator[bytes]: ) raise ex + def _active_partitions(self) -> Iterator[int]: + actives = self.app.assignor.assigned_actives() + topic = self.table.changelog_topic_name + for partition in range(self.table.partitions): + tp = TP(topic=topic, partition=partition) + # for global tables, keys from all + # partitions are available. + if tp in actives or self.table.is_global: + yield partition + def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - partition_prefix = get_current_partition().to_bytes(1, "little") - start_key = b"".join([partition_prefix, self.bt_start_key]) - end_key = b"".join([partition_prefix, self.bt_end_key]) - - for row in self.bt_table.read_rows( - start_key=start_key, - end_key=end_key, - ): - yield ( - row.row_key.replace(partition_prefix, b""), - self._bigtable_exrtact_row_data(row), - ) + for partition in self._active_partitions(): + partition_prefix = partition.to_bytes(1, "little") + start_key = b"".join([partition_prefix, self.bt_start_key]) + end_key = b"".join([partition_prefix, self.bt_end_key]) + + for row in self.bt_table.read_rows( + start_key=start_key, + end_key=end_key, + ): + yield ( + row.row_key.replace(partition_prefix, b""), + self._bigtable_exrtact_row_data(row), + ) except Exception as ex: self.log.error( f"FaustBigtableException Error " From ad0b44ced3efdf1463562f69551ac8ae5c6496c3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 10:13:33 +0200 Subject: [PATCH 058/616] take partitions from topic partitions --- faust/stores/bigtable.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bd63b6373..8f7d26dd3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -123,7 +123,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: return [self._key_index[key]] except KeyError: - return range(self.table.changelog_topic.partitions) + return range(self.app.conf.topic_partitions) def _bigtbale_del(self, key: bytes): row = self.bt_table.direct_row(key) @@ -201,7 +201,7 @@ def _itervalues(self) -> Iterator[bytes]: def _active_partitions(self) -> Iterator[int]: actives = self.app.assignor.assigned_actives() topic = self.table.changelog_topic_name - for partition in range(self.table.partitions): + for partition in range(self.app.conf.topic_partitions): tp = TP(topic=topic, partition=partition) # for global tables, keys from all # partitions are available. @@ -237,11 +237,19 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - for partition in self._partitions_for_key(key): - key = self._get_key_with_partition(key, partition=partition) + event = current_event() + partition_from_message = ( + event is not None + and not self.table.is_global + and not self.table.use_partitioner + ) + if partition_from_message: + key = self._get_key_with_partition(key, partition=event.message.partition) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: return True + else: + for partition in self._partitions_for_key(key): return False except Exception as ex: self.log.error( From 9e04efdd56924484e9a625073688821073772991 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 10:28:15 +0200 Subject: [PATCH 059/616] removed partitioner from contains --- faust/stores/bigtable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8f7d26dd3..f08e2d536 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -241,7 +241,6 @@ def _contains(self, key: bytes) -> bool: partition_from_message = ( event is not None and not self.table.is_global - and not self.table.use_partitioner ) if partition_from_message: key = self._get_key_with_partition(key, partition=event.message.partition) @@ -250,6 +249,10 @@ def _contains(self, key: bytes) -> bool: return True else: for partition in self._partitions_for_key(key): + key = self._get_key_with_partition(key, partition=partition) + res = self.bt_table.read_row(key, filter_=self.row_filter) + if res is not None: + return True return False except Exception as ex: self.log.error( From b65a97a3f38987656405954bd11853c96bccc6a1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 11:09:59 +0200 Subject: [PATCH 060/616] fixed wrong if clause --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f08e2d536..ea42eaa56 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,5 +1,6 @@ """BigTable storage.""" import logging +from os import wait import typing from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union @@ -95,7 +96,7 @@ def _bigtable_exrtact_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value def _get_key_with_partition(self, key: bytes, partition: Optional[int] = None): - if partition: + if partition is not None: partition_prefix = partition.to_bytes(1, "little") else: partition_prefix = get_current_partition().to_bytes(1, "little") From b9b755995cf403f996cbf26fe98ca4272104dccc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 11:34:25 +0200 Subject: [PATCH 061/616] added logging --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ea42eaa56..2bb4bf63a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -140,6 +140,8 @@ def _get(self, key: bytes) -> Optional[bytes]: if value is not None: self._key_index[key] = partition return value + else: + self.log.info(f"Key {key_with_partition} not in {self.table_name}") raise KeyError except KeyError as ke: self.log.error(f"KeyError in get for table {self.table_name} for {key=}") From c384c2b13fb1aadd77aa07b50616f2d6da0eb228 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 11:42:32 +0200 Subject: [PATCH 062/616] return different iterable for linear search --- faust/stores/bigtable.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2bb4bf63a..31e0ebec9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,6 +1,5 @@ """BigTable storage.""" import logging -from os import wait import typing from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union @@ -119,12 +118,10 @@ def _bigtbale_set(self, key: bytes, value: Optional[bytes]): row.commit() def _partitions_for_key(self, key: bytes) -> Iterable[int]: - # Returns cached db if key is in index, otherwise all dbs - # for linear search. try: return [self._key_index[key]] except KeyError: - return range(self.app.conf.topic_partitions) + return [x for x in range(self.app.conf.topic_partitions)] def _bigtbale_del(self, key: bytes): row = self.bt_table.direct_row(key) @@ -155,10 +152,9 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: partition = get_current_partition() + key_with_partition = self._get_key_with_partition(key, partition=partition) + self._bigtbale_set(key_with_partition, value) self._key_index[key] = partition - key = self._get_key_with_partition(key, partition=partition) - - self._bigtbale_set(key, value) except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " From c1c887daf0fb3abb43cd235f6f3d6daa622acc86 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 12:01:48 +0200 Subject: [PATCH 063/616] fixed get --- faust/stores/bigtable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 31e0ebec9..01872be61 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -121,7 +121,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: return [self._key_index[key]] except KeyError: - return [x for x in range(self.app.conf.topic_partitions)] + return range(self.app.conf.topic_partitions) def _bigtbale_del(self, key: bytes): row = self.bt_table.direct_row(key) @@ -131,8 +131,7 @@ def _bigtbale_del(self, key: bytes): def _get(self, key: bytes) -> Optional[bytes]: try: for partition in self._partitions_for_key(key): - key = self._get_key_with_partition(key, partition=partition) - key_with_partition = self._get_key_with_partition(key) + key_with_partition = self._get_key_with_partition(key, partition=partition) value = self._bigtbale_get(key_with_partition) if value is not None: self._key_index[key] = partition From a7833d653a429ea1ed3517663003faf45fe88de7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 12:02:05 +0200 Subject: [PATCH 064/616] fixed logging --- faust/stores/bigtable.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 01872be61..537f490c1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -131,13 +131,13 @@ def _bigtbale_del(self, key: bytes): def _get(self, key: bytes) -> Optional[bytes]: try: for partition in self._partitions_for_key(key): - key_with_partition = self._get_key_with_partition(key, partition=partition) + key_with_partition = self._get_key_with_partition( + key, partition=partition + ) value = self._bigtbale_get(key_with_partition) if value is not None: self._key_index[key] = partition return value - else: - self.log.info(f"Key {key_with_partition} not in {self.table_name}") raise KeyError except KeyError as ke: self.log.error(f"KeyError in get for table {self.table_name} for {key=}") @@ -236,12 +236,11 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: event = current_event() - partition_from_message = ( - event is not None - and not self.table.is_global - ) + partition_from_message = event is not None and not self.table.is_global if partition_from_message: - key = self._get_key_with_partition(key, partition=event.message.partition) + key = self._get_key_with_partition( + key, partition=event.message.partition + ) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: return True From 429aa23dd1b16f09ce80e2a937a098fe1c04252c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 12:03:38 +0200 Subject: [PATCH 065/616] fixed _del --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 537f490c1..7059d7fea 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -164,8 +164,8 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: for partition in self._partitions_for_key(key): - key = self._get_key_with_partition(key, partition=partition) - self._bigtbale_del(key) + key_with_partition = self._get_key_with_partition(key, partition=partition) + self._bigtbale_del(key_with_partition) except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " From 5c72a615d459fd117d4ddbb7041cc167dd830d5c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 12:26:54 +0200 Subject: [PATCH 066/616] fixed _iterkeys --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7059d7fea..b74a6244e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -218,7 +218,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: end_key=end_key, ): yield ( - row.row_key.replace(partition_prefix, b""), + row.row_key[1:], self._bigtable_exrtact_row_data(row), ) except Exception as ex: From 34aa064a7b8db825245a855305405a3adaef1a09 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Sep 2022 15:42:06 +0200 Subject: [PATCH 067/616] added logging for iterkeys --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b74a6244e..6f9439883 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -175,8 +175,10 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: + self.log.info(f"Started _iterkeys for {self.table_name}") for row in self._iteritems(): yield row[0] + self.log.info(f"Finished _iterkeys for {self.table_name}") except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " From 233f237a6157aecacfecc2deba27ab93b109d73b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Sep 2022 10:51:27 +0200 Subject: [PATCH 068/616] removed partitioner and made contains little faster --- faust/stores/bigtable.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6f9439883..f38b41d3a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -65,8 +65,6 @@ def __init__( self.row_filter = CellsColumnLimitFilter(1) self.column_name = "DATA" self._bigtable_setup_table() - - table.use_partitioner = True except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -164,8 +162,12 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: for partition in self._partitions_for_key(key): - key_with_partition = self._get_key_with_partition(key, partition=partition) + key_with_partition = self._get_key_with_partition( + key, partition=partition + ) self._bigtbale_del(key_with_partition) + if key in self._key_index: + del self._key_index[key] except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " @@ -237,6 +239,9 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: + if key in self._key_index: + return True + event = current_event() partition_from_message = event is not None and not self.table.is_global if partition_from_message: From c114455f286e9d9c6bb865f28ab5076ba43f3879 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Sep 2022 13:07:01 +0200 Subject: [PATCH 069/616] maybe faster get requests (should reduce get time by factor 20) --- faust/stores/bigtable.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f38b41d3a..b5ebf3949 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -127,15 +127,30 @@ def _bigtbale_del(self, key: bytes): row.commit() def _get(self, key: bytes) -> Optional[bytes]: + event = current_event() + partition_from_message = ( + event is not None + and not self.table.is_global + and not self.table.use_partitioner + ) try: - for partition in self._partitions_for_key(key): + if partition_from_message: key_with_partition = self._get_key_with_partition( - key, partition=partition + key, partition=event.message.partition ) value = self._bigtbale_get(key_with_partition) if value is not None: - self._key_index[key] = partition + self._key_index[key] = event.message.partition return value + else: + for partition in self._partitions_for_key(key): + key_with_partition = self._get_key_with_partition( + key, partition=partition + ) + value = self._bigtbale_get(key_with_partition) + if value is not None: + self._key_index[key] = partition + return value raise KeyError except KeyError as ke: self.log.error(f"KeyError in get for table {self.table_name} for {key=}") From 881faf1dd2a0742f391716fd1d9c55fb392c6c90 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Sep 2022 13:17:03 +0200 Subject: [PATCH 070/616] refactored get partition --- faust/stores/bigtable.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b5ebf3949..808e61d63 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -92,11 +92,8 @@ def _bigtable_setup_table(self): def _bigtable_exrtact_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value - def _get_key_with_partition(self, key: bytes, partition: Optional[int] = None): - if partition is not None: - partition_prefix = partition.to_bytes(1, "little") - else: - partition_prefix = get_current_partition().to_bytes(1, "little") + def _get_key_with_partition(self, key: bytes, partition): + partition_prefix = partition.to_bytes(1, "little") key = b"".join([partition_prefix, key]) return key @@ -126,21 +123,27 @@ def _bigtbale_del(self, key: bytes): row.delete() row.commit() - def _get(self, key: bytes) -> Optional[bytes]: + def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() - partition_from_message = ( + if ( event is not None and not self.table.is_global and not self.table.use_partitioner - ) + ): + return event.message.partition + else: + return None + + def _get(self, key: bytes) -> Optional[bytes]: try: - if partition_from_message: + partition = self._maybe_get_partition_from_message() + if partition is not None: key_with_partition = self._get_key_with_partition( - key, partition=event.message.partition + key, partition=partition ) value = self._bigtbale_get(key_with_partition) if value is not None: - self._key_index[key] = event.message.partition + self._key_index[key] = partition return value else: for partition in self._partitions_for_key(key): @@ -257,11 +260,10 @@ def _contains(self, key: bytes) -> bool: if key in self._key_index: return True - event = current_event() - partition_from_message = event is not None and not self.table.is_global - if partition_from_message: + partition = self._maybe_get_partition_from_message() + if partition is not None: key = self._get_key_with_partition( - key, partition=event.message.partition + key, partition=partition, ) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: From bed7ee7ddcb25382c247e9c720373e4f922c7fa0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Sep 2022 14:23:49 +0200 Subject: [PATCH 071/616] test_contains cache --- faust/stores/bigtable.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 808e61d63..6c86ddaf1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -45,6 +45,9 @@ def __init__( options: typing.Dict[str, Any], **kwargs: Any, ) -> None: + + # if we ask if a key exist, we may need it soon. + self._contains_cache = LRUCache(limit=10000) self._key_index = LRUCache(limit=app.conf.table_key_index_size) table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name @@ -92,11 +95,6 @@ def _bigtable_setup_table(self): def _bigtable_exrtact_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value - def _get_key_with_partition(self, key: bytes, partition): - partition_prefix = partition.to_bytes(1, "little") - key = b"".join([partition_prefix, key]) - return key - def _bigtbale_get(self, key: bytes): res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: @@ -134,7 +132,16 @@ def _maybe_get_partition_from_message(self) -> Optional[int]: else: return None + def _get_key_with_partition(self, key: bytes, partition): + partition_prefix = partition.to_bytes(1, "little") + key = b"".join([partition_prefix, key]) + return key + def _get(self, key: bytes) -> Optional[bytes]: + if key in self._contains_cache: + val = self._contains_cache[key] + return val + try: partition = self._maybe_get_partition_from_message() if partition is not None: @@ -170,6 +177,8 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: key_with_partition = self._get_key_with_partition(key, partition=partition) self._bigtbale_set(key_with_partition, value) self._key_index[key] = partition + if key in self._contains_cache: + del self._contains_cache[key] except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -186,6 +195,8 @@ def _del(self, key: bytes) -> None: self._bigtbale_del(key_with_partition) if key in self._key_index: del self._key_index[key] + if key in self._contains_cache: + del self._contains_cache[key] except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " @@ -267,6 +278,7 @@ def _contains(self, key: bytes) -> bool: ) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: + self._contains_cache[key] = self._bigtable_exrtact_row_data(res) return True else: for partition in self._partitions_for_key(key): From 574bd0f8f3308ba2ff648f046eec1ce1791b25f0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 10:08:37 +0200 Subject: [PATCH 072/616] added value caching and refactored init --- faust/stores/bigtable.py | 142 +++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 43 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6c86ddaf1..4ce2eeed7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,7 +1,6 @@ """BigTable storage.""" import logging -import typing -from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union from mode.utils.collections import LRUCache @@ -24,6 +23,30 @@ def get_current_partition(): return event.message.partition +class BigtableStartupCache(dict): + """ + This is a dictionary which is only filled once, after that, every + successful access to a key, will remove it. + """ + + def __getitem__(self, key): + value = super().__getitem__(key) + del self[key] + return value + + def __setitem__(self, key, _) -> None: + if key in self.keys(): + del self[key] + + def __delitem__(self, key): + if key in self.keys(): + super().__delitem__(key) + + def fill(self, iter: Iterator[Tuple[bytes, bytes]]) -> None: + for k, v in iter: + super().__setitem__(k, v) + + class BigTableStore(base.SerializedStore): """Bigtable table storage.""" @@ -32,49 +55,81 @@ class BigTableStore(base.SerializedStore): bt_table: Table _key_index: LRUCache[bytes, int] + _cache: Optional[Union[LRUCache[bytes, bytes], Dict[bytes, bytes]]] PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" + VALUE_CACHE_TYPE_KEY = "value_cache_type_key" + VALUE_CACHE_SIZE_KEY = "value_cache_size_key" + BT_COLUMN_NAME_KEY = "bt_column_name_key" + BT_ROW_FILTER_KEY = "bt_row_filter_key" + BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" def __init__( self, url: Union[str, URL], app: AppT, table: CollectionT, - options: typing.Dict[str, Any], + options: Dict[str, Any], **kwargs: Any, ) -> None: - - # if we ask if a key exist, we may need it soon. - self._contains_cache = LRUCache(limit=10000) - self._key_index = LRUCache(limit=app.conf.table_key_index_size) - table_name_generator = options.get( - BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name - ) - self.bt_table_name = table_name_generator(table) - - self.bt_start_key, self.bt_end_key = options.get( - BigTableStore.READ_ROWS_BORDERS_KEY, [b"", b""] - ) + self._set_options(options) try: - self.client: Client = Client( - options.get(BigTableStore.PROJECT_KEY), - admin=True, - ) - self.instance: Instance = self.client.instance( - options.get(BigTableStore.INSTANCE_KEY) - ) - self.row_filter = CellsColumnLimitFilter(1) - self.column_name = "DATA" - self._bigtable_setup_table() + self._bigtable_setup(table, options) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex - self.offset_key_prefix = "offset_partitiion:" super().__init__(url, app, table, **kwargs) - def _bigtable_setup_table(self): + def _set_options(self, options) -> None: + self.table_name_generator = options.get( + BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name + ) + self.bt_start_key, self.bt_end_key = options.get( + BigTableStore.READ_ROWS_BORDERS_KEY, [b"", b""] + ) + self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) + self.value_cache_size = options.get( + BigTableStore.VALUE_CACHE_SIZE_KEY, self.app.conf.table_key_index_size + ) + self.column_name = options.get(BigTableStore.BT_COLUMN_NAME_KEY, "DATA") + self.row_filter = options.get( + BigTableStore.BT_ROW_FILTER_KEY, CellsColumnLimitFilter(1) + ) + self.offset_key_prefix = options.get( + BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" + ) + + def _setup_value_cache(self) -> None: + if self.value_cache_type == "startup": + self.log.info("Setting up BigtableStartupCache") + self._cache = BigtableStartupCache() + self._cache.fill(self._iteritems()) + self.log.info("Finished setup of BigtableStartupCache") + elif self.value_cache_type == "forever": + self._cache = LRUCache(limit=self.value_cache_size) + elif self.value_cache_type is None: + self._cache = None + else: + raise NotImplemented(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") + + async def on_recovery_completed( + self, active_tps: Set[TP], standby_tps: Set[TP] + ) -> None: + self._setup_value_cache() + self._key_index = LRUCache(limit=self.app.conf.table_key_index_size) + return await super().on_recovery_completed(active_tps, standby_tps) + + def _bigtable_setup(self, table, options: Dict[str, Any]): + self.bt_table_name = self.table_name_generator(table) + self.client: Client = Client( + options.get(BigTableStore.PROJECT_KEY), + admin=True, + ) + self.instance: Instance = self.client.instance( + options.get(BigTableStore.INSTANCE_KEY) + ) self.bt_table: Table = self.instance.table(self.bt_table_name) self.column_family_id = "FaustColumnFamily" if not self.bt_table.exists(): @@ -110,12 +165,6 @@ def _bigtbale_set(self, key: bytes, value: Optional[bytes]): ) row.commit() - def _partitions_for_key(self, key: bytes) -> Iterable[int]: - try: - return [self._key_index[key]] - except KeyError: - return range(self.app.conf.topic_partitions) - def _bigtbale_del(self, key: bytes): row = self.bt_table.direct_row(key) row.delete() @@ -137,11 +186,16 @@ def _get_key_with_partition(self, key: bytes, partition): key = b"".join([partition_prefix, key]) return key - def _get(self, key: bytes) -> Optional[bytes]: - if key in self._contains_cache: - val = self._contains_cache[key] - return val + def _partitions_for_key(self, key: bytes) -> Iterable[int]: + try: + return [self._key_index[key]] + except KeyError: + return range(self.app.conf.topic_partitions) + def _get(self, key: bytes) -> Optional[bytes]: + if self._cache is not None: + if key in self._cache.keys(): + return self._cache[key] try: partition = self._maybe_get_partition_from_message() if partition is not None: @@ -176,9 +230,9 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: partition = get_current_partition() key_with_partition = self._get_key_with_partition(key, partition=partition) self._bigtbale_set(key_with_partition, value) + if self._cache is not None: + self._cache[key] = value self._key_index[key] = partition - if key in self._contains_cache: - del self._contains_cache[key] except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -193,10 +247,12 @@ def _del(self, key: bytes) -> None: key, partition=partition ) self._bigtbale_del(key_with_partition) + + if self._cache is not None: + if key in self._cache: + del self._cache[key] if key in self._key_index: del self._key_index[key] - if key in self._contains_cache: - del self._contains_cache[key] except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " @@ -274,11 +330,11 @@ def _contains(self, key: bytes) -> bool: partition = self._maybe_get_partition_from_message() if partition is not None: key = self._get_key_with_partition( - key, partition=partition, + key, + partition=partition, ) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: - self._contains_cache[key] = self._bigtable_exrtact_row_data(res) return True else: for partition in self._partitions_for_key(key): From 168834217a5efbcc4112dc7f60b79ee8311bc56a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 10:56:10 +0200 Subject: [PATCH 073/616] added app to set options --- faust/stores/bigtable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4ce2eeed7..9b0526ec0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -74,7 +74,7 @@ def __init__( options: Dict[str, Any], **kwargs: Any, ) -> None: - self._set_options(options) + self._set_options(app, options) try: self._bigtable_setup(table, options) except Exception as ex: @@ -82,7 +82,7 @@ def __init__( raise ex super().__init__(url, app, table, **kwargs) - def _set_options(self, options) -> None: + def _set_options(self, app, options) -> None: self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) @@ -91,7 +91,7 @@ def _set_options(self, options) -> None: ) self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) self.value_cache_size = options.get( - BigTableStore.VALUE_CACHE_SIZE_KEY, self.app.conf.table_key_index_size + BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size ) self.column_name = options.get(BigTableStore.BT_COLUMN_NAME_KEY, "DATA") self.row_filter = options.get( From c3ca5947804812f94311eceaeeaac48ada36931d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 11:32:03 +0200 Subject: [PATCH 074/616] moved init of key_cache out of on_recover_finished --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9b0526ec0..b61141793 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -80,6 +80,8 @@ def __init__( except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex + self._key_index = LRUCache(limit=app.conf.table_key_index_size) + self._cache = None super().__init__(url, app, table, **kwargs) def _set_options(self, app, options) -> None: @@ -118,7 +120,6 @@ async def on_recovery_completed( self, active_tps: Set[TP], standby_tps: Set[TP] ) -> None: self._setup_value_cache() - self._key_index = LRUCache(limit=self.app.conf.table_key_index_size) return await super().on_recovery_completed(active_tps, standby_tps) def _bigtable_setup(self, table, options: Dict[str, Any]): From 95faa012d82eb81ff55a372f879b3f4d47b116bd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 11:34:58 +0200 Subject: [PATCH 075/616] removed on_recover_finished --- faust/stores/bigtable.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b61141793..e7a9f4ba3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -81,7 +81,7 @@ def __init__( logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex self._key_index = LRUCache(limit=app.conf.table_key_index_size) - self._cache = None + self._setup_value_cache() super().__init__(url, app, table, **kwargs) def _set_options(self, app, options) -> None: @@ -116,12 +116,6 @@ def _setup_value_cache(self) -> None: else: raise NotImplemented(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") - async def on_recovery_completed( - self, active_tps: Set[TP], standby_tps: Set[TP] - ) -> None: - self._setup_value_cache() - return await super().on_recovery_completed(active_tps, standby_tps) - def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) self.client: Client = Client( From 5e2a8e0d8806efce9997eca8da8966841f4ddc7d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 11:37:52 +0200 Subject: [PATCH 076/616] moved setup of value cache after super init --- faust/stores/bigtable.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e7a9f4ba3..f6fac721c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -2,18 +2,16 @@ import logging from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union -from mode.utils.collections import LRUCache - -from faust.streams import current_event - from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.table import Table +from mode.utils.collections import LRUCache from yarl import URL from faust.stores import base +from faust.streams import current_event from faust.types import TP, AppT, CollectionT, EventT @@ -75,14 +73,14 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(app, options) + self._key_index = LRUCache(limit=app.conf.table_key_index_size) try: self._bigtable_setup(table, options) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex - self._key_index = LRUCache(limit=app.conf.table_key_index_size) - self._setup_value_cache() super().__init__(url, app, table, **kwargs) + self._setup_value_cache() def _set_options(self, app, options) -> None: self.table_name_generator = options.get( @@ -114,7 +112,7 @@ def _setup_value_cache(self) -> None: elif self.value_cache_type is None: self._cache = None else: - raise NotImplemented(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") + raise NotImplementedError(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) From bdf3a06ee07adcba6e87d87a2a71a6116a53d93f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 11:58:44 +0200 Subject: [PATCH 077/616] added logging for cache --- faust/stores/bigtable.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f6fac721c..0e794d628 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -80,7 +80,12 @@ def __init__( logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) + + async def on_recovery_completed( + self, active_tps: Set[TP], standby_tps: Set[TP] + ) -> None: self._setup_value_cache() + return await super().on_recovery_completed(active_tps, standby_tps) def _set_options(self, app, options) -> None: self.table_name_generator = options.get( @@ -105,8 +110,12 @@ def _setup_value_cache(self) -> None: if self.value_cache_type == "startup": self.log.info("Setting up BigtableStartupCache") self._cache = BigtableStartupCache() + self.log.info("Start filling satrtup cache") self._cache.fill(self._iteritems()) - self.log.info("Finished setup of BigtableStartupCache") + self.log.info( + "Finished setup of BigtableStartupCache. " + f"Has {len(self._cache)} entries" + ) elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) elif self.value_cache_type is None: @@ -188,6 +197,10 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: if self._cache is not None: if key in self._cache.keys(): + self.log.info( + f"Took value from {key=} from cache, " + f"cachesize={len(self._cache)}" + ) return self._cache[key] try: partition = self._maybe_get_partition_from_message() From 9b36957fe21ce8ec9ab1f177af82ac5023ad9f79 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 12:13:11 +0200 Subject: [PATCH 078/616] move dsetup of cache to end of init --- faust/stores/bigtable.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0e794d628..dec7232a1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -80,12 +80,7 @@ def __init__( logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) - - async def on_recovery_completed( - self, active_tps: Set[TP], standby_tps: Set[TP] - ) -> None: self._setup_value_cache() - return await super().on_recovery_completed(active_tps, standby_tps) def _set_options(self, app, options) -> None: self.table_name_generator = options.get( From 8b2d9477d6924f48503ebe7b0a9f0b48b0a33bd7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 12:43:29 +0200 Subject: [PATCH 079/616] correct naming for read rows option --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index dec7232a1..4010b108e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -57,7 +57,7 @@ class BigTableStore(base.SerializedStore): PROJECT_KEY = "project_key" INSTANCE_KEY = "instance_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" - READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" + BT_READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" VALUE_CACHE_TYPE_KEY = "value_cache_type_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" BT_COLUMN_NAME_KEY = "bt_column_name_key" @@ -87,7 +87,7 @@ def _set_options(self, app, options) -> None: BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) self.bt_start_key, self.bt_end_key = options.get( - BigTableStore.READ_ROWS_BORDERS_KEY, [b"", b""] + BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) self.value_cache_size = options.get( From b2d0674b8b9e91bd07113de92625afb5d9fc7fd8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 13:23:01 +0200 Subject: [PATCH 080/616] better delete for bigtable-cache --- faust/stores/bigtable.py | 46 +++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4010b108e..fcdaa9dcf 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -21,28 +21,31 @@ def get_current_partition(): return event.message.partition -class BigtableStartupCache(dict): +class BigtableStartupCache(): """ This is a dictionary which is only filled once, after that, every successful access to a key, will remove it. """ + data: Dict = {} + + def __len__(self): + return len(self.data) def __getitem__(self, key): - value = super().__getitem__(key) - del self[key] + value = self.data.pop(key) return value def __setitem__(self, key, _) -> None: - if key in self.keys(): - del self[key] + if key in self.data.keys(): + self.data.pop(key, None) def __delitem__(self, key): - if key in self.keys(): - super().__delitem__(key) + if key in self.data.keys(): + self.data.pop(key, None) def fill(self, iter: Iterator[Tuple[bytes, bytes]]) -> None: for k, v in iter: - super().__setitem__(k, v) + self.data[k] = v class BigTableStore(base.SerializedStore): @@ -54,14 +57,14 @@ class BigTableStore(base.SerializedStore): _key_index: LRUCache[bytes, int] _cache: Optional[Union[LRUCache[bytes, bytes], Dict[bytes, bytes]]] - PROJECT_KEY = "project_key" - INSTANCE_KEY = "instance_key" - BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" - BT_READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" VALUE_CACHE_TYPE_KEY = "value_cache_type_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" + BT_PROJECT_KEY = "bt_project_key" + BT_INSTANCE_KEY = "bt_instance_key" + BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" + BT_READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" BT_COLUMN_NAME_KEY = "bt_column_name_key" - BT_ROW_FILTER_KEY = "bt_row_filter_key" + BT_ROW_FILTERS_KEY = "bt_row_filter_key" BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" def __init__( @@ -95,7 +98,7 @@ def _set_options(self, app, options) -> None: ) self.column_name = options.get(BigTableStore.BT_COLUMN_NAME_KEY, "DATA") self.row_filter = options.get( - BigTableStore.BT_ROW_FILTER_KEY, CellsColumnLimitFilter(1) + BigTableStore.BT_ROW_FILTERS_KEY, CellsColumnLimitFilter(1) ) self.offset_key_prefix = options.get( BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" @@ -105,11 +108,12 @@ def _setup_value_cache(self) -> None: if self.value_cache_type == "startup": self.log.info("Setting up BigtableStartupCache") self._cache = BigtableStartupCache() - self.log.info("Start filling satrtup cache") + self.log.error("Start filling satrtup cache") self._cache.fill(self._iteritems()) - self.log.info( + self.log.error( "Finished setup of BigtableStartupCache. " - f"Has {len(self._cache)} entries" + f"Has {len(self._cache)} entries. " + f"First key is {list(self._cache.keys())[0]}" ) elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) @@ -121,11 +125,11 @@ def _setup_value_cache(self) -> None: def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) self.client: Client = Client( - options.get(BigTableStore.PROJECT_KEY), + options.get(BigTableStore.BT_PROJECT_KEY), admin=True, ) self.instance: Instance = self.client.instance( - options.get(BigTableStore.INSTANCE_KEY) + options.get(BigTableStore.BT_INSTANCE_KEY) ) self.bt_table: Table = self.instance.table(self.bt_table_name) self.column_family_id = "FaustColumnFamily" @@ -133,6 +137,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): logging.getLogger(__name__).info( f"BigTableStore: Making new bigtablestore with {self.bt_table_name=}" ) + # TODO: add columns families to options self.bt_table.create( column_families={ self.column_family_id: column_family.MaxVersionsGCRule(1) @@ -140,7 +145,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): ) else: logging.getLogger(__name__).info( - "BigTableStore: Using existing" + "BigTableStore: Using existing " f"bigtablestore with {self.bt_table_name=}" ) @@ -306,6 +311,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: for row in self.bt_table.read_rows( start_key=start_key, end_key=end_key, + filter=self.row_filter, ): yield ( row.row_key[1:], From b9b4a763fccc5c086e22332bfc279774cdbb4b5f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 13:27:11 +0200 Subject: [PATCH 081/616] added keys --- faust/stores/bigtable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fcdaa9dcf..bb1092ad8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,6 +1,6 @@ """BigTable storage.""" import logging -from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -28,6 +28,9 @@ class BigtableStartupCache(): """ data: Dict = {} + def keys(self): + return self.data.keys() + def __len__(self): return len(self.data) From 2f2e7d4211bf4523ed4391e7bfdd38c4f6592117 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 13:50:10 +0200 Subject: [PATCH 082/616] log table name for cache --- faust/stores/bigtable.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bb1092ad8..f1cc75c6b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -114,9 +114,8 @@ def _setup_value_cache(self) -> None: self.log.error("Start filling satrtup cache") self._cache.fill(self._iteritems()) self.log.error( - "Finished setup of BigtableStartupCache. " + f"Finished setup of BigtableStartupCache for {self.table_name}. " f"Has {len(self._cache)} entries. " - f"First key is {list(self._cache.keys())[0]}" ) elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) From 2a09022751cbc1e06339846e2361db1613426ac7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 14:09:19 +0200 Subject: [PATCH 083/616] moved init of cache --- faust/stores/bigtable.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f1cc75c6b..255715e82 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -21,11 +21,12 @@ def get_current_partition(): return event.message.partition -class BigtableStartupCache(): +class BigtableStartupCache: """ This is a dictionary which is only filled once, after that, every successful access to a key, will remove it. """ + data: Dict = {} def keys(self): @@ -86,7 +87,6 @@ def __init__( logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) - self._setup_value_cache() def _set_options(self, app, options) -> None: self.table_name_generator = options.get( @@ -95,6 +95,7 @@ def _set_options(self, app, options) -> None: self.bt_start_key, self.bt_end_key = options.get( BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) + self._cache_setup_done = False self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) self.value_cache_size = options.get( BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size @@ -119,10 +120,9 @@ def _setup_value_cache(self) -> None: ) elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) - elif self.value_cache_type is None: - self._cache = None else: raise NotImplementedError(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") + self._cache_setup_done = True def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) @@ -197,10 +197,15 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: return range(self.app.conf.topic_partitions) def _get(self, key: bytes) -> Optional[bytes]: + # If the cache was not yet initialised, we want to do it here. + # This function will immediately abort if no cache is set + if not self._cache_setup_done: + self._setup_value_cache() + if self._cache is not None: if key in self._cache.keys(): self.log.info( - f"Took value from {key=} from cache, " + f"Took value with {key=} from cache of {self.table_name}, " f"cachesize={len(self._cache)}" ) return self._cache[key] From 631eba7157d3391ec1a21767aa3e7677956fe4cb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 14:11:19 +0200 Subject: [PATCH 084/616] remove read rtows --- faust/stores/bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 255715e82..38e5d4706 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -318,7 +318,6 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: for row in self.bt_table.read_rows( start_key=start_key, end_key=end_key, - filter=self.row_filter, ): yield ( row.row_key[1:], From 652db834d265d57f0d5228c8ebbe4190785b687d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 14:34:18 +0200 Subject: [PATCH 085/616] set cache to none --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 38e5d4706..a7f633412 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -81,6 +81,7 @@ def __init__( ) -> None: self._set_options(app, options) self._key_index = LRUCache(limit=app.conf.table_key_index_size) + self._cache = None try: self._bigtable_setup(table, options) except Exception as ex: From 6e4740eb06f8b3a1496532e24236ce0927d8e0a8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 16:19:17 +0200 Subject: [PATCH 086/616] fixed get in bigtable --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a7f633412..0b9004dd0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -229,7 +229,8 @@ def _get(self, key: bytes) -> Optional[bytes]: if value is not None: self._key_index[key] = partition return value - raise KeyError + # No key was found + return None except KeyError as ke: self.log.error(f"KeyError in get for table {self.table_name} for {key=}") raise ke From 2cd10e04adf75ba8c847ec920b3c1e88c7e563c9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 16:33:00 +0200 Subject: [PATCH 087/616] Make error logs in cache infos --- faust/stores/bigtable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0b9004dd0..53f8dcb39 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -111,11 +111,10 @@ def _set_options(self, app, options) -> None: def _setup_value_cache(self) -> None: if self.value_cache_type == "startup": - self.log.info("Setting up BigtableStartupCache") self._cache = BigtableStartupCache() - self.log.error("Start filling satrtup cache") + self.log.info("Start filling satrtup cache") self._cache.fill(self._iteritems()) - self.log.error( + self.log.info( f"Finished setup of BigtableStartupCache for {self.table_name}. " f"Has {len(self._cache)} entries. " ) From 6daa2666628b3bcd0ea80ac3aadcb6254bcae1eb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 26 Sep 2022 16:44:49 +0200 Subject: [PATCH 088/616] removed log in get which was triggered on every request --- faust/stores/bigtable.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 53f8dcb39..e15ebd6e2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -204,10 +204,6 @@ def _get(self, key: bytes) -> Optional[bytes]: if self._cache is not None: if key in self._cache.keys(): - self.log.info( - f"Took value with {key=} from cache of {self.table_name}, " - f"cachesize={len(self._cache)}" - ) return self._cache[key] try: partition = self._maybe_get_partition_from_message() From 69c18bfcd4bc5b6c3a52f02c446fbb0546564563 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 10:40:41 +0200 Subject: [PATCH 089/616] added better logging --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e15ebd6e2..82b2651fa 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -112,10 +112,10 @@ def _set_options(self, app, options) -> None: def _setup_value_cache(self) -> None: if self.value_cache_type == "startup": self._cache = BigtableStartupCache() - self.log.info("Start filling satrtup cache") + self.log.info(f"Start filling satrtup cache for {self.table_name}") self._cache.fill(self._iteritems()) self.log.info( - f"Finished setup of BigtableStartupCache for {self.table_name}. " + f"Finished setup of BigtableStartupCache for {self.table_name} " f"Has {len(self._cache)} entries. " ) elif self.value_cache_type == "forever": From f7b16c82f0824bc614ca0d4b5dbbfd915b7fb691 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 12:15:36 +0200 Subject: [PATCH 090/616] use _cache in _contains function --- faust/stores/bigtable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 82b2651fa..ecf5f187b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -334,8 +334,9 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - if key in self._key_index: - return True + if self._cache is not None: + if key in self._cache.keys(): + return True partition = self._maybe_get_partition_from_message() if partition is not None: From b209d011ab76d924660ad20c094ed46e92f93782 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 13:21:41 +0200 Subject: [PATCH 091/616] added mutation buffer --- faust/stores/bigtable.py | 112 +++++++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ecf5f187b..b67d8a032 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,10 +1,23 @@ """BigTable storage.""" import logging -from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Tuple, Union +from collections import defaultdict +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Set, + Tuple, + Union, +) from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance +from google.cloud.bigtable.row import DirectRow from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.table import Table from mode.utils.collections import LRUCache @@ -21,6 +34,29 @@ def get_current_partition(): return event.message.partition +class BigtableMutationBuffer: + rows: Dict[int, Dict[bytes, Tuple[DirectRow, Optional[bytes]]]] + mutation_limit: int + + def __init__(self, bigtable_table: Table, mutation_limit: int) -> None: + self.mutation_limit = mutation_limit + self.bigtable_table = bigtable_table + self.rows = defaultdict(dict) + + def full(self, partition: int) -> bool: + return len(self.rows[partition]) > self.mutation_limit + + def submit( + self, row: DirectRow, partition: int, value: Optional[bytes] = None + ): + self.rows[partition][row.row_key] = row, value + + def flush(self, partition): + mutations = list(zip(*self.rows[partition].values()))[0] + self.bigtable_table.mutate_rows(mutations) + self.rows[partition].clear() + + class BigtableStartupCache: """ This is a dictionary which is only filled once, after that, every @@ -61,6 +97,7 @@ class BigTableStore(base.SerializedStore): _key_index: LRUCache[bytes, int] _cache: Optional[Union[LRUCache[bytes, bytes], Dict[bytes, bytes]]] + _mutation_buffer: Optional[BigtableMutationBuffer] VALUE_CACHE_TYPE_KEY = "value_cache_type_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" BT_PROJECT_KEY = "bt_project_key" @@ -70,6 +107,8 @@ class BigTableStore(base.SerializedStore): BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_ROW_FILTERS_KEY = "bt_row_filter_key" BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" + BT_MUTATION_BUFFER_LIMIT_KEY = "bt_mutation_buffer_limit_key" + BT_ENABLE_MUTATION_BUFER_KEY = "bt_enable_mutation_bufer_key" def __init__( self, @@ -82,8 +121,10 @@ def __init__( self._set_options(app, options) self._key_index = LRUCache(limit=app.conf.table_key_index_size) self._cache = None + self._mutation_buffer = None try: self._bigtable_setup(table, options) + self._setup_mutation_buffer(options) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -97,11 +138,18 @@ def _set_options(self, app, options) -> None: BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) self._cache_setup_done = False - self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) + self.value_cache_type = options.get( + BigTableStore.VALUE_CACHE_TYPE_KEY, None + ) self.value_cache_size = options.get( BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size ) - self.column_name = options.get(BigTableStore.BT_COLUMN_NAME_KEY, "DATA") + self.mutation_buffer_enabled = options.get( + BigTableStore.BT_ENABLE_MUTATION_BUFER_KEY, False + ) + self.column_name = options.get( + BigTableStore.BT_COLUMN_NAME_KEY, "DATA" + ) self.row_filter = options.get( BigTableStore.BT_ROW_FILTERS_KEY, CellsColumnLimitFilter(1) ) @@ -109,6 +157,15 @@ def _set_options(self, app, options) -> None: BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" ) + def _setup_mutation_buffer(self, options) -> None: + if self.mutation_buffer_enabled: + limit = options.get( + BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 + ) + self._mutattion_buffer = BigtableMutationBuffer( + self.bt_table, limit + ) + def _setup_value_cache(self) -> None: if self.value_cache_type == "startup": self._cache = BigtableStartupCache() @@ -121,7 +178,9 @@ def _setup_value_cache(self) -> None: elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) else: - raise NotImplementedError(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") + raise NotImplementedError( + f"VALUE_CACHE_TYPE '{self.value_cache_type}'" + ) self._cache_setup_done = True def _bigtable_setup(self, table, options: Dict[str, Any]): @@ -158,6 +217,10 @@ def _bigtbale_get(self, key: bytes): res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: return None + if self.mutation_buffer_enabled: + partition = int.from_bytes(key[0], "little") + if key in self._mutation_buffer.rows[partition].keys(): + return self._mutation_buffer.rows[partition][key][1] return self._bigtable_exrtact_row_data(res) def _bigtbale_set(self, key: bytes, value: Optional[bytes]): @@ -167,12 +230,20 @@ def _bigtbale_set(self, key: bytes, value: Optional[bytes]): self.column_name, value, ) - row.commit() + if self.mutation_buffer_enabled: + partition = int.from_bytes(key[0], "little") + self._mutation_buffer.submit(row, partition, value) + else: + row.commit() def _bigtbale_del(self, key: bytes): row = self.bt_table.direct_row(key) row.delete() - row.commit() + if self.mutation_buffer_enabled: + partition = int.from_bytes(key[0], "little") + self._mutation_buffer.submit(row, partition, value) + else: + row.commit() def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -227,7 +298,9 @@ def _get(self, key: bytes) -> Optional[bytes]: # No key was found return None except KeyError as ke: - self.log.error(f"KeyError in get for table {self.table_name} for {key=}") + self.log.error( + f"KeyError in get for table {self.table_name} for {key=}" + ) raise ke except Exception as ex: self.log.error( @@ -238,7 +311,9 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: partition = get_current_partition() - key_with_partition = self._get_key_with_partition(key, partition=partition) + key_with_partition = self._get_key_with_partition( + key, partition=partition + ) self._bigtbale_set(key_with_partition, value) if self._cache is not None: self._cache[key] = value @@ -349,7 +424,9 @@ def _contains(self, key: bytes) -> bool: return True else: for partition in self._partitions_for_key(key): - key = self._get_key_with_partition(key, partition=partition) + key = self._get_key_with_partition( + key, partition=partition + ) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: return True @@ -401,8 +478,14 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: we were not an active replica. """ try: - offset_key = self.get_offset_key(tp).encode() - self._bigtbale_set(offset_key, str(offset).encode()) + if self.mutation_buffer_enabled and not self.recovery_active: + if self._mutation_buffer.full(tp.partition): + self._mutation_buffer.flush(tp.partition) + offset_key = self.get_offset_key(tp).encode() + self._bigtbale_set(offset_key, str(offset).encode()) + else: + offset_key = self.get_offset_key(tp).encode() + self._bigtbale_set(offset_key, str(offset).encode()) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" @@ -410,6 +493,12 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: ) self.app._crash(e) + async def on_recovery_completed( + self, active_tps: Set[TP], standby_tps: Set[TP] + ) -> None: + self.recovery_active = False + return await super().on_recovery_completed(active_tps, standby_tps) + def _persist_changelog_batch(self, row_mutations, tp_offsets): response = self.bt_table.mutate_rows(row_mutations) for i, status in enumerate(response): @@ -434,6 +523,7 @@ def apply_changelog_batch( to_value: A callable you can use to deserialize the value of a changelog event. """ + self.recovery_active = True tp_offsets: Dict[TP, int] = {} row_mutations = [] for event in batch: From 69659fc9df172fe3fa2474f695f101e2e80aac4f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 13:42:24 +0200 Subject: [PATCH 092/616] fixed from byte --- faust/stores/bigtable.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b67d8a032..c9c7c9338 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -218,7 +218,7 @@ def _bigtbale_get(self, key: bytes): if res is None: return None if self.mutation_buffer_enabled: - partition = int.from_bytes(key[0], "little") + partition = key[0] if key in self._mutation_buffer.rows[partition].keys(): return self._mutation_buffer.rows[partition][key][1] return self._bigtable_exrtact_row_data(res) @@ -231,7 +231,7 @@ def _bigtbale_set(self, key: bytes, value: Optional[bytes]): value, ) if self.mutation_buffer_enabled: - partition = int.from_bytes(key[0], "little") + partition = key[0] self._mutation_buffer.submit(row, partition, value) else: row.commit() @@ -240,8 +240,8 @@ def _bigtbale_del(self, key: bytes): row = self.bt_table.direct_row(key) row.delete() if self.mutation_buffer_enabled: - partition = int.from_bytes(key[0], "little") - self._mutation_buffer.submit(row, partition, value) + partition = key[0] + self._mutation_buffer.submit(row, partition, None) else: row.commit() From 213c7bb06dd120990e6a339144d6b5db1ed10277 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 14:06:55 +0200 Subject: [PATCH 093/616] fixed typo --- faust/stores/bigtable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c9c7c9338..685b572c4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -108,7 +108,7 @@ class BigTableStore(base.SerializedStore): BT_ROW_FILTERS_KEY = "bt_row_filter_key" BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_MUTATION_BUFFER_LIMIT_KEY = "bt_mutation_buffer_limit_key" - BT_ENABLE_MUTATION_BUFER_KEY = "bt_enable_mutation_bufer_key" + BT_ENABLE_MUTATION_BUFFER_KEY = "bt_enable_mutation_buffer_key" def __init__( self, @@ -145,7 +145,7 @@ def _set_options(self, app, options) -> None: BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size ) self.mutation_buffer_enabled = options.get( - BigTableStore.BT_ENABLE_MUTATION_BUFER_KEY, False + BigTableStore.BT_ENABLE_MUTATION_BUFFER_KEY, False ) self.column_name = options.get( BigTableStore.BT_COLUMN_NAME_KEY, "DATA" @@ -162,7 +162,7 @@ def _setup_mutation_buffer(self, options) -> None: limit = options.get( BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 ) - self._mutattion_buffer = BigtableMutationBuffer( + self._mutation_buffer = BigtableMutationBuffer( self.bt_table, limit ) From 05a3b8d45221053d8418d942d2e0c10936331981 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 14:24:25 +0200 Subject: [PATCH 094/616] added logging --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 685b572c4..574fd5ac7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -481,6 +481,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: if self.mutation_buffer_enabled and not self.recovery_active: if self._mutation_buffer.full(tp.partition): self._mutation_buffer.flush(tp.partition) + self.log.info(f"Flushed BigtableMutationBuffer partition={tp.partition}") offset_key = self.get_offset_key(tp).encode() self._bigtbale_set(offset_key, str(offset).encode()) else: From 8e3d29c62703ed326b50ac6f8a6ed3df28ad68d5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 14:32:11 +0200 Subject: [PATCH 095/616] removed recover_active flag --- faust/stores/bigtable.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 574fd5ac7..aed337b33 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -469,7 +469,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: return offset return None - def set_persisted_offset(self, tp: TP, offset: int) -> None: + def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: """Set the last persisted offset for this table. This will remember the last offset that we wrote to BigTableStore, @@ -478,7 +478,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: we were not an active replica. """ try: - if self.mutation_buffer_enabled and not self.recovery_active: + if self.mutation_buffer_enabled and not recovery: if self._mutation_buffer.full(tp.partition): self._mutation_buffer.flush(tp.partition) self.log.info(f"Flushed BigtableMutationBuffer partition={tp.partition}") @@ -494,12 +494,6 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: ) self.app._crash(e) - async def on_recovery_completed( - self, active_tps: Set[TP], standby_tps: Set[TP] - ) -> None: - self.recovery_active = False - return await super().on_recovery_completed(active_tps, standby_tps) - def _persist_changelog_batch(self, row_mutations, tp_offsets): response = self.bt_table.mutate_rows(row_mutations) for i, status in enumerate(response): @@ -507,7 +501,7 @@ def _persist_changelog_batch(self, row_mutations, tp_offsets): self.log.error("Row number {} failed to write".format(i)) for tp, offset in tp_offsets.items(): - self.set_persisted_offset(tp, offset) + self.set_persisted_offset(tp, offset, recovery=True) def apply_changelog_batch( self, @@ -524,7 +518,6 @@ def apply_changelog_batch( to_value: A callable you can use to deserialize the value of a changelog event. """ - self.recovery_active = True tp_offsets: Dict[TP, int] = {} row_mutations = [] for event in batch: From 3902b0f343b892947ddba098956b3cf3c0b3d6c1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 15:03:17 +0200 Subject: [PATCH 096/616] utilize mutation buffer --- faust/stores/bigtable.py | 93 +++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index aed337b33..6cb9091ca 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -46,9 +46,7 @@ def __init__(self, bigtable_table: Table, mutation_limit: int) -> None: def full(self, partition: int) -> bool: return len(self.rows[partition]) > self.mutation_limit - def submit( - self, row: DirectRow, partition: int, value: Optional[bytes] = None - ): + def submit(self, row: DirectRow, partition: int, value: Optional[bytes] = None): self.rows[partition][row.row_key] = row, value def flush(self, partition): @@ -138,18 +136,14 @@ def _set_options(self, app, options) -> None: BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) self._cache_setup_done = False - self.value_cache_type = options.get( - BigTableStore.VALUE_CACHE_TYPE_KEY, None - ) + self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) self.value_cache_size = options.get( BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size ) self.mutation_buffer_enabled = options.get( BigTableStore.BT_ENABLE_MUTATION_BUFFER_KEY, False ) - self.column_name = options.get( - BigTableStore.BT_COLUMN_NAME_KEY, "DATA" - ) + self.column_name = options.get(BigTableStore.BT_COLUMN_NAME_KEY, "DATA") self.row_filter = options.get( BigTableStore.BT_ROW_FILTERS_KEY, CellsColumnLimitFilter(1) ) @@ -159,12 +153,8 @@ def _set_options(self, app, options) -> None: def _setup_mutation_buffer(self, options) -> None: if self.mutation_buffer_enabled: - limit = options.get( - BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 - ) - self._mutation_buffer = BigtableMutationBuffer( - self.bt_table, limit - ) + limit = options.get(BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100) + self._mutation_buffer = BigtableMutationBuffer(self.bt_table, limit) def _setup_value_cache(self) -> None: if self.value_cache_type == "startup": @@ -178,9 +168,7 @@ def _setup_value_cache(self) -> None: elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) else: - raise NotImplementedError( - f"VALUE_CACHE_TYPE '{self.value_cache_type}'" - ) + raise NotImplementedError(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") self._cache_setup_done = True def _bigtable_setup(self, table, options: Dict[str, Any]): @@ -214,35 +202,55 @@ def _bigtable_exrtact_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value def _bigtbale_get(self, key: bytes): - res = self.bt_table.read_row(key, filter_=self.row_filter) - if res is None: - return None - if self.mutation_buffer_enabled: - partition = key[0] - if key in self._mutation_buffer.rows[partition].keys(): - return self._mutation_buffer.rows[partition][key][1] - return self._bigtable_exrtact_row_data(res) + partition = key[0] + if ( + self.mutation_buffer_enabled + and key in self._mutation_buffer.rows[partition].keys() + ): + return self._mutation_buffer.rows[partition][key][1] + else: + res = self.bt_table.read_row(key, filter_=self.row_filter) + if res is None: + return None + return self._bigtable_exrtact_row_data(res) def _bigtbale_set(self, key: bytes, value: Optional[bytes]): row = self.bt_table.direct_row(key) - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) if self.mutation_buffer_enabled: + row: DirectRow partition = key[0] + if key in self._mutation_buffer.rows[partition].keys(): + row = self._mutation_buffer.rows[partition][key] + else: + row = self.bt_table.direct_row(key) + row.set_cell( + self.column_family_id, + self.column_name, + value, + ) self._mutation_buffer.submit(row, partition, value) else: + row = self.bt_table.direct_row(key) + row.set_cell( + self.column_family_id, + self.column_name, + value, + ) row.commit() def _bigtbale_del(self, key: bytes): - row = self.bt_table.direct_row(key) - row.delete() if self.mutation_buffer_enabled: + row: DirectRow partition = key[0] + if key in self._mutation_buffer.rows[partition].keys(): + row = self._mutation_buffer.rows[partition][key] + else: + row = self.bt_table.direct_row(key) + row.delete() self._mutation_buffer.submit(row, partition, None) else: + row = self.bt_table.direct_row(key) + row.delete() row.commit() def _maybe_get_partition_from_message(self) -> Optional[int]: @@ -298,9 +306,7 @@ def _get(self, key: bytes) -> Optional[bytes]: # No key was found return None except KeyError as ke: - self.log.error( - f"KeyError in get for table {self.table_name} for {key=}" - ) + self.log.error(f"KeyError in get for table {self.table_name} for {key=}") raise ke except Exception as ex: self.log.error( @@ -311,9 +317,7 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: partition = get_current_partition() - key_with_partition = self._get_key_with_partition( - key, partition=partition - ) + key_with_partition = self._get_key_with_partition(key, partition=partition) self._bigtbale_set(key_with_partition, value) if self._cache is not None: self._cache[key] = value @@ -419,14 +423,15 @@ def _contains(self, key: bytes) -> bool: key, partition=partition, ) + if self.mutation_buffer_enabled: + if key in self._mutation_buffer.rows[partition][key]: + return True res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: return True else: for partition in self._partitions_for_key(key): - key = self._get_key_with_partition( - key, partition=partition - ) + key = self._get_key_with_partition(key, partition=partition) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: return True @@ -481,7 +486,9 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: if self.mutation_buffer_enabled and not recovery: if self._mutation_buffer.full(tp.partition): self._mutation_buffer.flush(tp.partition) - self.log.info(f"Flushed BigtableMutationBuffer partition={tp.partition}") + self.log.info( + f"Flushed BigtableMutationBuffer partition={tp.partition}" + ) offset_key = self.get_offset_key(tp).encode() self._bigtbale_set(offset_key, str(offset).encode()) else: From ec113406bca79af0b3c5c4c58ecdc38d9e7446e1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 15:18:49 +0200 Subject: [PATCH 097/616] fixed wrong set in mutation buffer --- faust/stores/bigtable.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6cb9091ca..0a467d0ae 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -7,9 +7,7 @@ Dict, Iterable, Iterator, - List, Optional, - Set, Tuple, Union, ) @@ -220,7 +218,7 @@ def _bigtbale_set(self, key: bytes, value: Optional[bytes]): row: DirectRow partition = key[0] if key in self._mutation_buffer.rows[partition].keys(): - row = self._mutation_buffer.rows[partition][key] + row = self._mutation_buffer.rows[partition][key][0] else: row = self.bt_table.direct_row(key) row.set_cell( From c3223bffa159028d037b1c1f49fbdfa06cdd6ea9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 15:20:58 +0200 Subject: [PATCH 098/616] fixed delete --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0a467d0ae..ec07287a7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -241,7 +241,7 @@ def _bigtbale_del(self, key: bytes): row: DirectRow partition = key[0] if key in self._mutation_buffer.rows[partition].keys(): - row = self._mutation_buffer.rows[partition][key] + row = self._mutation_buffer.rows[partition][key][0] else: row = self.bt_table.direct_row(key) row.delete() From 60ab2227cc44883171c461744175f09cf16fe64c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 15:23:10 +0200 Subject: [PATCH 099/616] fixed contains --- faust/stores/bigtable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ec07287a7..f66d4f22e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -422,7 +422,10 @@ def _contains(self, key: bytes) -> bool: partition=partition, ) if self.mutation_buffer_enabled: - if key in self._mutation_buffer.rows[partition][key]: + if ( + key in self._mutation_buffer.rows[partition] + and self._mutation_buffer.rows[partition][key] is not None + ): return True res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: From e55db4ff221da7158a4534daacb0396d0aeefcbe Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 15:26:31 +0200 Subject: [PATCH 100/616] fixed contains --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f66d4f22e..8c9ba094c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -424,7 +424,7 @@ def _contains(self, key: bytes) -> bool: if self.mutation_buffer_enabled: if ( key in self._mutation_buffer.rows[partition] - and self._mutation_buffer.rows[partition][key] is not None + and self._mutation_buffer.rows[partition][key][1] is not None ): return True res = self.bt_table.read_row(key, filter_=self.row_filter) From db44632d7ba3fc7ee2548059aea8eb424f7e1d74 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 15:55:53 +0200 Subject: [PATCH 101/616] persist offset immediately --- faust/stores/bigtable.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8c9ba094c..18787f6a5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -212,9 +212,8 @@ def _bigtbale_get(self, key: bytes): return None return self._bigtable_exrtact_row_data(res) - def _bigtbale_set(self, key: bytes, value: Optional[bytes]): - row = self.bt_table.direct_row(key) - if self.mutation_buffer_enabled: + def _bigtbale_set(self, key: bytes, value: Optional[bytes], persist_offset=False): + if self.mutation_buffer_enabled and not persist_offset: row: DirectRow partition = key[0] if key in self._mutation_buffer.rows[partition].keys(): @@ -488,13 +487,14 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: if self._mutation_buffer.full(tp.partition): self._mutation_buffer.flush(tp.partition) self.log.info( - f"Flushed BigtableMutationBuffer partition={tp.partition}" + f"Flushed BigtableMutationBuffer partition={tp.partition} " + f"for table {self.table_name}" ) offset_key = self.get_offset_key(tp).encode() - self._bigtbale_set(offset_key, str(offset).encode()) + self._bigtbale_set(offset_key, str(offset).encode(), persist_offset=True) else: offset_key = self.get_offset_key(tp).encode() - self._bigtbale_set(offset_key, str(offset).encode()) + self._bigtbale_set(offset_key, str(offset).encode(), persist_offset=True) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From fcd8aa08f004c520dd63226861bcb648dcde0d58 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 27 Sep 2022 15:59:31 +0200 Subject: [PATCH 102/616] fixe typos --- faust/stores/bigtable.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 18787f6a5..095bd39b3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -199,7 +199,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): def _bigtable_exrtact_row_data(self, row_data): return list(row_data.to_dict().values())[0][0].value - def _bigtbale_get(self, key: bytes): + def _bigtable_get(self, key: bytes): partition = key[0] if ( self.mutation_buffer_enabled @@ -212,7 +212,7 @@ def _bigtbale_get(self, key: bytes): return None return self._bigtable_exrtact_row_data(res) - def _bigtbale_set(self, key: bytes, value: Optional[bytes], persist_offset=False): + def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False): if self.mutation_buffer_enabled and not persist_offset: row: DirectRow partition = key[0] @@ -235,7 +235,7 @@ def _bigtbale_set(self, key: bytes, value: Optional[bytes], persist_offset=False ) row.commit() - def _bigtbale_del(self, key: bytes): + def _bigtable_del(self, key: bytes): if self.mutation_buffer_enabled: row: DirectRow partition = key[0] @@ -287,7 +287,7 @@ def _get(self, key: bytes) -> Optional[bytes]: key_with_partition = self._get_key_with_partition( key, partition=partition ) - value = self._bigtbale_get(key_with_partition) + value = self._bigtable_get(key_with_partition) if value is not None: self._key_index[key] = partition return value @@ -296,7 +296,7 @@ def _get(self, key: bytes) -> Optional[bytes]: key_with_partition = self._get_key_with_partition( key, partition=partition ) - value = self._bigtbale_get(key_with_partition) + value = self._bigtable_get(key_with_partition) if value is not None: self._key_index[key] = partition return value @@ -315,7 +315,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: try: partition = get_current_partition() key_with_partition = self._get_key_with_partition(key, partition=partition) - self._bigtbale_set(key_with_partition, value) + self._bigtable_set(key_with_partition, value) if self._cache is not None: self._cache[key] = value self._key_index[key] = partition @@ -332,7 +332,7 @@ def _del(self, key: bytes) -> None: key_with_partition = self._get_key_with_partition( key, partition=partition ) - self._bigtbale_del(key_with_partition) + self._bigtable_del(key_with_partition) if self._cache is not None: if key in self._cache: @@ -491,10 +491,10 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: f"for table {self.table_name}" ) offset_key = self.get_offset_key(tp).encode() - self._bigtbale_set(offset_key, str(offset).encode(), persist_offset=True) + self._bigtable_set(offset_key, str(offset).encode(), persist_offset=True) else: offset_key = self.get_offset_key(tp).encode() - self._bigtbale_set(offset_key, str(offset).encode(), persist_offset=True) + self._bigtable_set(offset_key, str(offset).encode(), persist_offset=True) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From bc825e550b41bda5c0068f8809b76e4668f5c874 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 15:06:56 +0200 Subject: [PATCH 103/616] fixed wrong caches --- faust/stores/bigtable.py | 137 +++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 56 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 095bd39b3..933feac98 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -33,24 +33,27 @@ def get_current_partition(): class BigtableMutationBuffer: - rows: Dict[int, Dict[bytes, Tuple[DirectRow, Optional[bytes]]]] + rows: Dict[bytes, Tuple[DirectRow, Optional[bytes]]] mutation_limit: int def __init__(self, bigtable_table: Table, mutation_limit: int) -> None: self.mutation_limit = mutation_limit self.bigtable_table = bigtable_table - self.rows = defaultdict(dict) + self.rows = {} - def full(self, partition: int) -> bool: - return len(self.rows[partition]) > self.mutation_limit + def full(self) -> bool: + return len(self.rows) > self.mutation_limit - def submit(self, row: DirectRow, partition: int, value: Optional[bytes] = None): - self.rows[partition][row.row_key] = row, value + def submit(self, row: DirectRow, value: Optional[bytes] = None): + self.rows[row.row_key] = row, value - def flush(self, partition): - mutations = list(zip(*self.rows[partition].values()))[0] + def flush(self): + mutations = list(zip(*self.rows.values()))[0] self.bigtable_table.mutate_rows(mutations) - self.rows[partition].clear() + self.rows.clear() + + def get(self, key: bytes) -> Tuple[DirectRow, Optional[bytes]]: + return self.rows[key] class BigtableStartupCache: @@ -60,6 +63,7 @@ class BigtableStartupCache: """ data: Dict = {} + _filled_partitions = {} def keys(self): return self.data.keys() @@ -68,20 +72,28 @@ def __len__(self): return len(self.data) def __getitem__(self, key): - value = self.data.pop(key) - return value + return self.data.pop(key, None) def __setitem__(self, key, _) -> None: if key in self.data.keys(): self.data.pop(key, None) def __delitem__(self, key): - if key in self.data.keys(): - self.data.pop(key, None) + self.data.pop(key, None) - def fill(self, iter: Iterator[Tuple[bytes, bytes]]) -> None: - for k, v in iter: - self.data[k] = v + def filled(self, partition): + return self._filled_partitions.get(partition, False) + + def fill(self, table, partition) -> None: + start_key = partition.to_bytes(1, "little") + end_key = (partition + 1).to_bytes(1, "little") + for row in table.read_rows( + start_key=start_key, + end_key=end_key, + ): + row_val = BigTableStore.bigtable_exrtact_row_data(row) + self.data[row.row_key] = row_val + self._filled_partitions[partition] = True class BigTableStore(base.SerializedStore): @@ -133,7 +145,7 @@ def _set_options(self, app, options) -> None: self.bt_start_key, self.bt_end_key = options.get( BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) - self._cache_setup_done = False + self._cache_setup_done = {} self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) self.value_cache_size = options.get( BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size @@ -157,18 +169,39 @@ def _setup_mutation_buffer(self, options) -> None: def _setup_value_cache(self) -> None: if self.value_cache_type == "startup": self._cache = BigtableStartupCache() - self.log.info(f"Start filling satrtup cache for {self.table_name}") - self._cache.fill(self._iteritems()) - self.log.info( - f"Finished setup of BigtableStartupCache for {self.table_name} " - f"Has {len(self._cache)} entries. " - ) elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) else: raise NotImplementedError(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") self._cache_setup_done = True + def _cache_set(self, key: bytes, row: DirectRow, value: Optional[bytes]) -> None: + if self._cache: + self._cache[key] = value + if self.mutation_buffer_enabled: + self._mutation_buffer.submit(row, value) + + def _cache_del(self, key: bytes, row: DirectRow) -> None: + if self.mutation_buffer_enabled: + row = self._mutation_buffer.get(key)[0] + self._mutation_buffer.submit(row, None) + if self._cache: + del self._cache[key] + + def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: + row = None + value = None + if self.mutation_buffer_enabled: + row, value = self._mutation_buffer.get(key) + if self._cache: + if self.value_cache_type == "startup": + partition = key[0] + if not self._cache.filled(partition): + self._cache.fill(self.bt_table, partition) + if key in self._cache: + value = self._cache[key] + return row, value + def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) self.client: Client = Client( @@ -196,36 +229,35 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): f"bigtablestore with {self.bt_table_name=}" ) - def _bigtable_exrtact_row_data(self, row_data): + @staticmethod + def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes): + row, value = self._cache_get(key) partition = key[0] - if ( - self.mutation_buffer_enabled - and key in self._mutation_buffer.rows[partition].keys() - ): - return self._mutation_buffer.rows[partition][key][1] + + if value is not None: + return value + elif row is not None and value is None: + return value else: res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: return None - return self._bigtable_exrtact_row_data(res) + return self.bigtable_exrtact_row_data(res) def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False): - if self.mutation_buffer_enabled and not persist_offset: - row: DirectRow - partition = key[0] - if key in self._mutation_buffer.rows[partition].keys(): - row = self._mutation_buffer.rows[partition][key][0] - else: + if not persist_offset: + row, value = self._cache_get(key) + if row is None: row = self.bt_table.direct_row(key) row.set_cell( self.column_family_id, self.column_name, value, ) - self._mutation_buffer.submit(row, partition, value) + self._cache_set(key, row, value) else: row = self.bt_table.direct_row(key) row.set_cell( @@ -236,17 +268,11 @@ def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False row.commit() def _bigtable_del(self, key: bytes): - if self.mutation_buffer_enabled: - row: DirectRow - partition = key[0] - if key in self._mutation_buffer.rows[partition].keys(): - row = self._mutation_buffer.rows[partition][key][0] - else: - row = self.bt_table.direct_row(key) - row.delete() - self._mutation_buffer.submit(row, partition, None) - else: + row = self._cache_get(key)[0] + if row is None: row = self.bt_table.direct_row(key) + self._cache_del(key, row) + if not self.mutation_buffer_enabled: row.delete() row.commit() @@ -275,12 +301,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: # If the cache was not yet initialised, we want to do it here. # This function will immediately abort if no cache is set - if not self._cache_setup_done: - self._setup_value_cache() - if self._cache is not None: - if key in self._cache.keys(): - return self._cache[key] try: partition = self._maybe_get_partition_from_message() if partition is not None: @@ -394,7 +415,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: ): yield ( row.row_key[1:], - self._bigtable_exrtact_row_data(row), + self.bigtable_exrtact_row_data(row), ) except Exception as ex: self.log.error( @@ -470,7 +491,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: offset_key = self.get_offset_key(tp) row_res = self.bt_table.read_row(offset_key, filter_=self.row_filter) if row_res is not None: - offset = int(self._bigtable_exrtact_row_data(row_res)) + offset = int(self.bigtable_exrtact_row_data(row_res)) return offset return None @@ -491,10 +512,14 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: f"for table {self.table_name}" ) offset_key = self.get_offset_key(tp).encode() - self._bigtable_set(offset_key, str(offset).encode(), persist_offset=True) + self._bigtable_set( + offset_key, str(offset).encode(), persist_offset=True + ) else: offset_key = self.get_offset_key(tp).encode() - self._bigtable_set(offset_key, str(offset).encode(), persist_offset=True) + self._bigtable_set( + offset_key, str(offset).encode(), persist_offset=True + ) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From 5c92f5c43ae29bbbc78efadaca1b05dbc0b82b1d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 15:18:53 +0200 Subject: [PATCH 104/616] fixed contains with caching --- faust/stores/bigtable.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 933feac98..2f632bbfd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,6 +1,5 @@ """BigTable storage.""" import logging -from collections import defaultdict from typing import ( Any, Callable, @@ -104,7 +103,7 @@ class BigTableStore(base.SerializedStore): bt_table: Table _key_index: LRUCache[bytes, int] - _cache: Optional[Union[LRUCache[bytes, bytes], Dict[bytes, bytes]]] + _cache: Optional[Union[LRUCache[bytes, bytes], BigtableStartupCache]] _mutation_buffer: Optional[BigtableMutationBuffer] VALUE_CACHE_TYPE_KEY = "value_cache_type_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" @@ -133,6 +132,7 @@ def __init__( try: self._bigtable_setup(table, options) self._setup_mutation_buffer(options) + self._setup_value_cache() except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -337,8 +337,6 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: partition = get_current_partition() key_with_partition = self._get_key_with_partition(key, partition=partition) self._bigtable_set(key_with_partition, value) - if self._cache is not None: - self._cache[key] = value self._key_index[key] = partition except Exception as ex: self.log.error( @@ -355,9 +353,6 @@ def _del(self, key: bytes) -> None: ) self._bigtable_del(key_with_partition) - if self._cache is not None: - if key in self._cache: - del self._cache[key] if key in self._key_index: del self._key_index[key] except Exception as ex: @@ -431,28 +426,29 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - if self._cache is not None: - if key in self._cache.keys(): - return True - partition = self._maybe_get_partition_from_message() if partition is not None: key = self._get_key_with_partition( key, partition=partition, ) - if self.mutation_buffer_enabled: - if ( - key in self._mutation_buffer.rows[partition] - and self._mutation_buffer.rows[partition][key][1] is not None - ): - return True + row, val = self._cache_get(key) + if row is not None and val is None: + return False + elif val is not None: + return True + res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: return True else: for partition in self._partitions_for_key(key): key = self._get_key_with_partition(key, partition=partition) + row, val = self._cache_get(key) + if row is not None and val is None: + return False + elif val is not None: + return True res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: return True From a20838246aaeb511fc7bcad0ab5ef5889cfc4182 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 15:42:22 +0200 Subject: [PATCH 105/616] fixed get and del for caches --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2f632bbfd..e15bedcbb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -183,7 +183,6 @@ def _cache_set(self, key: bytes, row: DirectRow, value: Optional[bytes]) -> None def _cache_del(self, key: bytes, row: DirectRow) -> None: if self.mutation_buffer_enabled: - row = self._mutation_buffer.get(key)[0] self._mutation_buffer.submit(row, None) if self._cache: del self._cache[key] @@ -192,7 +191,8 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: row = None value = None if self.mutation_buffer_enabled: - row, value = self._mutation_buffer.get(key) + if key in self._mutation_buffer.rows.keys(): + row, value = self._mutation_buffer.get(key) if self._cache: if self.value_cache_type == "startup": partition = key[0] From ec351d8c6a1c8f0c2c92de5202447495fa24a347 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 16:31:24 +0200 Subject: [PATCH 106/616] fixed get request from mutation buffer --- faust/stores/bigtable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e15bedcbb..1b704160c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -52,7 +52,7 @@ def flush(self): self.rows.clear() def get(self, key: bytes) -> Tuple[DirectRow, Optional[bytes]]: - return self.rows[key] + return self.rows.get(key, (None, None)) class BigtableStartupCache: @@ -191,8 +191,7 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: row = None value = None if self.mutation_buffer_enabled: - if key in self._mutation_buffer.rows.keys(): - row, value = self._mutation_buffer.get(key) + row, value = self._mutation_buffer.get(key) if self._cache: if self.value_cache_type == "startup": partition = key[0] From d7b44159a3f2c1376db377cc2557a3b33458bc48 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 16:55:18 +0200 Subject: [PATCH 107/616] added tracebacks --- faust/stores/bigtable.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1b704160c..e79bd6e88 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,5 +1,6 @@ """BigTable storage.""" import logging +import traceback from typing import ( Any, Callable, @@ -51,7 +52,7 @@ def flush(self): self.bigtable_table.mutate_rows(mutations) self.rows.clear() - def get(self, key: bytes) -> Tuple[DirectRow, Optional[bytes]]: + def get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: return self.rows.get(key, (None, None)) @@ -340,7 +341,8 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " - f"table {self.table_name} exception {ex} key {key}" + f"table {self.table_name} exception {ex} key {key} " + f"Traceback: {traceback.format_exc()}" ) raise ex @@ -456,7 +458,8 @@ def _contains(self, key: bytes) -> bool: self.log.error( f"FaustBigtableException Error in _contains for table " f"{self.table_name} exception " - f"{ex} key {key}" + f"{ex} key {key}. " + f"Traceback: {traceback.format_exc()}" ) raise ex From 1f7e6181a7f553ebae828695b7136cd1dd12655e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 16:56:28 +0200 Subject: [PATCH 108/616] call get directly --- faust/stores/bigtable.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e79bd6e88..8070378de 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -52,9 +52,6 @@ def flush(self): self.bigtable_table.mutate_rows(mutations) self.rows.clear() - def get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: - return self.rows.get(key, (None, None)) - class BigtableStartupCache: """ @@ -192,7 +189,7 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: row = None value = None if self.mutation_buffer_enabled: - row, value = self._mutation_buffer.get(key) + row, value = self._mutation_buffer.rows.get(key, tuple(None, None)) if self._cache: if self.value_cache_type == "startup": partition = key[0] From 0bc9ec036ef3082780a0f5c4b7844c5106dd84f9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 17:10:04 +0200 Subject: [PATCH 109/616] fixed get --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8070378de..73c289c31 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -189,7 +189,7 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: row = None value = None if self.mutation_buffer_enabled: - row, value = self._mutation_buffer.rows.get(key, tuple(None, None)) + row, value = self._mutation_buffer.rows.get(key, (None, None)) if self._cache: if self.value_cache_type == "startup": partition = key[0] From 802f0bde5cd2086f6fae34b9fec8295cbbc85c6e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 17:30:06 +0200 Subject: [PATCH 110/616] fixed wrong set in bigtable --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 73c289c31..1b38e17ef 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -246,7 +246,7 @@ def _bigtable_get(self, key: bytes): def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False): if not persist_offset: - row, value = self._cache_get(key) + row, _ = self._cache_get(key) if row is None: row = self.bt_table.direct_row(key) row.set_cell( From d5cad2b0b119225837d6f9faf0efdb1c1104c8fc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 28 Sep 2022 17:47:31 +0200 Subject: [PATCH 111/616] fixed mutation buffer flush --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1b38e17ef..a418cf2b4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -500,8 +500,8 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: """ try: if self.mutation_buffer_enabled and not recovery: - if self._mutation_buffer.full(tp.partition): - self._mutation_buffer.flush(tp.partition) + if self._mutation_buffer.full(): + self._mutation_buffer.flush() self.log.info( f"Flushed BigtableMutationBuffer partition={tp.partition} " f"for table {self.table_name}" From c6c56da51410a1a8916722e49ab93855c145f64c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 09:00:29 +0200 Subject: [PATCH 112/616] better logging --- faust/stores/bigtable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a418cf2b4..23b4ad8bc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -501,10 +501,11 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: try: if self.mutation_buffer_enabled and not recovery: if self._mutation_buffer.full(): + num_mutations = len(self._mutation_buffer.rows) self._mutation_buffer.flush() self.log.info( - f"Flushed BigtableMutationBuffer partition={tp.partition} " - f"for table {self.table_name}" + f"Flushed BigtableMutationBuffer with {num_mutations} " + f"mutations for table {self.table_name}" ) offset_key = self.get_offset_key(tp).encode() self._bigtable_set( From dc7fbb2b5f8b4313fb9f9ad424c1d32ecb81f40e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 10:23:54 +0200 Subject: [PATCH 113/616] added logging --- faust/stores/bigtable.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 23b4ad8bc..f5a41d38c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -196,6 +196,11 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: if not self._cache.filled(partition): self._cache.fill(self.bt_table, partition) if key in self._cache: + self.log.info( + f"Took value from startup cache, " + f"remaining size: {len(self._cache.data)} " + f"for table {self.table_name}:{partition}" + ) value = self._cache[key] return row, value @@ -406,6 +411,14 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: start_key=start_key, end_key=end_key, ): + if self.mutation_buffer_enabled: + # We want to yield the mutation if any us buffered + mut_row, value = self._mutation_buffer.rows.get( + row.row_key, (None, None) + ) + if mut_row is not None and value is not None: + yield (row.row_key[1:], value) + continue yield ( row.row_key[1:], self.bigtable_exrtact_row_data(row), From 30bda4227b6ce7b9df9912a009708b5ec8b710de Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 13:22:11 +0200 Subject: [PATCH 114/616] fixed cache get --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f5a41d38c..51dcb8c67 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -195,7 +195,7 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: partition = key[0] if not self._cache.filled(partition): self._cache.fill(self.bt_table, partition) - if key in self._cache: + if key in self._cache.data(): self.log.info( f"Took value from startup cache, " f"remaining size: {len(self._cache.data)} " From 43c6d3ea0d90d0d0c508e38966d2926e6ebfad15 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 13:24:01 +0200 Subject: [PATCH 115/616] fixed cache get --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 51dcb8c67..1e3dee449 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -195,10 +195,10 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: partition = key[0] if not self._cache.filled(partition): self._cache.fill(self.bt_table, partition) - if key in self._cache.data(): + if key in self._cache.keys(): self.log.info( f"Took value from startup cache, " - f"remaining size: {len(self._cache.data)} " + f"remaining size: {len(self._cache.data)} " # TODO: REMOVE f"for table {self.table_name}:{partition}" ) value = self._cache[key] From 6c42f71eb4f3977a85611a266d5f12e1a9b074e9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 15:03:09 +0200 Subject: [PATCH 116/616] logging for startup cache --- faust/stores/bigtable.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1e3dee449..74237c577 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -62,6 +62,9 @@ class BigtableStartupCache: data: Dict = {} _filled_partitions = {} + def __init__(self) -> None: + self.log = logging.getLogger(self.__class__.__name__) + def keys(self): return self.data.keys() @@ -91,6 +94,10 @@ def fill(self, table, partition) -> None: row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val self._filled_partitions[partition] = True + self.log.info( + f"Filled BigtableStartupCache with {len(self.data)} " + "entries" + ) class BigTableStore(base.SerializedStore): @@ -143,7 +150,6 @@ def _set_options(self, app, options) -> None: self.bt_start_key, self.bt_end_key = options.get( BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) - self._cache_setup_done = {} self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) self.value_cache_size = options.get( BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size @@ -171,10 +177,9 @@ def _setup_value_cache(self) -> None: self._cache = LRUCache(limit=self.value_cache_size) else: raise NotImplementedError(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") - self._cache_setup_done = True - def _cache_set(self, key: bytes, row: DirectRow, value: Optional[bytes]) -> None: - if self._cache: + def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: + if self._cache is not None: self._cache[key] = value if self.mutation_buffer_enabled: self._mutation_buffer.submit(row, value) @@ -198,7 +203,7 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: if key in self._cache.keys(): self.log.info( f"Took value from startup cache, " - f"remaining size: {len(self._cache.data)} " # TODO: REMOVE + f"remaining size: {len(self._cache.data)} " # TODO: REMOVE f"for table {self.table_name}:{partition}" ) value = self._cache[key] @@ -251,7 +256,7 @@ def _bigtable_get(self, key: bytes): def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False): if not persist_offset: - row, _ = self._cache_get(key) + row = self._cache_get(key)[0] if row is None: row = self.bt_table.direct_row(key) row.set_cell( @@ -270,13 +275,14 @@ def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False row.commit() def _bigtable_del(self, key: bytes): - row = self._cache_get(key)[0] - if row is None: - row = self.bt_table.direct_row(key) - self._cache_del(key, row) if not self.mutation_buffer_enabled: row.delete() row.commit() + else: + row = self._cache_get(key)[0] + if row is None: + row = self.bt_table.direct_row(key) + self._cache_del(key, row) def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -516,10 +522,16 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: if self._mutation_buffer.full(): num_mutations = len(self._mutation_buffer.rows) self._mutation_buffer.flush() + self.log.info( f"Flushed BigtableMutationBuffer with {num_mutations} " f"mutations for table {self.table_name}" ) + if self.value_cache_type is "startup": + self.log.info( + "Current size of ValueCache is:" + f"{self._cache.data}" + ) offset_key = self.get_offset_key(tp).encode() self._bigtable_set( offset_key, str(offset).encode(), persist_offset=True From ca11ef2bbba1d2c2fbdf6b6b9d89b2aa35c13083 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 15:03:55 +0200 Subject: [PATCH 117/616] added tablename to log message --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 74237c577..a5d3d61cc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -529,8 +529,8 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: ) if self.value_cache_type is "startup": self.log.info( - "Current size of ValueCache is:" - f"{self._cache.data}" + f"Current size of ValueCache for {self.table_name}" + f" is:{self._cache.data}" ) offset_key = self.get_offset_key(tp).encode() self._bigtable_set( From a027b5adb1e35666b467e55a4c0b7fa85f263a18 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 15:27:28 +0200 Subject: [PATCH 118/616] startup cache logging --- faust/stores/bigtable.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a5d3d61cc..c8fdea5ea 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -42,7 +42,7 @@ def __init__(self, bigtable_table: Table, mutation_limit: int) -> None: self.rows = {} def full(self) -> bool: - return len(self.rows) > self.mutation_limit + return len(self.rows) >= self.mutation_limit def submit(self, row: DirectRow, value: Optional[bytes] = None): self.rows[row.row_key] = row, value @@ -87,17 +87,15 @@ def filled(self, partition): def fill(self, table, partition) -> None: start_key = partition.to_bytes(1, "little") end_key = (partition + 1).to_bytes(1, "little") + self.log.info(f"Will fill BigtableStartupCache with {len(self.data)}...") for row in table.read_rows( start_key=start_key, end_key=end_key, ): row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val + self.log.info(f"Filled BigtableStartupCache with {len(self.data)} entries") self._filled_partitions[partition] = True - self.log.info( - f"Filled BigtableStartupCache with {len(self.data)} " - "entries" - ) class BigTableStore(base.SerializedStore): @@ -242,8 +240,6 @@ def bigtable_exrtact_row_data(row_data): def _bigtable_get(self, key: bytes): row, value = self._cache_get(key) - partition = key[0] - if value is not None: return value elif row is not None and value is None: From 865224b0b348bd280feeb27d1c5048476200497a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 16:31:18 +0200 Subject: [PATCH 119/616] faster mutation buffer --- faust/stores/bigtable.py | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c8fdea5ea..58fea85ed 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -7,6 +7,7 @@ Dict, Iterable, Iterator, + List, Optional, Tuple, Union, @@ -37,10 +38,22 @@ class BigtableMutationBuffer: mutation_limit: int def __init__(self, bigtable_table: Table, mutation_limit: int) -> None: - self.mutation_limit = mutation_limit - self.bigtable_table = bigtable_table + self.mutation_limit: int = mutation_limit + self.bigtable_table: Table = bigtable_table self.rows = {} + def _apply_mutations(self) -> None: + for row, val in self.rows.values(): + if val is None: + row.delete() + else: + column_family = list(self.bigtable_table.list_column_families().keys())[0] + row.set_cell( + column_family, + "DATA", + val, + ) + def full(self) -> bool: return len(self.rows) >= self.mutation_limit @@ -48,8 +61,9 @@ def submit(self, row: DirectRow, value: Optional[bytes] = None): self.rows[row.row_key] = row, value def flush(self): - mutations = list(zip(*self.rows.values()))[0] - self.bigtable_table.mutate_rows(mutations) + rows = list(zip(*self.rows.values()))[0] + self._apply_mutations() + self.bigtable_table.mutate_rows(rows) self.rows.clear() @@ -59,11 +73,11 @@ class BigtableStartupCache: successful access to a key, will remove it. """ - data: Dict = {} - _filled_partitions = {} def __init__(self) -> None: self.log = logging.getLogger(self.__class__.__name__) + self._filled_partitions = {} + self.data: Dict = {} def keys(self): return self.data.keys() @@ -87,14 +101,19 @@ def filled(self, partition): def fill(self, table, partition) -> None: start_key = partition.to_bytes(1, "little") end_key = (partition + 1).to_bytes(1, "little") - self.log.info(f"Will fill BigtableStartupCache with {len(self.data)}...") + self.log.info( + f"Will fill BigtableStartupCache with {len(self.data)}..." + ) for row in table.read_rows( start_key=start_key, end_key=end_key, ): row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val - self.log.info(f"Filled BigtableStartupCache with {len(self.data)} entries") + self.log.info( + f"Filled BigtableStartupCache with {len(self.data)} " + "entries" + ) self._filled_partitions[partition] = True @@ -271,13 +290,12 @@ def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False row.commit() def _bigtable_del(self, key: bytes): + row = self._cache_get(key)[0] + if row is None: + row = self.bt_table.direct_row(key) if not self.mutation_buffer_enabled: row.delete() row.commit() - else: - row = self._cache_get(key)[0] - if row is None: - row = self.bt_table.direct_row(key) self._cache_del(key, row) def _maybe_get_partition_from_message(self) -> Optional[int]: From 090eb6452ec370f2704690e7513efac6faa595ed Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 16:33:04 +0200 Subject: [PATCH 120/616] fill startup cache --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 58fea85ed..36a07217f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -212,7 +212,7 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: value = None if self.mutation_buffer_enabled: row, value = self._mutation_buffer.rows.get(key, (None, None)) - if self._cache: + if self._cache is not None: if self.value_cache_type == "startup": partition = key[0] if not self._cache.filled(partition): @@ -544,7 +544,7 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: if self.value_cache_type is "startup": self.log.info( f"Current size of ValueCache for {self.table_name}" - f" is:{self._cache.data}" + f" is:{len(self._cache.data)}" ) offset_key = self.get_offset_key(tp).encode() self._bigtable_set( From ed60dbafcecae3d109e4f73384bc80a6abd765f5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 29 Sep 2022 16:35:54 +0200 Subject: [PATCH 121/616] better logging --- faust/stores/bigtable.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 36a07217f..4266acf53 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -535,12 +535,13 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: if self.mutation_buffer_enabled and not recovery: if self._mutation_buffer.full(): num_mutations = len(self._mutation_buffer.rows) - self._mutation_buffer.flush() - self.log.info( - f"Flushed BigtableMutationBuffer with {num_mutations} " - f"mutations for table {self.table_name}" + f"Will flush BigtableMutationBuffer with {num_mutations} " + f"mutations for table {self.table_name}..." ) + self._mutation_buffer.flush() + self.log.info("Flushed BigtableMutationBuffer") + if self.value_cache_type is "startup": self.log.info( f"Current size of ValueCache for {self.table_name}" From f7bf0e6d587404d417b4b5b899faa4a1e33dc2e0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 30 Sep 2022 08:41:46 +0200 Subject: [PATCH 122/616] logging --- faust/stores/bigtable.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4266acf53..6877481c2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -7,7 +7,6 @@ Dict, Iterable, Iterator, - List, Optional, Tuple, Union, @@ -47,7 +46,9 @@ def _apply_mutations(self) -> None: if val is None: row.delete() else: - column_family = list(self.bigtable_table.list_column_families().keys())[0] + column_family = list(self.bigtable_table.list_column_families().keys())[ + 0 + ] row.set_cell( column_family, "DATA", @@ -73,7 +74,6 @@ class BigtableStartupCache: successful access to a key, will remove it. """ - def __init__(self) -> None: self.log = logging.getLogger(self.__class__.__name__) self._filled_partitions = {} @@ -101,19 +101,14 @@ def filled(self, partition): def fill(self, table, partition) -> None: start_key = partition.to_bytes(1, "little") end_key = (partition + 1).to_bytes(1, "little") - self.log.info( - f"Will fill BigtableStartupCache with {len(self.data)}..." - ) + self.log.info(f"Will fill BigtableStartupCache with {len(self.data)}...") for row in table.read_rows( start_key=start_key, end_key=end_key, ): row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val - self.log.info( - f"Filled BigtableStartupCache with {len(self.data)} " - "entries" - ) + self.log.info(f"Filled BigtableStartupCache with {len(self.data)} entries") self._filled_partitions[partition] = True @@ -218,11 +213,6 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: if not self._cache.filled(partition): self._cache.fill(self.bt_table, partition) if key in self._cache.keys(): - self.log.info( - f"Took value from startup cache, " - f"remaining size: {len(self._cache.data)} " # TODO: REMOVE - f"for table {self.table_name}:{partition}" - ) value = self._cache[key] return row, value From 1a43170f739fce91609cc457cd35a34580f6e8e2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 30 Sep 2022 09:19:04 +0200 Subject: [PATCH 123/616] add traceback to crash exception --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6877481c2..3198c733c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -549,7 +549,8 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" - " -> will crash faust app!" + " -> will crash faust app! " + f"TRACEBACK: {traceback.format_exc()}" ) self.app._crash(e) From 5156d6f09c2f855758f0cd39551b0e17be4bbb81 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 30 Sep 2022 09:25:03 +0200 Subject: [PATCH 124/616] adjusted apply mutation function --- faust/stores/bigtable.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3198c733c..6597f2f47 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -54,6 +54,7 @@ def _apply_mutations(self) -> None: "DATA", val, ) + self.rows[row.row_key] = (row, val) def full(self) -> bool: return len(self.rows) >= self.mutation_limit @@ -264,11 +265,12 @@ def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False row = self._cache_get(key)[0] if row is None: row = self.bt_table.direct_row(key) - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) + if not self.mutation_buffer_enabled: + row.set_cell( + self.column_family_id, + self.column_name, + value, + ) self._cache_set(key, row, value) else: row = self.bt_table.direct_row(key) From 5734885d1f97eae42fcac67a5a9ac9d4ef5cdbb1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 30 Sep 2022 09:29:05 +0200 Subject: [PATCH 125/616] adjusted flush function --- faust/stores/bigtable.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6597f2f47..59b9633b2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -7,6 +7,7 @@ Dict, Iterable, Iterator, + List, Optional, Tuple, Union, @@ -41,20 +42,20 @@ def __init__(self, bigtable_table: Table, mutation_limit: int) -> None: self.bigtable_table: Table = bigtable_table self.rows = {} - def _apply_mutations(self) -> None: + def flush(self) -> None: + mutated_rows = [] for row, val in self.rows.values(): if val is None: row.delete() else: - column_family = list(self.bigtable_table.list_column_families().keys())[ - 0 - ] row.set_cell( - column_family, + "FaustColumnFamily", "DATA", val, ) - self.rows[row.row_key] = (row, val) + mutated_rows.append(row) + self.bigtable_table.mutate_rows(mutated_rows) + self.rows.clear() def full(self) -> bool: return len(self.rows) >= self.mutation_limit @@ -62,12 +63,6 @@ def full(self) -> bool: def submit(self, row: DirectRow, value: Optional[bytes] = None): self.rows[row.row_key] = row, value - def flush(self): - rows = list(zip(*self.rows.values()))[0] - self._apply_mutations() - self.bigtable_table.mutate_rows(rows) - self.rows.clear() - class BigtableStartupCache: """ From fcaa2bdc8cb49d5423db5453bf67e9c2b231f438 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 30 Sep 2022 10:59:14 +0200 Subject: [PATCH 126/616] removed logs --- faust/stores/bigtable.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 59b9633b2..3a0cb0d93 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -7,7 +7,6 @@ Dict, Iterable, Iterator, - List, Optional, Tuple, Union, @@ -97,14 +96,12 @@ def filled(self, partition): def fill(self, table, partition) -> None: start_key = partition.to_bytes(1, "little") end_key = (partition + 1).to_bytes(1, "little") - self.log.info(f"Will fill BigtableStartupCache with {len(self.data)}...") for row in table.read_rows( start_key=start_key, end_key=end_key, ): row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val - self.log.info(f"Filled BigtableStartupCache with {len(self.data)} entries") self._filled_partitions[partition] = True @@ -208,6 +205,10 @@ def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: partition = key[0] if not self._cache.filled(partition): self._cache.fill(self.bt_table, partition) + self.log.info( + f"Filled BigtableStartupCache for {self.table_name}" + f":{partition}, with {len(self._cache.data)} keys" + ) if key in self._cache.keys(): value = self._cache[key] return row, value @@ -527,9 +528,7 @@ def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: f"mutations for table {self.table_name}..." ) self._mutation_buffer.flush() - self.log.info("Flushed BigtableMutationBuffer") - - if self.value_cache_type is "startup": + if self.value_cache_type == "startup": self.log.info( f"Current size of ValueCache for {self.table_name}" f" is:{len(self._cache.data)}" From 8829defbdfdd0c6bc0bbfc730d97031612168fbe Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 30 Sep 2022 12:30:27 +0200 Subject: [PATCH 127/616] optimization in _contains --- faust/stores/bigtable.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3a0cb0d93..7fc8721e9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -461,6 +461,8 @@ def _contains(self, key: bytes) -> bool: if res is not None: return True else: + self.log.info("Searching all partittions in _contains") + # First we want to check all caches for partition in self._partitions_for_key(key): key = self._get_key_with_partition(key, partition=partition) row, val = self._cache_get(key) @@ -468,6 +470,9 @@ def _contains(self, key: bytes) -> bool: return False elif val is not None: return True + # Now search the real table + for partition in self._partitions_for_key(key): + key = self._get_key_with_partition(key, partition=partition) res = self.bt_table.read_row(key, filter_=self.row_filter) if res is not None: return True From 44288ef33a0bfc406c38359fe4ce7d9e49631497 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Oct 2022 11:47:46 +0200 Subject: [PATCH 128/616] removed _contains method for bigtable --- faust/stores/bigtable.py | 43 ++-------------------------------------- 1 file changed, 2 insertions(+), 41 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7fc8721e9..76a75a783 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -444,47 +444,8 @@ def _size(self) -> int: return 0 def _contains(self, key: bytes) -> bool: - try: - partition = self._maybe_get_partition_from_message() - if partition is not None: - key = self._get_key_with_partition( - key, - partition=partition, - ) - row, val = self._cache_get(key) - if row is not None and val is None: - return False - elif val is not None: - return True - - res = self.bt_table.read_row(key, filter_=self.row_filter) - if res is not None: - return True - else: - self.log.info("Searching all partittions in _contains") - # First we want to check all caches - for partition in self._partitions_for_key(key): - key = self._get_key_with_partition(key, partition=partition) - row, val = self._cache_get(key) - if row is not None and val is None: - return False - elif val is not None: - return True - # Now search the real table - for partition in self._partitions_for_key(key): - key = self._get_key_with_partition(key, partition=partition) - res = self.bt_table.read_row(key, filter_=self.row_filter) - if res is not None: - return True - return False - except Exception as ex: - self.log.error( - f"FaustBigtableException Error in _contains for table " - f"{self.table_name} exception " - f"{ex} key {key}. " - f"Traceback: {traceback.format_exc()}" - ) - raise ex + # NOT IMPLEMENTED FOR BIGTABLE + return False def _clear(self) -> None: """This is typically used to clear data. From e5534e086e04497088dd1358069f756b5395eef9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 11 Oct 2022 15:05:30 +0200 Subject: [PATCH 129/616] logging to find bug, of missing data --- faust/stores/bigtable.py | 49 +++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 76a75a783..a5f505390 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -155,14 +155,18 @@ def _set_options(self, app, options) -> None: self.bt_start_key, self.bt_end_key = options.get( BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) - self.value_cache_type = options.get(BigTableStore.VALUE_CACHE_TYPE_KEY, None) + self.value_cache_type = options.get( + BigTableStore.VALUE_CACHE_TYPE_KEY, None + ) self.value_cache_size = options.get( BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size ) self.mutation_buffer_enabled = options.get( BigTableStore.BT_ENABLE_MUTATION_BUFFER_KEY, False ) - self.column_name = options.get(BigTableStore.BT_COLUMN_NAME_KEY, "DATA") + self.column_name = options.get( + BigTableStore.BT_COLUMN_NAME_KEY, "DATA" + ) self.row_filter = options.get( BigTableStore.BT_ROW_FILTERS_KEY, CellsColumnLimitFilter(1) ) @@ -172,8 +176,12 @@ def _set_options(self, app, options) -> None: def _setup_mutation_buffer(self, options) -> None: if self.mutation_buffer_enabled: - limit = options.get(BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100) - self._mutation_buffer = BigtableMutationBuffer(self.bt_table, limit) + limit = options.get( + BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 + ) + self._mutation_buffer = BigtableMutationBuffer( + self.bt_table, limit + ) def _setup_value_cache(self) -> None: if self.value_cache_type == "startup": @@ -181,7 +189,9 @@ def _setup_value_cache(self) -> None: elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) else: - raise NotImplementedError(f"VALUE_CACHE_TYPE '{self.value_cache_type}'") + raise NotImplementedError( + f"VALUE_CACHE_TYPE '{self.value_cache_type}'" + ) def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: if self._cache is not None: @@ -195,7 +205,9 @@ def _cache_del(self, key: bytes, row: DirectRow) -> None: if self._cache: del self._cache[key] - def _cache_get(self, key: bytes) -> Tuple[Optional[DirectRow], Optional[bytes]]: + def _cache_get( + self, key: bytes + ) -> Tuple[Optional[DirectRow], Optional[bytes]]: row = None value = None if self.mutation_buffer_enabled: @@ -245,6 +257,7 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes): + self.log.info(f"called _bigtable_get with {key=}") row, value = self._cache_get(key) if value is not None: return value @@ -253,10 +266,13 @@ def _bigtable_get(self, key: bytes): else: res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: + self.log.info(f"{key=} not found in {self.table_name}") return None return self.bigtable_exrtact_row_data(res) - def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False): + def _bigtable_set( + self, key: bytes, value: Optional[bytes], persist_offset=False + ): if not persist_offset: row = self._cache_get(key)[0] if row is None: @@ -309,9 +325,6 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: return range(self.app.conf.topic_partitions) def _get(self, key: bytes) -> Optional[bytes]: - # If the cache was not yet initialised, we want to do it here. - # This function will immediately abort if no cache is set - try: partition = self._maybe_get_partition_from_message() if partition is not None: @@ -322,6 +335,9 @@ def _get(self, key: bytes) -> Optional[bytes]: if value is not None: self._key_index[key] = partition return value + self.log.warning( + f"{key=} not found in {self.table_name} on {partition=}" + ) else: for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( @@ -332,9 +348,12 @@ def _get(self, key: bytes) -> Optional[bytes]: self._key_index[key] = partition return value # No key was found + self.log.warning(f"{key=} not found in {self.table_name}") return None except KeyError as ke: - self.log.error(f"KeyError in get for table {self.table_name} for {key=}") + self.log.error( + f"KeyError in get for table {self.table_name} for {key=}" + ) raise ke except Exception as ex: self.log.error( @@ -345,7 +364,9 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: partition = get_current_partition() - key_with_partition = self._get_key_with_partition(key, partition=partition) + key_with_partition = self._get_key_with_partition( + key, partition=partition + ) self._bigtable_set(key_with_partition, value) self._key_index[key] = partition except Exception as ex: @@ -477,7 +498,9 @@ def persisted_offset(self, tp: TP) -> Optional[int]: return offset return None - def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: + def set_persisted_offset( + self, tp: TP, offset: int, recovery=False + ) -> None: """Set the last persisted offset for this table. This will remember the last offset that we wrote to BigTableStore, From aaba46d39856d9df42fb992195c606d8597121ff Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 12 Oct 2022 09:32:29 +0200 Subject: [PATCH 130/616] added additional logging for get and set --- faust/stores/bigtable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a5f505390..da4bf594a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -210,6 +210,7 @@ def _cache_get( ) -> Tuple[Optional[DirectRow], Optional[bytes]]: row = None value = None + self.log.info(f"called _bigtable_get with {key=}") if self.mutation_buffer_enabled: row, value = self._mutation_buffer.rows.get(key, (None, None)) if self._cache is not None: @@ -223,6 +224,8 @@ def _cache_get( ) if key in self._cache.keys(): value = self._cache[key] + if row is None and value is None: + self.log.info(f"Cache miss for {key=} in {self.table_name}") return row, value def _bigtable_setup(self, table, options: Dict[str, Any]): @@ -274,6 +277,7 @@ def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False ): if not persist_offset: + self.log.info(f"called _bigtable_set with {key=}") row = self._cache_get(key)[0] if row is None: row = self.bt_table.direct_row(key) From 1ce2f3e0aa419b2718210744c39119e23eaceb2c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 12 Oct 2022 15:52:35 +0200 Subject: [PATCH 131/616] added more logging and changed the iteritems method to skip rows that have a delete mutations as a last mutation --- faust/stores/bigtable.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index da4bf594a..b24e1603b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -210,7 +210,6 @@ def _cache_get( ) -> Tuple[Optional[DirectRow], Optional[bytes]]: row = None value = None - self.log.info(f"called _bigtable_get with {key=}") if self.mutation_buffer_enabled: row, value = self._mutation_buffer.rows.get(key, (None, None)) if self._cache is not None: @@ -223,7 +222,11 @@ def _cache_get( f":{partition}, with {len(self._cache.data)} keys" ) if key in self._cache.keys(): - value = self._cache[key] + value_cache = self._cache[key] + if value is not None and value != value_cache: + self.log.error( + f"Cache inconsintency, mut_buf:{value}, cache{value_cache}" + ) if row is None and value is None: self.log.info(f"Cache miss for {key=} in {self.table_name}") return row, value @@ -260,13 +263,14 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes): - self.log.info(f"called _bigtable_get with {key=}") + self.log.info(f"called _cache_get with {key=}") row, value = self._cache_get(key) if value is not None: return value elif row is not None and value is None: return value else: + self.log.info(f"called bigtable.read_row with {key=}") res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: self.log.info(f"{key=} not found in {self.table_name}") @@ -445,13 +449,15 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: end_key=end_key, ): if self.mutation_buffer_enabled: - # We want to yield the mutation if any us buffered + # We want to yield the mutation if any is buffered mut_row, value = self._mutation_buffer.rows.get( row.row_key, (None, None) ) - if mut_row is not None and value is not None: + if value is not None: yield (row.row_key[1:], value) continue + elif mut_row is not None: + continue yield ( row.row_key[1:], self.bigtable_exrtact_row_data(row), From 29550e9203ab9658e4bbbf458e0a82fee1ddf4ff Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 13 Oct 2022 09:13:24 +0200 Subject: [PATCH 132/616] added more logs --- faust/stores/bigtable.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b24e1603b..6cb10278a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -227,8 +227,6 @@ def _cache_get( self.log.error( f"Cache inconsintency, mut_buf:{value}, cache{value_cache}" ) - if row is None and value is None: - self.log.info(f"Cache miss for {key=} in {self.table_name}") return row, value def _bigtable_setup(self, table, options: Dict[str, Any]): @@ -263,7 +261,7 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes): - self.log.info(f"called _cache_get with {key=}") + self.log.info(f"called _cache_get with {key=} in {self.table_name}") row, value = self._cache_get(key) if value is not None: return value @@ -281,7 +279,7 @@ def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False ): if not persist_offset: - self.log.info(f"called _bigtable_set with {key=}") + self.log.info(f"called _bigtable_set with {key=} in {self.table_name}") row = self._cache_get(key)[0] if row is None: row = self.bt_table.direct_row(key) @@ -334,11 +332,13 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: + self.log.info(f"called _get with {key=} in {self.table_name}") partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( key, partition=partition ) + self.log.info(f"will call _bigtable_get with {key=} in {self.table_name}:{partition}") value = self._bigtable_get(key_with_partition) if value is not None: self._key_index[key] = partition @@ -347,6 +347,7 @@ def _get(self, key: bytes) -> Optional[bytes]: f"{key=} not found in {self.table_name} on {partition=}" ) else: + self.log.info(f"will call _bigtable_get with {key=} in {self.table_name}:all") for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( key, partition=partition From fc31a1fd9c1300d1bee2c00547c0c6882544c446 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 13 Oct 2022 09:15:33 +0200 Subject: [PATCH 133/616] more logs --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6cb10278a..52031d8e5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -376,6 +376,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: key_with_partition = self._get_key_with_partition( key, partition=partition ) + self.log.info(f"called _set with {key=} in {self.table_name}:{partition}") self._bigtable_set(key_with_partition, value) self._key_index[key] = partition except Exception as ex: From a84cbe2697d2dc9965795aabd345c489d8e0b4c8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 13 Oct 2022 10:44:27 +0200 Subject: [PATCH 134/616] always have key in store --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 52031d8e5..92cb81f4e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -478,7 +478,7 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: # NOT IMPLEMENTED FOR BIGTABLE - return False + return True def _clear(self) -> None: """This is typically used to clear data. From b152521feebfc4128a5198a0069c0eabdc005df9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 13 Oct 2022 13:57:38 +0200 Subject: [PATCH 135/616] added contains method again --- faust/stores/bigtable.py | 53 +++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 92cb81f4e..7b54f83c4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -261,14 +261,12 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes): - self.log.info(f"called _cache_get with {key=} in {self.table_name}") row, value = self._cache_get(key) if value is not None: return value elif row is not None and value is None: return value else: - self.log.info(f"called bigtable.read_row with {key=}") res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: self.log.info(f"{key=} not found in {self.table_name}") @@ -279,7 +277,6 @@ def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False ): if not persist_offset: - self.log.info(f"called _bigtable_set with {key=} in {self.table_name}") row = self._cache_get(key)[0] if row is None: row = self.bt_table.direct_row(key) @@ -332,13 +329,11 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: - self.log.info(f"called _get with {key=} in {self.table_name}") partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( key, partition=partition ) - self.log.info(f"will call _bigtable_get with {key=} in {self.table_name}:{partition}") value = self._bigtable_get(key_with_partition) if value is not None: self._key_index[key] = partition @@ -347,7 +342,6 @@ def _get(self, key: bytes) -> Optional[bytes]: f"{key=} not found in {self.table_name} on {partition=}" ) else: - self.log.info(f"will call _bigtable_get with {key=} in {self.table_name}:all") for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( key, partition=partition @@ -376,7 +370,6 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: key_with_partition = self._get_key_with_partition( key, partition=partition ) - self.log.info(f"called _set with {key=} in {self.table_name}:{partition}") self._bigtable_set(key_with_partition, value) self._key_index[key] = partition except Exception as ex: @@ -477,8 +470,50 @@ def _size(self) -> int: return 0 def _contains(self, key: bytes) -> bool: - # NOT IMPLEMENTED FOR BIGTABLE - return True + try: + partition = self._maybe_get_partition_from_message() + if partition is not None: + key = self._get_key_with_partition( + key, + partition=partition, + ) + row, val = self._cache_get(key) + if val is not None: + return True + elif row is not None: + return False + else: + res = self.bt_table.read_row(key, filter_=self.row_filter) + return res is not None + else: + self.log.info("Searching all partittions in _contains") + # First we want to check all caches + for partition in self._partitions_for_key(key): + key = self._get_key_with_partition( + key, partition=partition + ) + row, val = self._cache_get(key) + if row is not None and val is None: + return False + elif val is not None: + return True + # Now search the real table + for partition in self._partitions_for_key(key): + key = self._get_key_with_partition( + key, partition=partition + ) + res = self.bt_table.read_row(key, filter_=self.row_filter) + if res is not None: + return True + return False + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in _contains for table " + f"{self.table_name} exception " + f"{ex} key {key}. " + f"Traceback: {traceback.format_exc()}" + ) + raise ex def _clear(self) -> None: """This is typically used to clear data. From 8aaad614a604663811c4d344d4d8ae1c16fa02c6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 13 Oct 2022 14:25:54 +0200 Subject: [PATCH 136/616] implemented ttl for startupcache --- faust/stores/bigtable.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7b54f83c4..3af927fe8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,5 +1,6 @@ """BigTable storage.""" import logging +import time import traceback from typing import ( Any, @@ -69,27 +70,40 @@ class BigtableStartupCache: successful access to a key, will remove it. """ - def __init__(self) -> None: + def __init__(self, ttl: Optional[int]) -> None: self.log = logging.getLogger(self.__class__.__name__) self._filled_partitions = {} self.data: Dict = {} - - def keys(self): - return self.data.keys() + self.ttl = ttl + self.init_ts = int(time.time()) def __len__(self): return len(self.data) def __getitem__(self, key): - return self.data.pop(key, None) + res = self.data.pop(key, None) + self._maybe_ttl_clear() + return res def __setitem__(self, key, _) -> None: if key in self.data.keys(): self.data.pop(key, None) + self._maybe_ttl_clear() def __delitem__(self, key): self.data.pop(key, None) + def _maybe_ttl_clear(self): + if self.ttl is not None: + now = int(time.time()) + if now > self.init_ts + self.ttl: + self.data = {} + self.ttl = None + self.log.info("Cleard startupcache because TTL is over") + + def keys(self): + return self.data.keys() + def filled(self, partition): return self._filled_partitions.get(partition, False) @@ -117,6 +131,7 @@ class BigTableStore(base.SerializedStore): _mutation_buffer: Optional[BigtableMutationBuffer] VALUE_CACHE_TYPE_KEY = "value_cache_type_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" + STARTUPCACHE_TTL_KEY = "startupcache_ttl_key" BT_PROJECT_KEY = "bt_project_key" BT_INSTANCE_KEY = "bt_instance_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" @@ -142,7 +157,7 @@ def __init__( try: self._bigtable_setup(table, options) self._setup_mutation_buffer(options) - self._setup_value_cache() + self._setup_value_cache(options) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -183,9 +198,12 @@ def _setup_mutation_buffer(self, options) -> None: self.bt_table, limit ) - def _setup_value_cache(self) -> None: + def _setup_value_cache(self, options) -> None: if self.value_cache_type == "startup": - self._cache = BigtableStartupCache() + startup_cache_ttl = options.get( + BigTableStore.STARTUPCACHE_TTL_KEY, None + ) + self._cache = BigtableStartupCache(startup_cache_ttl) elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) else: From 64cf5e3d6091ef7b56c3f1cb069140c3ed13d9bb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 13 Oct 2022 15:04:57 +0200 Subject: [PATCH 137/616] better contains function --- faust/stores/bigtable.py | 45 +++++++--------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3af927fe8..ec081d0e0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -286,10 +286,15 @@ def _bigtable_get(self, key: bytes): return value else: res = self.bt_table.read_row(key, filter_=self.row_filter) + row = self.bt_table.direct_row(key) if res is None: self.log.info(f"{key=} not found in {self.table_name}") - return None - return self.bigtable_exrtact_row_data(res) + value = None + else: + value: bytes = self.bigtable_exrtact_row_data(res) + # FIXME: This is just a hack to abuse the mutation buffer + # as a shortterm get cache + self._cache_set(key, row, value) def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False @@ -489,41 +494,7 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - partition = self._maybe_get_partition_from_message() - if partition is not None: - key = self._get_key_with_partition( - key, - partition=partition, - ) - row, val = self._cache_get(key) - if val is not None: - return True - elif row is not None: - return False - else: - res = self.bt_table.read_row(key, filter_=self.row_filter) - return res is not None - else: - self.log.info("Searching all partittions in _contains") - # First we want to check all caches - for partition in self._partitions_for_key(key): - key = self._get_key_with_partition( - key, partition=partition - ) - row, val = self._cache_get(key) - if row is not None and val is None: - return False - elif val is not None: - return True - # Now search the real table - for partition in self._partitions_for_key(key): - key = self._get_key_with_partition( - key, partition=partition - ) - res = self.bt_table.read_row(key, filter_=self.row_filter) - if res is not None: - return True - return False + return self._get(key) is not None except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " From f1c95df31d38ab9644c90af1c6e2b00e45712207 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 13 Oct 2022 15:46:19 +0200 Subject: [PATCH 138/616] fixed bigtable get --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ec081d0e0..48271d779 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -295,6 +295,7 @@ def _bigtable_get(self, key: bytes): # FIXME: This is just a hack to abuse the mutation buffer # as a shortterm get cache self._cache_set(key, row, value) + return value def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False From 860bd2a729e9145378e585d1decbb749c4a8d843 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 10:35:38 +0200 Subject: [PATCH 139/616] implemented keycache --- faust/stores/bigtable.py | 67 +++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 48271d779..0f01ab1e0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -107,16 +107,12 @@ def keys(self): def filled(self, partition): return self._filled_partitions.get(partition, False) - def fill(self, table, partition) -> None: - start_key = partition.to_bytes(1, "little") - end_key = (partition + 1).to_bytes(1, "little") - for row in table.read_rows( - start_key=start_key, - end_key=end_key, - ): + def fill(self, table, offset_key_prefix) -> None: + for row in table.read_rows(): + if offset_key_prefix in row.row_key: + continue row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val - self._filled_partitions[partition] = True class BigTableStore(base.SerializedStore): @@ -129,6 +125,7 @@ class BigTableStore(base.SerializedStore): _key_index: LRUCache[bytes, int] _cache: Optional[Union[LRUCache[bytes, bytes], BigtableStartupCache]] _mutation_buffer: Optional[BigtableMutationBuffer] + KEY_CACHE_ENABLE_KEY = "key_cache_enable_key" VALUE_CACHE_TYPE_KEY = "value_cache_type_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" STARTUPCACHE_TTL_KEY = "startupcache_ttl_key" @@ -153,17 +150,22 @@ def __init__( self._set_options(app, options) self._key_index = LRUCache(limit=app.conf.table_key_index_size) self._cache = None + self._key_cache = None self._mutation_buffer = None try: self._bigtable_setup(table, options) self._setup_mutation_buffer(options) - self._setup_value_cache(options) + self._setup_key_and_value_cache(options) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) def _set_options(self, app, options) -> None: + self.key_cache_enabled = options.get( + BigTableStore.KEY_CACHE_ENABLE_KEY, False + ) + self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) @@ -198,30 +200,51 @@ def _setup_mutation_buffer(self, options) -> None: self.bt_table, limit ) - def _setup_value_cache(self, options) -> None: + def _setup_key_and_value_cache(self, options) -> None: if self.value_cache_type == "startup": startup_cache_ttl = options.get( BigTableStore.STARTUPCACHE_TTL_KEY, None ) self._cache = BigtableStartupCache(startup_cache_ttl) + start = time.time() + self._cache.fill(self.bt_table, self.offset_key_prefix) + end = time.time() + + self.log.info( + f"Filled BigtableStartupCache for {self.table_name}" + f" in {start-end}s" + ) + if self.key_cache_enabled: + self._key_cache = set(self._cache.keys()) + elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) else: raise NotImplementedError( f"VALUE_CACHE_TYPE '{self.value_cache_type}'" ) + if self.key_cache_enabled and self._key_cache is None: + self._key_cache = set() + for row in self.bt_table.read_rows(): + if self.offset_key_prefix in row.row_key: + continue + self._key_cache.add(row.row_key) def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: if self._cache is not None: self._cache[key] = value if self.mutation_buffer_enabled: self._mutation_buffer.submit(row, value) + if self._key_cache: + self._key_cache.add(key) def _cache_del(self, key: bytes, row: DirectRow) -> None: if self.mutation_buffer_enabled: self._mutation_buffer.submit(row, None) if self._cache: del self._cache[key] + if self._key_cache: + self._key_cache.discard(key) def _cache_get( self, key: bytes @@ -293,7 +316,7 @@ def _bigtable_get(self, key: bytes): else: value: bytes = self.bigtable_exrtact_row_data(res) # FIXME: This is just a hack to abuse the mutation buffer - # as a shortterm get cache + # as a shortterm get cache self._cache_set(key, row, value) return value @@ -495,7 +518,27 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - return self._get(key) is not None + partition = self._maybe_get_partition_from_message() + if partition is not None: + key_with_partition = self._get_key_with_partition( + key, partition=partition + ) + if self._key_cache: + return key_with_partition in self._key_cache + else: + return self._bigtable_get(key_with_partition) is not None + else: + for partition in self._partitions_for_key(key): + key_with_partition = self._get_key_with_partition( + key, partition=partition + ) + if self._key_cache: + return key_with_partition in self._key_cache + else: + return ( + self._bigtable_get(key_with_partition) is not None + ) + return False except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " From c6b16ab70d84da0ba3cb1b176d8dd94eeaf245e9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 10:49:39 +0200 Subject: [PATCH 140/616] fixed encoding of offset key --- faust/stores/bigtable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0f01ab1e0..d076f4dd2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -207,7 +207,7 @@ def _setup_key_and_value_cache(self, options) -> None: ) self._cache = BigtableStartupCache(startup_cache_ttl) start = time.time() - self._cache.fill(self.bt_table, self.offset_key_prefix) + self._cache.fill(self.bt_table, self.offset_key_prefix.encode()) end = time.time() self.log.info( @@ -225,8 +225,9 @@ def _setup_key_and_value_cache(self, options) -> None: ) if self.key_cache_enabled and self._key_cache is None: self._key_cache = set() + offset_prefix = self.offset_key_prefix.encode() for row in self.bt_table.read_rows(): - if self.offset_key_prefix in row.row_key: + if offset_prefix in row.row_key: continue self._key_cache.add(row.row_key) From 58c9011700713452bf9840a22d1c95925594febd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 11:25:57 +0200 Subject: [PATCH 141/616] fixed missing encoding --- faust/stores/bigtable.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d076f4dd2..367dde4ef 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -104,9 +104,6 @@ def _maybe_ttl_clear(self): def keys(self): return self.data.keys() - def filled(self, partition): - return self._filled_partitions.get(partition, False) - def fill(self, table, offset_key_prefix) -> None: for row in table.read_rows(): if offset_key_prefix in row.row_key: @@ -255,14 +252,6 @@ def _cache_get( if self.mutation_buffer_enabled: row, value = self._mutation_buffer.rows.get(key, (None, None)) if self._cache is not None: - if self.value_cache_type == "startup": - partition = key[0] - if not self._cache.filled(partition): - self._cache.fill(self.bt_table, partition) - self.log.info( - f"Filled BigtableStartupCache for {self.table_name}" - f":{partition}, with {len(self._cache.data)} keys" - ) if key in self._cache.keys(): value_cache = self._cache[key] if value is not None and value != value_cache: From ff416c46b78f70a128c46c3b89879c6a341bff3d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 11:29:44 +0200 Subject: [PATCH 142/616] fixed wrong logging --- faust/stores/bigtable.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 367dde4ef..ba71d1a3a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -206,8 +206,7 @@ def _setup_key_and_value_cache(self, options) -> None: start = time.time() self._cache.fill(self.bt_table, self.offset_key_prefix.encode()) end = time.time() - - self.log.info( + logging.getLogger(__name__).info( f"Filled BigtableStartupCache for {self.table_name}" f" in {start-end}s" ) From 6e75529fde63668da7e4f7836e59504b0d09ba8f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 11:50:45 +0200 Subject: [PATCH 143/616] fixed logging --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ba71d1a3a..381270b96 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -207,7 +207,7 @@ def _setup_key_and_value_cache(self, options) -> None: self._cache.fill(self.bt_table, self.offset_key_prefix.encode()) end = time.time() logging.getLogger(__name__).info( - f"Filled BigtableStartupCache for {self.table_name}" + "Filled BigtableStartupCache for" f" in {start-end}s" ) if self.key_cache_enabled: From e5ca51e4c69b391b0e66d90256240c061b9fb2fe Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 13:16:42 +0200 Subject: [PATCH 144/616] adjusted logging --- faust/stores/bigtable.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 381270b96..db4354dc8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -207,8 +207,7 @@ def _setup_key_and_value_cache(self, options) -> None: self._cache.fill(self.bt_table, self.offset_key_prefix.encode()) end = time.time() logging.getLogger(__name__).info( - "Filled BigtableStartupCache for" - f" in {start-end}s" + f"Filled BigtableStartupCache for in {start-end}s" ) if self.key_cache_enabled: self._key_cache = set(self._cache.keys()) @@ -304,9 +303,6 @@ def _bigtable_get(self, key: bytes): value = None else: value: bytes = self.bigtable_exrtact_row_data(res) - # FIXME: This is just a hack to abuse the mutation buffer - # as a shortterm get cache - self._cache_set(key, row, value) return value def _bigtable_set( @@ -512,7 +508,11 @@ def _contains(self, key: bytes) -> bool: key_with_partition = self._get_key_with_partition( key, partition=partition ) - if self._key_cache: + if self._key_cache is not None: + self.log.info( + "Contains took value of key_cache " + "with size {self._key_cache}" + ) return key_with_partition in self._key_cache else: return self._bigtable_get(key_with_partition) is not None From 9b46491823945ab285e8d823e55f65c96a5cbd7c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 13:17:01 +0200 Subject: [PATCH 145/616] removed unused word --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index db4354dc8..c303cdd0b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -207,7 +207,7 @@ def _setup_key_and_value_cache(self, options) -> None: self._cache.fill(self.bt_table, self.offset_key_prefix.encode()) end = time.time() logging.getLogger(__name__).info( - f"Filled BigtableStartupCache for in {start-end}s" + f"Filled BigtableStartupCache in {start-end}s" ) if self.key_cache_enabled: self._key_cache = set(self._cache.keys()) From e3a59f395efcc690ae25c31a1755f8d26bd566b4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 13:19:27 +0200 Subject: [PATCH 146/616] more logs again --- faust/stores/bigtable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c303cdd0b..3090ff49e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -522,6 +522,10 @@ def _contains(self, key: bytes) -> bool: key, partition=partition ) if self._key_cache: + self.log.info( + "Contains took value of key_cache " + "with size {self._key_cache} for all partitions" + ) return key_with_partition in self._key_cache else: return ( From a83825f5dcb4b12da3a6e4d36ae6ec2b2bfe14bc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 13:30:43 +0200 Subject: [PATCH 147/616] use cache in reading recover changelogs --- faust/stores/bigtable.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3090ff49e..ca7d3dcff 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -158,6 +158,7 @@ def __init__( raise ex super().__init__(url, app, table, **kwargs) + def _set_options(self, app, options) -> None: self.key_cache_enabled = options.get( BigTableStore.KEY_CACHE_ENABLE_KEY, False @@ -650,12 +651,14 @@ def apply_changelog_batch( row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() + self._cache_del(key, row) else: row.set_cell( self.column_family_id, self.column_name, msg.value, ) + self._cache_set(offset_key, row, msg.value) row_mutations.append(row) self._persist_changelog_batch( row_mutations, From cd56d7a3479cb905e3bf28d842ddabd7c421c33f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 14:12:10 +0200 Subject: [PATCH 148/616] better logging --- faust/stores/bigtable.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ca7d3dcff..31c34c517 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -432,10 +432,8 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - self.log.info(f"Started _iterkeys for {self.table_name}") for row in self._iteritems(): yield row[0] - self.log.info(f"Finished _iterkeys for {self.table_name}") except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -510,11 +508,8 @@ def _contains(self, key: bytes) -> bool: key, partition=partition ) if self._key_cache is not None: - self.log.info( - "Contains took value of key_cache " - "with size {self._key_cache}" - ) - return key_with_partition in self._key_cache + if key_with_partition in self._key_cache: + return True else: return self._bigtable_get(key_with_partition) is not None else: @@ -527,12 +522,18 @@ def _contains(self, key: bytes) -> bool: "Contains took value of key_cache " "with size {self._key_cache} for all partitions" ) - return key_with_partition in self._key_cache + if key_with_partition in self._key_cache: + return True else: return ( self._bigtable_get(key_with_partition) is not None ) - return False + if self._key_cache is not None: + self.log.info( + f"Contains miss with size {len(self._key_cache)}, " + f"for {key=} in table {self.table_name}" + ) + return False except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " From ad619ffcfdb3bd4ee8ba2d50d7c42b24d29413f4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 14 Oct 2022 14:43:10 +0200 Subject: [PATCH 149/616] =?UTF-8?q?removed=20logs=20=F0=9F=9A=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- faust/stores/bigtable.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 31c34c517..0dd97efb4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -518,21 +518,12 @@ def _contains(self, key: bytes) -> bool: key, partition=partition ) if self._key_cache: - self.log.info( - "Contains took value of key_cache " - "with size {self._key_cache} for all partitions" - ) if key_with_partition in self._key_cache: return True else: return ( self._bigtable_get(key_with_partition) is not None ) - if self._key_cache is not None: - self.log.info( - f"Contains miss with size {len(self._key_cache)}, " - f"for {key=} in table {self.table_name}" - ) return False except Exception as ex: self.log.error( From cae131775edc3b2ca8dae77141269542d6ed8419 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 17 Oct 2022 15:27:28 +0200 Subject: [PATCH 150/616] added bigtable get range --- faust/stores/bigtable.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0dd97efb4..50455a13d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -9,6 +9,7 @@ Iterable, Iterator, Optional, + Set, Tuple, Union, ) @@ -17,6 +18,7 @@ from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row import DirectRow +from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.table import Table from mode.utils.collections import LRUCache @@ -306,6 +308,24 @@ def _bigtable_get(self, key: bytes): value: bytes = self.bigtable_exrtact_row_data(res) return value + def _bigtable_get_range(self, keys: Set[bytes]) -> Tuple[bytes, Optional[bytes]]: + # first search cache: + for key in keys: + row, value = self._cache_get(key) + if value is not None: + return key, value + elif row is not None and value is None: + return key, value + + rows = RowSet() + for key in keys: + rows.add_row_key(key) + + for row in self.bt_table.read_rows(row_set=rows): + # First hit will return + return row.row_key, BigTableStore.bigtable_exrtact_row_data(row.data) + + def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False ): @@ -375,14 +395,17 @@ def _get(self, key: bytes) -> Optional[bytes]: f"{key=} not found in {self.table_name} on {partition=}" ) else: + keys = set() for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( key, partition=partition ) - value = self._bigtable_get(key_with_partition) - if value is not None: - self._key_index[key] = partition - return value + keys.add(key_with_partition) + + key, value = self._bigtable_get_range(keys) + if value is not None: + self._key_index[key] = partition + return value # No key was found self.log.warning(f"{key=} not found in {self.table_name}") return None From 145d7b38f305aafdee051f3eb3391e201a4e1de3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 17 Oct 2022 15:34:30 +0200 Subject: [PATCH 151/616] added partition to key index --- faust/stores/bigtable.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 50455a13d..efbbc1009 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -160,7 +160,6 @@ def __init__( raise ex super().__init__(url, app, table, **kwargs) - def _set_options(self, app, options) -> None: self.key_cache_enabled = options.get( BigTableStore.KEY_CACHE_ENABLE_KEY, False @@ -308,7 +307,9 @@ def _bigtable_get(self, key: bytes): value: bytes = self.bigtable_exrtact_row_data(res) return value - def _bigtable_get_range(self, keys: Set[bytes]) -> Tuple[bytes, Optional[bytes]]: + def _bigtable_get_range( + self, keys: Set[bytes] + ) -> Tuple[bytes, Optional[bytes]]: # first search cache: for key in keys: row, value = self._cache_get(key) @@ -323,8 +324,9 @@ def _bigtable_get_range(self, keys: Set[bytes]) -> Tuple[bytes, Optional[bytes]] for row in self.bt_table.read_rows(row_set=rows): # First hit will return - return row.row_key, BigTableStore.bigtable_exrtact_row_data(row.data) - + return row.row_key, BigTableStore.bigtable_exrtact_row_data( + row.data + ) def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False @@ -404,7 +406,8 @@ def _get(self, key: bytes) -> Optional[bytes]: key, value = self._bigtable_get_range(keys) if value is not None: - self._key_index[key] = partition + partition = key[1] + self._key_index[key[1:]] = partition return value # No key was found self.log.warning(f"{key=} not found in {self.table_name}") From c55ce65400d47d974bcbf6dd1566788176c40152 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 17 Oct 2022 15:48:51 +0200 Subject: [PATCH 152/616] added key_cache to get --- faust/stores/bigtable.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index efbbc1009..f18f49242 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -383,6 +383,12 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: return range(self.app.conf.topic_partitions) def _get(self, key: bytes) -> Optional[bytes]: + if self._key_cache: + if key not in self._key_cache: + self.log.info( + "Key was not found in key_cache, will return None" + ) + return None try: partition = self._maybe_get_partition_from_message() if partition is not None: From f28903f4cd6c78967d8af0f2f06268c43aeca6ec Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 17 Oct 2022 16:04:16 +0200 Subject: [PATCH 153/616] fixed row accesss --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f18f49242..ea695633c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -325,7 +325,7 @@ def _bigtable_get_range( for row in self.bt_table.read_rows(row_set=rows): # First hit will return return row.row_key, BigTableStore.bigtable_exrtact_row_data( - row.data + row ) def _bigtable_set( From 0b4ce183992d493eaa06ad4802af7de0748a7e84 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 17 Oct 2022 16:47:57 +0200 Subject: [PATCH 154/616] fixed key cache --- faust/stores/bigtable.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ea695633c..9bf1250f6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -382,19 +382,23 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: except KeyError: return range(self.app.conf.topic_partitions) - def _get(self, key: bytes) -> Optional[bytes]: + def _check_key_cache(self, key): if self._key_cache: - if key not in self._key_cache: - self.log.info( - "Key was not found in key_cache, will return None" - ) - return None + return key in self._key_cache + else: + return False + + def _get(self, key: bytes) -> Optional[bytes]: try: partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( key, partition=partition ) + if self.key_cache_enabled: + if self._check_key_cache(key_with_partition): + return None + value = self._bigtable_get(key_with_partition) if value is not None: self._key_index[key] = partition @@ -408,7 +412,14 @@ def _get(self, key: bytes) -> Optional[bytes]: key_with_partition = self._get_key_with_partition( key, partition=partition ) - keys.add(key_with_partition) + if self.key_cache_enabled: + if self._check_key_cache(key): + keys.add(key_with_partition) + else: + keys.add(key_with_partition) + + if len(keys) == 0: + return None key, value = self._bigtable_get_range(keys) if value is not None: @@ -539,9 +550,8 @@ def _contains(self, key: bytes) -> bool: key_with_partition = self._get_key_with_partition( key, partition=partition ) - if self._key_cache is not None: - if key_with_partition in self._key_cache: - return True + if self.key_cache_enabled: + return self._check_key_cache(key_with_partition) else: return self._bigtable_get(key_with_partition) is not None else: @@ -549,8 +559,8 @@ def _contains(self, key: bytes) -> bool: key_with_partition = self._get_key_with_partition( key, partition=partition ) - if self._key_cache: - if key_with_partition in self._key_cache: + if self.key_cache_enabled: + if self._check_key_cache(key_with_partition): return True else: return ( From e444de15ab5c3743f1748c11e170c3f4609c46c1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 17 Oct 2022 17:10:06 +0200 Subject: [PATCH 155/616] removed check_keys from get becaus its called in _contains --- faust/stores/bigtable.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9bf1250f6..cb93f605e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -395,9 +395,6 @@ def _get(self, key: bytes) -> Optional[bytes]: key_with_partition = self._get_key_with_partition( key, partition=partition ) - if self.key_cache_enabled: - if self._check_key_cache(key_with_partition): - return None value = self._bigtable_get(key_with_partition) if value is not None: @@ -412,14 +409,7 @@ def _get(self, key: bytes) -> Optional[bytes]: key_with_partition = self._get_key_with_partition( key, partition=partition ) - if self.key_cache_enabled: - if self._check_key_cache(key): - keys.add(key_with_partition) - else: - keys.add(key_with_partition) - - if len(keys) == 0: - return None + keys.add(key_with_partition) key, value = self._bigtable_get_range(keys) if value is not None: From 53b692560260137b910c7824e1b4f021ee750128 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 20 Oct 2022 09:17:13 +0200 Subject: [PATCH 156/616] set key index after _contains --- faust/stores/bigtable.py | 8 ++++++-- scripts/install | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index cb93f605e..c80f60693 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -540,10 +540,13 @@ def _contains(self, key: bytes) -> bool: key_with_partition = self._get_key_with_partition( key, partition=partition ) + found = False if self.key_cache_enabled: - return self._check_key_cache(key_with_partition) + found = self._check_key_cache(key_with_partition) else: - return self._bigtable_get(key_with_partition) is not None + found = self._bigtable_get(key_with_partition) is not None + if found: + self._key_index[key] = partition else: for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( @@ -551,6 +554,7 @@ def _contains(self, key: bytes) -> bool: ) if self.key_cache_enabled: if self._check_key_cache(key_with_partition): + self._key_index[key] = partition return True else: return ( diff --git a/scripts/install b/scripts/install index c0e202e86..3f4390bc4 100755 --- a/scripts/install +++ b/scripts/install @@ -4,7 +4,7 @@ [ "$1" = "-p" ] && PYTHON=$2 || PYTHON="python3" REQUIREMENTS="requirements/test.txt" -VENV="venv" +VENV="env-faust" set -x From a9cf2834958e877b370d225dd63227c51ed851c3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 21 Oct 2022 08:44:47 +0200 Subject: [PATCH 157/616] return correct value --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c80f60693..175b5b38f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -547,6 +547,7 @@ def _contains(self, key: bytes) -> bool: found = self._bigtable_get(key_with_partition) is not None if found: self._key_index[key] = partition + return found else: for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( From 4919da649dcc5ee8cdf1062ff0ff81e4fb518e82 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 21 Oct 2022 13:13:01 +0200 Subject: [PATCH 158/616] adjusted startup cache to just invalidate after some time --- faust/stores/bigtable.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 175b5b38f..00f2707d3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -74,23 +74,24 @@ class BigtableStartupCache: def __init__(self, ttl: Optional[int]) -> None: self.log = logging.getLogger(self.__class__.__name__) - self._filled_partitions = {} self.data: Dict = {} self.ttl = ttl + self.ttl_over = False self.init_ts = int(time.time()) def __len__(self): return len(self.data) def __getitem__(self, key): - res = self.data.pop(key, None) - self._maybe_ttl_clear() - return res + if self.ttl is not None: + res = self.data[key] + self._maybe_ttl_clear() + return res - def __setitem__(self, key, _) -> None: - if key in self.data.keys(): - self.data.pop(key, None) + def __setitem__(self, key, value) -> None: self._maybe_ttl_clear() + if self.ttl is not None: + self.data[key] = value def __delitem__(self, key): self.data.pop(key, None) @@ -324,9 +325,7 @@ def _bigtable_get_range( for row in self.bt_table.read_rows(row_set=rows): # First hit will return - return row.row_key, BigTableStore.bigtable_exrtact_row_data( - row - ) + return row.row_key, BigTableStore.bigtable_exrtact_row_data(row) def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False From cc8c8d6c52c4eea6e0aa126c5865a6df122bb0e4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 09:31:34 +0200 Subject: [PATCH 159/616] implemented timed mutation buffer --- faust/stores/bigtable.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 00f2707d3..37ab084df 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -39,7 +39,11 @@ class BigtableMutationBuffer: rows: Dict[bytes, Tuple[DirectRow, Optional[bytes]]] mutation_limit: int - def __init__(self, bigtable_table: Table, mutation_limit: int) -> None: + def __init__( + self, bigtable_table: Table, mutation_freq: int, mutation_limit: int + ) -> None: + self.mutation_freq: int = mutation_freq + self.last_flush = time.time() # set to now self.mutation_limit: int = mutation_limit self.bigtable_table: Table = bigtable_table self.rows = {} @@ -59,8 +63,10 @@ def flush(self) -> None: self.bigtable_table.mutate_rows(mutated_rows) self.rows.clear() - def full(self) -> bool: - return len(self.rows) >= self.mutation_limit + def check_flush(self) -> bool: + should_flush = (len(self.rows) >= self.mutation_limit) or ( + (self.last_flush + self.mutation_freq) < time.time() + ) def submit(self, row: DirectRow, value: Optional[bytes] = None): self.rows[row.row_key] = row, value @@ -125,19 +131,20 @@ class BigTableStore(base.SerializedStore): _key_index: LRUCache[bytes, int] _cache: Optional[Union[LRUCache[bytes, bytes], BigtableStartupCache]] _mutation_buffer: Optional[BigtableMutationBuffer] - KEY_CACHE_ENABLE_KEY = "key_cache_enable_key" - VALUE_CACHE_TYPE_KEY = "value_cache_type_key" - VALUE_CACHE_SIZE_KEY = "value_cache_size_key" - STARTUPCACHE_TTL_KEY = "startupcache_ttl_key" - BT_PROJECT_KEY = "bt_project_key" + BT_COLUMN_NAME_KEY = "bt_column_name_key" + BT_ENABLE_MUTATION_BUFFER_KEY = "bt_enable_mutation_buffer_key" BT_INSTANCE_KEY = "bt_instance_key" - BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" + BT_MUTATION_BUFFER_FREQ_KEY = "bt_mutation_buffer_freq_key" + BT_MUTATION_BUFFER_LIMIT_KEY = "bt_mutation_buffer_limit_key" + BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" + BT_PROJECT_KEY = "bt_project_key" BT_READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" - BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_ROW_FILTERS_KEY = "bt_row_filter_key" - BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" - BT_MUTATION_BUFFER_LIMIT_KEY = "bt_mutation_buffer_limit_key" - BT_ENABLE_MUTATION_BUFFER_KEY = "bt_enable_mutation_buffer_key" + BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" + KEY_CACHE_ENABLE_KEY = "key_cache_enable_key" + STARTUPCACHE_TTL_KEY = "startupcache_ttl_key" + VALUE_CACHE_SIZE_KEY = "value_cache_size_key" + VALUE_CACHE_TYPE_KEY = "value_cache_type_key" def __init__( self, @@ -196,8 +203,11 @@ def _setup_mutation_buffer(self, options) -> None: limit = options.get( BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 ) + freq = options.get( + BigTableStore.BT_MUTATION_BUFFER_FREQ_KEY, 30*60 + ) self._mutation_buffer = BigtableMutationBuffer( - self.bt_table, limit + self.bt_table, ,limit ) def _setup_key_and_value_cache(self, options) -> None: @@ -612,7 +622,7 @@ def set_persisted_offset( """ try: if self.mutation_buffer_enabled and not recovery: - if self._mutation_buffer.full(): + if self._mutation_buffer.check_flush(): num_mutations = len(self._mutation_buffer.rows) self.log.info( f"Will flush BigtableMutationBuffer with {num_mutations} " From ffb0fbd7fc5a5a1faff7d0c66956a6da7ffe918b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 09:53:58 +0200 Subject: [PATCH 160/616] fixed syntax error --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 37ab084df..4a05a4d7f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -204,10 +204,10 @@ def _setup_mutation_buffer(self, options) -> None: BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 ) freq = options.get( - BigTableStore.BT_MUTATION_BUFFER_FREQ_KEY, 30*60 + BigTableStore.BT_MUTATION_BUFFER_FREQ_KEY, 30 * 60 ) self._mutation_buffer = BigtableMutationBuffer( - self.bt_table, ,limit + self.bt_table, freq, limit ) def _setup_key_and_value_cache(self, options) -> None: From 3c3f6682603d4ccac4a872fd37d15424333fe82a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 10:02:59 +0200 Subject: [PATCH 161/616] fixed should fush statement --- faust/stores/bigtable.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4a05a4d7f..162b5e94b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -62,11 +62,12 @@ def flush(self) -> None: mutated_rows.append(row) self.bigtable_table.mutate_rows(mutated_rows) self.rows.clear() + self.last_flush = time.time() # set to now def check_flush(self) -> bool: - should_flush = (len(self.rows) >= self.mutation_limit) or ( - (self.last_flush + self.mutation_freq) < time.time() - ) + limit_reached = len(self.rows) >= self.mutation_limit) + time_exceeded = self.last_flush + self.mutation_freq < time.time() + return limit_reached or time_exceeded def submit(self, row: DirectRow, value: Optional[bytes] = None): self.rows[row.row_key] = row, value From 53b8f813cb9b1b5efa06f65708f20405c2c225cb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 10:03:59 +0200 Subject: [PATCH 162/616] removed unused log --- faust/stores/bigtable.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 162b5e94b..2e8af85df 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -630,11 +630,6 @@ def set_persisted_offset( f"mutations for table {self.table_name}..." ) self._mutation_buffer.flush() - if self.value_cache_type == "startup": - self.log.info( - f"Current size of ValueCache for {self.table_name}" - f" is:{len(self._cache.data)}" - ) offset_key = self.get_offset_key(tp).encode() self._bigtable_set( offset_key, str(offset).encode(), persist_offset=True From 3eed475e7792829d4f1aee879c387b633698b07c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 10:21:27 +0200 Subject: [PATCH 163/616] fixed syntax error --- faust/stores/bigtable.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2e8af85df..7764e8040 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -43,7 +43,7 @@ def __init__( self, bigtable_table: Table, mutation_freq: int, mutation_limit: int ) -> None: self.mutation_freq: int = mutation_freq - self.last_flush = time.time() # set to now + self.last_flush = int(time.time()) # set to now self.mutation_limit: int = mutation_limit self.bigtable_table: Table = bigtable_table self.rows = {} @@ -62,11 +62,11 @@ def flush(self) -> None: mutated_rows.append(row) self.bigtable_table.mutate_rows(mutated_rows) self.rows.clear() - self.last_flush = time.time() # set to now + self.last_flush = int(time.time()) # set to now def check_flush(self) -> bool: - limit_reached = len(self.rows) >= self.mutation_limit) - time_exceeded = self.last_flush + self.mutation_freq < time.time() + limit_reached = len(self.rows) >= self.mutation_limit + time_exceeded = self.last_flush + self.mutation_freq < int(time.time()) return limit_reached or time_exceeded def submit(self, row: DirectRow, value: Optional[bytes] = None): @@ -475,6 +475,11 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: + if self._key_cache is not None: + for partition in self._active_partitions(): + for k in self._key_cache: + if k[0] == partition: + yield k[1:] for row in self._iteritems(): yield row[0] except Exception as ex: From dfae9d6bd30c995bc8d0997ddfb36f48981bdf23 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 10:22:12 +0200 Subject: [PATCH 164/616] formatting --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7764e8040..f83b5f1cb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -43,7 +43,7 @@ def __init__( self, bigtable_table: Table, mutation_freq: int, mutation_limit: int ) -> None: self.mutation_freq: int = mutation_freq - self.last_flush = int(time.time()) # set to now + self.last_flush = int(time.time()) # set to now self.mutation_limit: int = mutation_limit self.bigtable_table: Table = bigtable_table self.rows = {} @@ -62,7 +62,7 @@ def flush(self) -> None: mutated_rows.append(row) self.bigtable_table.mutate_rows(mutated_rows) self.rows.clear() - self.last_flush = int(time.time()) # set to now + self.last_flush = int(time.time()) # set to now def check_flush(self) -> bool: limit_reached = len(self.rows) >= self.mutation_limit From a6f6e389849d5695e8418804d90d137117b0ced3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 11:31:48 +0200 Subject: [PATCH 165/616] try to fix value cache --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f83b5f1cb..59ecb43e1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -270,6 +270,7 @@ def _cache_get( self.log.error( f"Cache inconsintency, mut_buf:{value}, cache{value_cache}" ) + value = value_cache return row, value def _bigtable_setup(self, table, options: Dict[str, Any]): From cbe3b024c5f25f067b6437d2a1eefa522e3665f5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 11:32:10 +0200 Subject: [PATCH 166/616] fixed value cache --- faust/stores/bigtable.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 59ecb43e1..66c1b18d3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -265,12 +265,7 @@ def _cache_get( row, value = self._mutation_buffer.rows.get(key, (None, None)) if self._cache is not None: if key in self._cache.keys(): - value_cache = self._cache[key] - if value is not None and value != value_cache: - self.log.error( - f"Cache inconsintency, mut_buf:{value}, cache{value_cache}" - ) - value = value_cache + value = self._cache[key] return row, value def _bigtable_setup(self, table, options: Dict[str, Any]): From 602063976d65de6bafc82c5cbc1f7b4840f7a5a9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 11:34:00 +0200 Subject: [PATCH 167/616] faster get cache call --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 66c1b18d3..4c1062e35 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -263,6 +263,8 @@ def _cache_get( value = None if self.mutation_buffer_enabled: row, value = self._mutation_buffer.rows.get(key, (None, None)) + if value is not None: + return row, value if self._cache is not None: if key in self._cache.keys(): value = self._cache[key] From a63bca064806caa4c3116e3ee52ba1d105af643d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 12:04:43 +0200 Subject: [PATCH 168/616] fixed wrong iterkeys call --- faust/stores/bigtable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4c1062e35..ec8144c04 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -478,8 +478,9 @@ def _iterkeys(self) -> Iterator[bytes]: for k in self._key_cache: if k[0] == partition: yield k[1:] - for row in self._iteritems(): - yield row[0] + else: + for row in self._iteritems(): + yield row[0] except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " From 8f225b0f175ab829d239607b21dcdade7bbfc8a6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 14:03:56 +0200 Subject: [PATCH 169/616] made rows to flush threadsave --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ec8144c04..4633b7382 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -50,7 +50,8 @@ def __init__( def flush(self) -> None: mutated_rows = [] - for row, val in self.rows.values(): + rows_to_flush = list(self.rows.values()) # Needed for threadsafety + for row, val in rows_to_flush: if val is None: row.delete() else: From 73e9312341370d307c7ffdf56e32549764e2d82c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 15:08:29 +0200 Subject: [PATCH 170/616] removed warnings in _get --- faust/stores/bigtable.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4633b7382..a76905111 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -42,6 +42,7 @@ class BigtableMutationBuffer: def __init__( self, bigtable_table: Table, mutation_freq: int, mutation_limit: int ) -> None: + self.mutation_freq: int = mutation_freq self.last_flush = int(time.time()) # set to now self.mutation_limit: int = mutation_limit @@ -50,7 +51,8 @@ def __init__( def flush(self) -> None: mutated_rows = [] - rows_to_flush = list(self.rows.values()) # Needed for threadsafety + rows_to_flush = self.rows.copy() + self.rows.clear() for row, val in rows_to_flush: if val is None: row.delete() @@ -62,7 +64,6 @@ def flush(self) -> None: ) mutated_rows.append(row) self.bigtable_table.mutate_rows(mutated_rows) - self.rows.clear() self.last_flush = int(time.time()) # set to now def check_flush(self) -> bool: @@ -110,7 +111,7 @@ def _maybe_ttl_clear(self): if now > self.init_ts + self.ttl: self.data = {} self.ttl = None - self.log.info("Cleard startupcache because TTL is over") + self.log.info("BigtableStore: Cleard startupcache because TTL is over") def keys(self): return self.data.keys() @@ -409,9 +410,6 @@ def _get(self, key: bytes) -> Optional[bytes]: if value is not None: self._key_index[key] = partition return value - self.log.warning( - f"{key=} not found in {self.table_name} on {partition=}" - ) else: keys = set() for partition in self._partitions_for_key(key): @@ -425,8 +423,6 @@ def _get(self, key: bytes) -> Optional[bytes]: partition = key[1] self._key_index[key[1:]] = partition return value - # No key was found - self.log.warning(f"{key=} not found in {self.table_name}") return None except KeyError as ke: self.log.error( From 29e835533796f89682b7c402eeda630ec2e5472b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 15:48:39 +0200 Subject: [PATCH 171/616] removed key index from _contains --- faust/stores/bigtable.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a76905111..9431a06a8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -556,8 +556,6 @@ def _contains(self, key: bytes) -> bool: found = self._check_key_cache(key_with_partition) else: found = self._bigtable_get(key_with_partition) is not None - if found: - self._key_index[key] = partition return found else: for partition in self._partitions_for_key(key): @@ -566,7 +564,6 @@ def _contains(self, key: bytes) -> bool: ) if self.key_cache_enabled: if self._check_key_cache(key_with_partition): - self._key_index[key] = partition return True else: return ( From d404b1cd77e047ee7501675349896999b0e3cee5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 16:02:00 +0200 Subject: [PATCH 172/616] removed keycache from iteritems --- faust/stores/bigtable.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9431a06a8..adbac77f2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -111,7 +111,9 @@ def _maybe_ttl_clear(self): if now > self.init_ts + self.ttl: self.data = {} self.ttl = None - self.log.info("BigtableStore: Cleard startupcache because TTL is over") + self.log.info( + "BigtableStore: Cleard startupcache because TTL is over" + ) def keys(self): return self.data.keys() @@ -470,14 +472,14 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - if self._key_cache is not None: - for partition in self._active_partitions(): - for k in self._key_cache: - if k[0] == partition: - yield k[1:] - else: - for row in self._iteritems(): - yield row[0] + # if self._key_cache is not None: + # for partition in self._active_partitions(): + # for k in self._key_cache: + # if k[0] == partition: + # yield k[1:] + # else: + for row in self._iteritems(): + yield row[0] except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " From 5c7de83f32edf59a00b058a86664ad7900c0ccd8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 16:17:40 +0200 Subject: [PATCH 173/616] added bigtable store name to init, to determine which table it was --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index adbac77f2..4c71b05f3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -298,7 +298,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): else: logging.getLogger(__name__).info( "BigTableStore: Using existing " - f"bigtablestore with {self.bt_table_name=}" + f"bigtablestore with {self.bt_table_name=} for {self.table_name}" ) @staticmethod From b9aa2196ac656141de2522f4de9d3bb1a92429d1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 16:18:35 +0200 Subject: [PATCH 174/616] also added name to previous instatiation of table --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4c71b05f3..9175e9fad 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -287,7 +287,8 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): self.column_family_id = "FaustColumnFamily" if not self.bt_table.exists(): logging.getLogger(__name__).info( - f"BigTableStore: Making new bigtablestore with {self.bt_table_name=}" + f"BigTableStore: Making new bigtablestore with {self.bt_table_name=} " + f"for {self.table_name}" ) # TODO: add columns families to options self.bt_table.create( From e355e66be005de99db6b1388797a3737d57ec592 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 16:22:33 +0200 Subject: [PATCH 175/616] fixed flush --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9175e9fad..65990a626 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -51,7 +51,7 @@ def __init__( def flush(self) -> None: mutated_rows = [] - rows_to_flush = self.rows.copy() + rows_to_flush = self.rows.copy().values() self.rows.clear() for row, val in rows_to_flush: if val is None: From bda5bef23feaa86a1e213a3909bd4136eb043ae0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Oct 2022 16:35:59 +0200 Subject: [PATCH 176/616] fixed naming --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 65990a626..df6b37688 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -288,7 +288,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): if not self.bt_table.exists(): logging.getLogger(__name__).info( f"BigTableStore: Making new bigtablestore with {self.bt_table_name=} " - f"for {self.table_name}" + f"for {self.table.name}" ) # TODO: add columns families to options self.bt_table.create( @@ -299,7 +299,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): else: logging.getLogger(__name__).info( "BigTableStore: Using existing " - f"bigtablestore with {self.bt_table_name=} for {self.table_name}" + f"bigtablestore with {self.bt_table_name=} for {self.table.name}" ) @staticmethod From 79f13c50ff8e5e1f22039cb813796ac313662609 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 25 Oct 2022 08:55:25 +0200 Subject: [PATCH 177/616] fixed table call --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index df6b37688..fcd3a2ea8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -288,7 +288,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): if not self.bt_table.exists(): logging.getLogger(__name__).info( f"BigTableStore: Making new bigtablestore with {self.bt_table_name=} " - f"for {self.table.name}" + f"for {table.name}" ) # TODO: add columns families to options self.bt_table.create( @@ -299,7 +299,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): else: logging.getLogger(__name__).info( "BigTableStore: Using existing " - f"bigtablestore with {self.bt_table_name=} for {self.table.name}" + f"bigtablestore with {self.bt_table_name=} for {table.name}" ) @staticmethod From 10fea148e9fb6a1352dc30de553b115de7b0dc32 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Oct 2022 10:03:24 +0200 Subject: [PATCH 178/616] added check that checks if a row was not written successfully --- faust/stores/bigtable.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fcd3a2ea8..8645f1e8d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -47,12 +47,12 @@ def __init__( self.last_flush = int(time.time()) # set to now self.mutation_limit: int = mutation_limit self.bigtable_table: Table = bigtable_table + self.log = logging.getLogger(self.__class__.__name__) self.rows = {} def flush(self) -> None: mutated_rows = [] rows_to_flush = self.rows.copy().values() - self.rows.clear() for row, val in rows_to_flush: if val is None: row.delete() @@ -63,7 +63,17 @@ def flush(self) -> None: val, ) mutated_rows.append(row) - self.bigtable_table.mutate_rows(mutated_rows) + response = self.bigtable_table.mutate_rows(mutated_rows) + for (status, row) in zip(response, rows_to_flush): + if status.code != 0: + self.log.error( + "BigTableStore: BigtableMutationBuffer, " + f"Row number {row[0].row_key} failed to write" + ) + else: + # Remove only rows that were successfully written + self.rows.pop(row[0].row_key, None) + self.last_flush = int(time.time()) # set to now def check_flush(self) -> bool: From 1a980003ab786ae20c7c3a9e0b1cea76c740878c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Oct 2022 10:04:50 +0200 Subject: [PATCH 179/616] adjusted logging for less words --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8645f1e8d..6a253bb6e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -68,7 +68,7 @@ def flush(self) -> None: if status.code != 0: self.log.error( "BigTableStore: BigtableMutationBuffer, " - f"Row number {row[0].row_key} failed to write" + f"Row {row[0].row_key} failed to write" ) else: # Remove only rows that were successfully written From 1d2374a1c851c6a4daf12947ef1f1ae1e09df203 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 10:09:41 +0200 Subject: [PATCH 180/616] added logging for filling table at startup --- faust/stores/bigtable.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6a253bb6e..faf67cc36 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -128,12 +128,17 @@ def _maybe_ttl_clear(self): def keys(self): return self.data.keys() - def fill(self, table, offset_key_prefix) -> None: + def fill(self, table: Table, offset_key_prefix) -> None: + start_time = time.time() for row in table.read_rows(): if offset_key_prefix in row.row_key: continue row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val + self.log.info( + f"BigtableStore: StartupCache finished fill for {table.table_id} " + f"took {time.time() - start_time}s" + ) class BigTableStore(base.SerializedStore): From bd535e367d24c4b3da69dd45680a9a044d4ccdc0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 12:10:19 +0200 Subject: [PATCH 181/616] switched to better caches --- faust/stores/bigtable.py | 86 ++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index faf67cc36..d1d304457 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -97,6 +97,7 @@ def __init__(self, ttl: Optional[int]) -> None: self.ttl = ttl self.ttl_over = False self.init_ts = int(time.time()) + self._filled_partitions: Set[int] = set() def __len__(self): return len(self.data) @@ -128,17 +129,45 @@ def _maybe_ttl_clear(self): def keys(self): return self.data.keys() - def fill(self, table: Table, offset_key_prefix) -> None: + def fill(self, table: Table, partition: int) -> None: start_time = time.time() - for row in table.read_rows(): - if offset_key_prefix in row.row_key: - continue + start_key = partition.to_bytes(1, "little") + end_key = (partition + 1).to_bytes(1, "little") + for row in table.read_rows(start_key, end_key): row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val self.log.info( f"BigtableStore: StartupCache finished fill for {table.table_id} " f"took {time.time() - start_time}s" ) + self._filled_partitions.add(partition) + + def check_filled(self, partition: int) -> bool: + return partition in self._filled_partitions + + +class BigTableKeyCache: + _filled_partitions: Set[int] = set() + _keys: Set[bytes] = set() + + def fill(self, table: Table, partition: int): + start_key = partition.to_bytes(1, "little") + end_key = (partition + 1).to_bytes(1, "little") + for row in table.read_rows(start_key, end_key): + self.add(row.row_key) + self._filled_partitions.add(partition) + + def add(self, key: bytes): + self.add(key) + + def discard(self, key: bytes): + self._keys.discard(key) + + def exists(self, key: bytes) -> bool: + return key in self._keys + + def check_filled(self, partition: int) -> bool: + return partition in self._filled_partitions class BigTableStore(base.SerializedStore): @@ -151,6 +180,7 @@ class BigTableStore(base.SerializedStore): _key_index: LRUCache[bytes, int] _cache: Optional[Union[LRUCache[bytes, bytes], BigtableStartupCache]] _mutation_buffer: Optional[BigtableMutationBuffer] + _key_cache: Optional[BigTableKeyCache] BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_ENABLE_MUTATION_BUFFER_KEY = "bt_enable_mutation_buffer_key" BT_INSTANCE_KEY = "bt_instance_key" @@ -236,15 +266,6 @@ def _setup_key_and_value_cache(self, options) -> None: BigTableStore.STARTUPCACHE_TTL_KEY, None ) self._cache = BigtableStartupCache(startup_cache_ttl) - start = time.time() - self._cache.fill(self.bt_table, self.offset_key_prefix.encode()) - end = time.time() - logging.getLogger(__name__).info( - f"Filled BigtableStartupCache in {start-end}s" - ) - if self.key_cache_enabled: - self._key_cache = set(self._cache.keys()) - elif self.value_cache_type == "forever": self._cache = LRUCache(limit=self.value_cache_size) else: @@ -252,14 +273,19 @@ def _setup_key_and_value_cache(self, options) -> None: f"VALUE_CACHE_TYPE '{self.value_cache_type}'" ) if self.key_cache_enabled and self._key_cache is None: - self._key_cache = set() - offset_prefix = self.offset_key_prefix.encode() - for row in self.bt_table.read_rows(): - if offset_prefix in row.row_key: - continue - self._key_cache.add(row.row_key) + self._key_cache = BigTableKeyCache() + + def _fill_caches_if_empty(self, partition: int): + if self._key_cache is not None: + if self._key_cache.check_filled(partition): + self._key_cache.fill(self.bt_table, partition) + if isinstance(self._cache, BigtableStartupCache): + if self._cache.check_filled(partition): + self._cache.fill(self.bt_table, partition) def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: + partition = key[0] + self._fill_caches_if_empty(partition) if self._cache is not None: self._cache[key] = value if self.mutation_buffer_enabled: @@ -268,6 +294,8 @@ def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: self._key_cache.add(key) def _cache_del(self, key: bytes, row: DirectRow) -> None: + partition = key[0] + self._fill_caches_if_empty(partition) if self.mutation_buffer_enabled: self._mutation_buffer.submit(row, None) if self._cache: @@ -280,6 +308,8 @@ def _cache_get( ) -> Tuple[Optional[DirectRow], Optional[bytes]]: row = None value = None + partition = key[0] + self._fill_caches_if_empty(partition) if self.mutation_buffer_enabled: row, value = self._mutation_buffer.rows.get(key, (None, None)) if value is not None: @@ -412,7 +442,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _check_key_cache(self, key): if self._key_cache: - return key in self._key_cache + return self._key_cache.exists(key) else: return False @@ -488,12 +518,6 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - # if self._key_cache is not None: - # for partition in self._active_partitions(): - # for k in self._key_cache: - # if k[0] == partition: - # yield k[1:] - # else: for row in self._iteritems(): yield row[0] except Exception as ex: @@ -570,8 +594,9 @@ def _contains(self, key: bytes) -> bool: key, partition=partition ) found = False - if self.key_cache_enabled: - found = self._check_key_cache(key_with_partition) + if self._key_cache is not None: + self._fill_caches_if_empty(partition) + found = self._key_cache.exists(key_with_partition) else: found = self._bigtable_get(key_with_partition) is not None return found @@ -580,8 +605,9 @@ def _contains(self, key: bytes) -> bool: key_with_partition = self._get_key_with_partition( key, partition=partition ) - if self.key_cache_enabled: - if self._check_key_cache(key_with_partition): + if self._key_cache is not None: + self._fill_caches_if_empty(partition) + if self._key_cache.exists(key_with_partition): return True else: return ( From cf691c2b8c2db6a4678c5c9eb7a95521893c4f66 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 12:41:00 +0200 Subject: [PATCH 182/616] removed caching from recovery --- faust/stores/bigtable.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d1d304457..e502aec6f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -727,14 +727,12 @@ def apply_changelog_batch( row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() - self._cache_del(key, row) else: row.set_cell( self.column_family_id, self.column_name, msg.value, ) - self._cache_set(offset_key, row, msg.value) row_mutations.append(row) self._persist_changelog_batch( row_mutations, From 048214dad9c5d482880754eaed20925e4f21f0dc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 12:45:24 +0200 Subject: [PATCH 183/616] fixed fill of caches --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e502aec6f..9a8430fa7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -277,10 +277,10 @@ def _setup_key_and_value_cache(self, options) -> None: def _fill_caches_if_empty(self, partition: int): if self._key_cache is not None: - if self._key_cache.check_filled(partition): + if not self._key_cache.check_filled(partition): self._key_cache.fill(self.bt_table, partition) if isinstance(self._cache, BigtableStartupCache): - if self._cache.check_filled(partition): + if not self._cache.check_filled(partition): self._cache.fill(self.bt_table, partition) def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: From 65b26235fc15e3eef680373161e5416a358076eb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 13:07:43 +0200 Subject: [PATCH 184/616] fixed recursion --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9a8430fa7..fd5d8e31f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -158,7 +158,7 @@ def fill(self, table: Table, partition: int): self._filled_partitions.add(partition) def add(self, key: bytes): - self.add(key) + self._keys.add(key) def discard(self, key: bytes): self._keys.discard(key) From e4ac878d9dd5c14507387415d940aca5953714ec Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 13:34:29 +0200 Subject: [PATCH 185/616] fixed _partitions for key --- faust/stores/bigtable.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fd5d8e31f..729d5da54 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -130,16 +130,11 @@ def keys(self): return self.data.keys() def fill(self, table: Table, partition: int) -> None: - start_time = time.time() start_key = partition.to_bytes(1, "little") end_key = (partition + 1).to_bytes(1, "little") for row in table.read_rows(start_key, end_key): row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val - self.log.info( - f"BigtableStore: StartupCache finished fill for {table.table_id} " - f"took {time.time() - start_time}s" - ) self._filled_partitions.add(partition) def check_filled(self, partition: int) -> bool: @@ -278,10 +273,14 @@ def _setup_key_and_value_cache(self, options) -> None: def _fill_caches_if_empty(self, partition: int): if self._key_cache is not None: if not self._key_cache.check_filled(partition): + start_time = time.time() self._key_cache.fill(self.bt_table, partition) + td = time.time() - start_time + self.log.info(f"KeyCache fill took {td}s for {self.table_name}:{partition}") if isinstance(self._cache, BigtableStartupCache): if not self._cache.check_filled(partition): self._cache.fill(self.bt_table, partition) + self.log.info(f"KeyCache fill took {td}s for {self.table_name}:{partition}") def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: partition = key[0] @@ -438,7 +437,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: return [self._key_index[key]] except KeyError: - return range(self.app.conf.topic_partitions) + return range(self._active_partitions()) def _check_key_cache(self, key): if self._key_cache: From bd56e396e36b6bc858399adfab7295b2742d0014 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 13:44:26 +0200 Subject: [PATCH 186/616] removed range --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 729d5da54..3bb050447 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -437,7 +437,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: return [self._key_index[key]] except KeyError: - return range(self._active_partitions()) + return self._active_partitions() def _check_key_cache(self, key): if self._key_cache: From bbdc82b3d5317567f463cc9d9f2d5295be89965f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 13:46:25 +0200 Subject: [PATCH 187/616] fixed active partitions call --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3bb050447..e20a4d960 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -437,7 +437,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: return [self._key_index[key]] except KeyError: - return self._active_partitions() + return set(self._active_partitions()) def _check_key_cache(self, key): if self._key_cache: From 5b11cda4223ae5853a63db9f7fe785fcfe64bc10 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Oct 2022 14:01:35 +0200 Subject: [PATCH 188/616] faster _keyfor partitions --- faust/stores/bigtable.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e20a4d960..205df8fe6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -276,11 +276,15 @@ def _fill_caches_if_empty(self, partition: int): start_time = time.time() self._key_cache.fill(self.bt_table, partition) td = time.time() - start_time - self.log.info(f"KeyCache fill took {td}s for {self.table_name}:{partition}") + self.log.info( + f"KeyCache fill took {td}s for {self.table_name}:{partition}" + ) if isinstance(self._cache, BigtableStartupCache): if not self._cache.check_filled(partition): self._cache.fill(self.bt_table, partition) - self.log.info(f"KeyCache fill took {td}s for {self.table_name}:{partition}") + self.log.info( + f"KeyCache fill took {td}s for {self.table_name}:{partition}" + ) def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: partition = key[0] @@ -437,7 +441,19 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: return [self._key_index[key]] except KeyError: - return set(self._active_partitions()) + active_partitions = set(self._active_partitions()) + if self._key_cache is not None: + for partition in active_partitions: + if not self._key_cache.check_filled(partition): + continue + else: + self._fill_caches_if_empty(partition) + key_with_partition = self._get_key_with_partition( + key, partition + ) + if self._key_cache.exists(key_with_partition): + return [partition] + return active_partitions def _check_key_cache(self, key): if self._key_cache: From 39e9b3558f73204929eb959ef0040aa127786abf Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 2 Nov 2022 14:33:58 +0100 Subject: [PATCH 189/616] added cachemanager --- faust/stores/bigtable.py | 360 +++++++++++++++++++++------------------ 1 file changed, 198 insertions(+), 162 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 205df8fe6..6230497ec 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -18,8 +18,8 @@ from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row import DirectRow -from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.row_filters import CellsColumnLimitFilter +from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.table import Table from mode.utils.collections import LRUCache from yarl import URL @@ -165,17 +165,170 @@ def check_filled(self, partition: int) -> bool: return partition in self._filled_partitions +class BigTableCacheManager: + _partition_cache: LRUCache[bytes, int] + _value_cache: Optional[ + Union[LRUCache[bytes, Union[bytes, None]], BigtableStartupCache] + ] + _mutation_buffer: Optional[BigtableMutationBuffer] + _key_cache: Optional[BigTableKeyCache] + + def __init__(self, app, options: Dict, bt_table) -> None: + self._registered_partitions = set() + self.bt_table = bt_table + self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) + self._init_value_cache(options) + self._init_key_cache(options) + self._init_mutation_buffer(options, bt_table) + + def _fill_caches(self, partition: int): + if self._key_cache is not None: + self._key_cache.fill(self.bt_table, partition) + if isinstance(self._value_cache, BigtableStartupCache): + self._value_cache.fill(self.bt_table, partition) + + @staticmethod + def _register_partition(func): + def inner(self, bt_key: bytes, *args): + partition = bt_key[0] + if partition not in self._registered_partitions: + self._fill_caches(self, partition) + return func(self, bt_key) + + return inner + + @_register_partition + def get( + self, bt_key: bytes + ) -> Tuple[Optional[DirectRow], Optional[bytes]]: + row = None + value = None + if self._mutation_buffer is not None: + row, value = self._mutation_buffer.rows.get(bt_key, (None, None)) + if value is not None: + return row, value + if self._value_cache is not None: + if bt_key in self._value_cache.keys(): + value = self._value_cache[bt_key] + return row, value + + def get_partition(self, user_key: bytes) -> int: + return self._partition_cache[user_key] + + def set_partition(self, user_key: bytes, partition: int): + self._partition_cache[user_key] = partition + + def get_mutation_buffer(self) -> Optional[BigtableMutationBuffer]: + return self._mutation_buffer + + @_register_partition + def contains(self, bt_key: bytes) -> Optional[bool]: + """ + If we return None here, this means, that no assumption + about the current key can be made. + """ + user_key = bt_key[1:] + if user_key in self._partition_cache.keys(): + return True + if self._key_cache is not None: + if self._key_cache.exists(bt_key): + return True + else: + return False + if self._value_cache is not None: + if bt_key in self._value_cache.keys(): + return True + if self._mutation_buffer is not None: + value = self._mutation_buffer.rows.get(bt_key, (None, None))[1] + if value is not None: + return True + + def get_key_iterator_if_exists(self) -> Optional[Iterable]: + if ( + self._key_cache is not None + and len(self._registered_partitions) > 0 + ): + return self._key_cache._keys + else: + return None + + @_register_partition + def set( + self, bt_key: bytes, row: DirectRow, value: Optional[bytes] + ) -> None: + if self._value_cache is not None: + self._value_cache[bt_key] = value + if self._mutation_buffer is not None: + self._mutation_buffer.submit(row, value) + if self._key_cache: + self._key_cache.add(bt_key) + + def delete(self, bt_key: bytes, row: DirectRow) -> None: + user_key = bt_key[1:] + self._partition_cache.pop(user_key, None) + if self._mutation_buffer is not None: + self._mutation_buffer.submit(row, None) + if self._value_cache is not None: + del self._value_cache[bt_key] + if self._key_cache: + self._key_cache.discard(bt_key) + + def _init_value_cache( + self, options + ) -> Optional[Union[LRUCache, BigtableStartupCache]]: + value_cache_type = options.get( + BigTableStore.VALUE_CACHE_TYPE_KEY, None + ) + + if value_cache_type == "startup": + startup_cache_ttl = options.get( + BigTableStore.STARTUPCACHE_TTL_KEY, None + ) + self._value_cache = BigtableStartupCache(startup_cache_ttl) + elif value_cache_type == "forever": + value_cache_size = options.get( + BigTableStore.VALUE_CACHE_SIZE_KEY, 1_000 + ) + + self._value_cache = LRUCache(limit=value_cache_size) + else: + self._value_cache = None + + def _init_key_cache(self, options: Dict): + key_cache_enabled = options.get( + BigTableStore.KEY_CACHE_ENABLE_KEY, False + ) + if key_cache_enabled: + self._key_cache = BigTableKeyCache() + else: + self._key_cache = None + + def _init_mutation_buffer(self, options: Dict, bt_table: Table): + mutation_buffer_enabled = options.get( + BigTableStore.BT_ENABLE_MUTATION_BUFFER_KEY, False + ) + if mutation_buffer_enabled: + limit = options.get( + BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 + ) + freq = options.get( + BigTableStore.BT_MUTATION_BUFFER_FREQ_KEY, 30 * 60 + ) + self._mutation_buffer = BigtableMutationBuffer( + bt_table, freq, limit + ) + else: + self._mutation_buffer = None + + class BigTableStore(base.SerializedStore): """Bigtable table storage.""" client: Client instance: Instance bt_table: Table + _cache: BigTableCacheManager - _key_index: LRUCache[bytes, int] - _cache: Optional[Union[LRUCache[bytes, bytes], BigtableStartupCache]] - _mutation_buffer: Optional[BigtableMutationBuffer] - _key_cache: Optional[BigTableKeyCache] BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_ENABLE_MUTATION_BUFFER_KEY = "bt_enable_mutation_buffer_key" BT_INSTANCE_KEY = "bt_instance_key" @@ -200,39 +353,21 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(app, options) - self._key_index = LRUCache(limit=app.conf.table_key_index_size) - self._cache = None - self._key_cache = None - self._mutation_buffer = None try: self._bigtable_setup(table, options) - self._setup_mutation_buffer(options) - self._setup_key_and_value_cache(options) + self._cache = BigTableCacheManager(app, options, self.bt_table) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) def _set_options(self, app, options) -> None: - self.key_cache_enabled = options.get( - BigTableStore.KEY_CACHE_ENABLE_KEY, False - ) - self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) self.bt_start_key, self.bt_end_key = options.get( BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] ) - self.value_cache_type = options.get( - BigTableStore.VALUE_CACHE_TYPE_KEY, None - ) - self.value_cache_size = options.get( - BigTableStore.VALUE_CACHE_SIZE_KEY, app.conf.table_key_index_size - ) - self.mutation_buffer_enabled = options.get( - BigTableStore.BT_ENABLE_MUTATION_BUFFER_KEY, False - ) self.column_name = options.get( BigTableStore.BT_COLUMN_NAME_KEY, "DATA" ) @@ -243,85 +378,6 @@ def _set_options(self, app, options) -> None: BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" ) - def _setup_mutation_buffer(self, options) -> None: - if self.mutation_buffer_enabled: - limit = options.get( - BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 - ) - freq = options.get( - BigTableStore.BT_MUTATION_BUFFER_FREQ_KEY, 30 * 60 - ) - self._mutation_buffer = BigtableMutationBuffer( - self.bt_table, freq, limit - ) - - def _setup_key_and_value_cache(self, options) -> None: - if self.value_cache_type == "startup": - startup_cache_ttl = options.get( - BigTableStore.STARTUPCACHE_TTL_KEY, None - ) - self._cache = BigtableStartupCache(startup_cache_ttl) - elif self.value_cache_type == "forever": - self._cache = LRUCache(limit=self.value_cache_size) - else: - raise NotImplementedError( - f"VALUE_CACHE_TYPE '{self.value_cache_type}'" - ) - if self.key_cache_enabled and self._key_cache is None: - self._key_cache = BigTableKeyCache() - - def _fill_caches_if_empty(self, partition: int): - if self._key_cache is not None: - if not self._key_cache.check_filled(partition): - start_time = time.time() - self._key_cache.fill(self.bt_table, partition) - td = time.time() - start_time - self.log.info( - f"KeyCache fill took {td}s for {self.table_name}:{partition}" - ) - if isinstance(self._cache, BigtableStartupCache): - if not self._cache.check_filled(partition): - self._cache.fill(self.bt_table, partition) - self.log.info( - f"KeyCache fill took {td}s for {self.table_name}:{partition}" - ) - - def _cache_set(self, key: bytes, row: DirectRow, value: bytes) -> None: - partition = key[0] - self._fill_caches_if_empty(partition) - if self._cache is not None: - self._cache[key] = value - if self.mutation_buffer_enabled: - self._mutation_buffer.submit(row, value) - if self._key_cache: - self._key_cache.add(key) - - def _cache_del(self, key: bytes, row: DirectRow) -> None: - partition = key[0] - self._fill_caches_if_empty(partition) - if self.mutation_buffer_enabled: - self._mutation_buffer.submit(row, None) - if self._cache: - del self._cache[key] - if self._key_cache: - self._key_cache.discard(key) - - def _cache_get( - self, key: bytes - ) -> Tuple[Optional[DirectRow], Optional[bytes]]: - row = None - value = None - partition = key[0] - self._fill_caches_if_empty(partition) - if self.mutation_buffer_enabled: - row, value = self._mutation_buffer.rows.get(key, (None, None)) - if value is not None: - return row, value - if self._cache is not None: - if key in self._cache.keys(): - value = self._cache[key] - return row, value - def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) self.client: Client = Client( @@ -338,7 +394,6 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): f"BigTableStore: Making new bigtablestore with {self.bt_table_name=} " f"for {table.name}" ) - # TODO: add columns families to options self.bt_table.create( column_families={ self.column_family_id: column_family.MaxVersionsGCRule(1) @@ -354,8 +409,8 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value - def _bigtable_get(self, key: bytes): - row, value = self._cache_get(key) + def _bigtable_get(self, key: bytes) -> Optional[bytes]: + row, value = self._cache.get(key) if value is not None: return value elif row is not None and value is None: @@ -375,7 +430,7 @@ def _bigtable_get_range( ) -> Tuple[bytes, Optional[bytes]]: # first search cache: for key in keys: - row, value = self._cache_get(key) + row, value = self._cache.get(key) if value is not None: return key, value elif row is not None and value is None: @@ -393,16 +448,16 @@ def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False ): if not persist_offset: - row = self._cache_get(key)[0] + row = self._cache.get(key)[0] if row is None: row = self.bt_table.direct_row(key) - if not self.mutation_buffer_enabled: + if self._cache.get_mutation_buffer() is None: row.set_cell( self.column_family_id, self.column_name, value, ) - self._cache_set(key, row, value) + self._cache.set(key, row, value) else: row = self.bt_table.direct_row(key) row.set_cell( @@ -413,13 +468,13 @@ def _bigtable_set( row.commit() def _bigtable_del(self, key: bytes): - row = self._cache_get(key)[0] + row = self._cache.get(key)[0] if row is None: row = self.bt_table.direct_row(key) - if not self.mutation_buffer_enabled: + if self._cache.get_mutation_buffer() is None: row.delete() row.commit() - self._cache_del(key, row) + self._cache.delete(key, row) def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -439,27 +494,9 @@ def _get_key_with_partition(self, key: bytes, partition): def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: - return [self._key_index[key]] + return [self._cache.get_partition(key)] except KeyError: - active_partitions = set(self._active_partitions()) - if self._key_cache is not None: - for partition in active_partitions: - if not self._key_cache.check_filled(partition): - continue - else: - self._fill_caches_if_empty(partition) - key_with_partition = self._get_key_with_partition( - key, partition - ) - if self._key_cache.exists(key_with_partition): - return [partition] - return active_partitions - - def _check_key_cache(self, key): - if self._key_cache: - return self._key_cache.exists(key) - else: - return False + return self._active_partitions() def _get(self, key: bytes) -> Optional[bytes]: try: @@ -471,7 +508,7 @@ def _get(self, key: bytes) -> Optional[bytes]: value = self._bigtable_get(key_with_partition) if value is not None: - self._key_index[key] = partition + self._cache.set_partition(key, partition) return value else: keys = set() @@ -483,8 +520,8 @@ def _get(self, key: bytes) -> Optional[bytes]: key, value = self._bigtable_get_range(keys) if value is not None: - partition = key[1] - self._key_index[key[1:]] = partition + partition = key[0] + self._cache.set_partition(key[1:], partition) return value return None except KeyError as ke: @@ -505,7 +542,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: key, partition=partition ) self._bigtable_set(key_with_partition, value) - self._key_index[key] = partition + self._cache.set_partition(key, partition) except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -521,9 +558,6 @@ def _del(self, key: bytes) -> None: key, partition=partition ) self._bigtable_del(key_with_partition) - - if key in self._key_index: - del self._key_index[key] except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " @@ -533,8 +567,13 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - for row in self._iteritems(): - yield row[0] + cache_iterator = self._cache.get_key_iterator_if_exists() + if cache_iterator is not None: + for key in cache_iterator: + yield key[1:] + else: + for row in self._iteritems(): + yield row[0] except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -575,15 +614,17 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: start_key=start_key, end_key=end_key, ): - if self.mutation_buffer_enabled: + mutation_buffer = self._cache.get_mutation_buffer() + if mutation_buffer is not None: # We want to yield the mutation if any is buffered - mut_row, value = self._mutation_buffer.rows.get( + mut_row, value = mutation_buffer.rows.get( row.row_key, (None, None) ) if value is not None: yield (row.row_key[1:], value) continue elif mut_row is not None: + # This means that row will be deleted continue yield ( row.row_key[1:], @@ -608,27 +649,21 @@ def _contains(self, key: bytes) -> bool: key_with_partition = self._get_key_with_partition( key, partition=partition ) - found = False - if self._key_cache is not None: - self._fill_caches_if_empty(partition) - found = self._key_cache.exists(key_with_partition) - else: + found = self._cache.contains(key_with_partition) + if found is None: found = self._bigtable_get(key_with_partition) is not None return found else: + searched_keys = set() for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( key, partition=partition ) - if self._key_cache is not None: - self._fill_caches_if_empty(partition) - if self._key_cache.exists(key_with_partition): - return True - else: - return ( - self._bigtable_get(key_with_partition) is not None - ) - return False + if self._cache.contains(key_with_partition): + return True + searched_keys.add(key_with_partition) + + return self._bigtable_get_range(searched_keys)[1] is not None except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " @@ -679,14 +714,15 @@ def set_persisted_offset( we were not an active replica. """ try: - if self.mutation_buffer_enabled and not recovery: - if self._mutation_buffer.check_flush(): - num_mutations = len(self._mutation_buffer.rows) + mutation_buffer = self._cache.get_mutation_buffer() + if mutation_buffer is not None and not recovery: + if mutation_buffer.check_flush(): + num_mutations = len(mutation_buffer.rows) self.log.info( f"Will flush BigtableMutationBuffer with {num_mutations} " f"mutations for table {self.table_name}..." ) - self._mutation_buffer.flush() + mutation_buffer.flush() offset_key = self.get_offset_key(tp).encode() self._bigtable_set( offset_key, str(offset).encode(), persist_offset=True From 0e8591d0dc685cae52fc7ef770d0fb1ccb81abf5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 2 Nov 2022 14:50:18 +0100 Subject: [PATCH 190/616] fixed decorator --- faust/stores/bigtable.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6230497ec..b8d1c3ad2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -165,6 +165,16 @@ def check_filled(self, partition: int) -> bool: return partition in self._filled_partitions +def _register_partition(func): + def inner(self, bt_key: bytes, *args): + partition = bt_key[0] + if partition not in self._registered_partitions: + self._fill_caches(self, partition) + return func(self, bt_key) + + return inner + + class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[ @@ -187,16 +197,6 @@ def _fill_caches(self, partition: int): if isinstance(self._value_cache, BigtableStartupCache): self._value_cache.fill(self.bt_table, partition) - @staticmethod - def _register_partition(func): - def inner(self, bt_key: bytes, *args): - partition = bt_key[0] - if partition not in self._registered_partitions: - self._fill_caches(self, partition) - return func(self, bt_key) - - return inner - @_register_partition def get( self, bt_key: bytes From 69f778f22d5b825a53b9134a55dc4040cd9350fb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 2 Nov 2022 15:28:56 +0100 Subject: [PATCH 191/616] fixed wrong call to fill caches --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b8d1c3ad2..db6bc1118 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -169,7 +169,7 @@ def _register_partition(func): def inner(self, bt_key: bytes, *args): partition = bt_key[0] if partition not in self._registered_partitions: - self._fill_caches(self, partition) + self._fill_caches(partition) return func(self, bt_key) return inner From f7cc8ccfd0873505a599e1afe014c9b8ff114fc9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 2 Nov 2022 16:09:58 +0100 Subject: [PATCH 192/616] fixed wrapper --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index db6bc1118..8208f05b5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -170,7 +170,7 @@ def inner(self, bt_key: bytes, *args): partition = bt_key[0] if partition not in self._registered_partitions: self._fill_caches(partition) - return func(self, bt_key) + return func(self, bt_key, *args) return inner From 64488bbd77a116cfc13eac00903e47965ec27df4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 2 Nov 2022 16:39:02 +0100 Subject: [PATCH 193/616] fixed endless reads --- faust/stores/bigtable.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8208f05b5..84e2bb777 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -170,6 +170,7 @@ def inner(self, bt_key: bytes, *args): partition = bt_key[0] if partition not in self._registered_partitions: self._fill_caches(partition) + self._registered_partitions.add(partition) return func(self, bt_key, *args) return inner @@ -192,10 +193,17 @@ def __init__(self, app, options: Dict, bt_table) -> None: self._init_mutation_buffer(options, bt_table) def _fill_caches(self, partition: int): + log = logging.getLogger(__name__) + start = time.time() if self._key_cache is not None: self._key_cache.fill(self.bt_table, partition) if isinstance(self._value_cache, BigtableStartupCache): self._value_cache.fill(self.bt_table, partition) + td = time.time() - start + log.info( + f"BigtabeStore: filled cache for {self.bt_table.id}:" + f"{partition} in {td}s" + ) @_register_partition def get( From 30d6731e8d4ed3270fe1a0bf589d64bc24f5c95e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 07:50:20 +0100 Subject: [PATCH 194/616] fixed type hints --- faust/stores/bigtable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 84e2bb777..7bdf142d5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -360,7 +360,7 @@ def __init__( options: Dict[str, Any], **kwargs: Any, ) -> None: - self._set_options(app, options) + self._set_options(options) try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) @@ -369,7 +369,7 @@ def __init__( raise ex super().__init__(url, app, table, **kwargs) - def _set_options(self, app, options) -> None: + def _set_options(self, options) -> None: self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) @@ -430,7 +430,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: self.log.info(f"{key=} not found in {self.table_name}") value = None else: - value: bytes = self.bigtable_exrtact_row_data(res) + value = self.bigtable_exrtact_row_data(res) return value def _bigtable_get_range( From d596cc33aac70b50617a6170a92f74033837b0c5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 07:57:03 +0100 Subject: [PATCH 195/616] fixe table id call --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7bdf142d5..881cf0da3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -184,7 +184,7 @@ class BigTableCacheManager: _mutation_buffer: Optional[BigtableMutationBuffer] _key_cache: Optional[BigTableKeyCache] - def __init__(self, app, options: Dict, bt_table) -> None: + def __init__(self, app, options: Dict, bt_table: Table) -> None: self._registered_partitions = set() self.bt_table = bt_table self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) @@ -201,7 +201,7 @@ def _fill_caches(self, partition: int): self._value_cache.fill(self.bt_table, partition) td = time.time() - start log.info( - f"BigtabeStore: filled cache for {self.bt_table.id}:" + f"BigtabeStore: filled cache for {self.bt_table.table_id}:" f"{partition} in {td}s" ) From 81e87b9ea870d331a15b33fbaff6b4651d95f2ca Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 08:17:32 +0100 Subject: [PATCH 196/616] fixed contains get range --- faust/stores/bigtable.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 881cf0da3..c6f07628f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -201,7 +201,7 @@ def _fill_caches(self, partition: int): self._value_cache.fill(self.bt_table, partition) td = time.time() - start log.info( - f"BigtabeStore: filled cache for {self.bt_table.table_id}:" + f"BigTabeStore: filled cache for {self.bt_table.table_id}:" f"{partition} in {td}s" ) @@ -435,7 +435,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_get_range( self, keys: Set[bytes] - ) -> Tuple[bytes, Optional[bytes]]: + ) -> Tuple[Optional[bytes], Optional[bytes]]: # first search cache: for key in keys: row, value = self._cache.get(key) @@ -451,6 +451,8 @@ def _bigtable_get_range( for row in self.bt_table.read_rows(row_set=rows): # First hit will return return row.row_key, BigTableStore.bigtable_exrtact_row_data(row) + # Not found + return None, None def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False From ef44521a8491b9d8de1c98064a72389c2d361713 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 09:49:37 +0100 Subject: [PATCH 197/616] faster cache init --- faust/stores/bigtable.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c6f07628f..9e7eeec76 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -195,13 +195,16 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: def _fill_caches(self, partition: int): log = logging.getLogger(__name__) start = time.time() - if self._key_cache is not None: - self._key_cache.fill(self.bt_table, partition) if isinstance(self._value_cache, BigtableStartupCache): self._value_cache.fill(self.bt_table, partition) + if self._key_cache is not None: + self._key_cache._keys = set(self._value_cache.keys()) + else: + if self._key_cache is not None: + self._key_cache.fill(self.bt_table, partition) td = time.time() - start log.info( - f"BigTabeStore: filled cache for {self.bt_table.table_id}:" + f"BigTableStore: filled cache for {self.bt_table.table_id}:" f"{partition} in {td}s" ) From 101e20224bbf3ce1e16eb516e705ea4d3e875603 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 11:22:35 +0100 Subject: [PATCH 198/616] try faster _contains method --- faust/stores/bigtable.py | 51 ++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9e7eeec76..df6d6fdfe 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -142,7 +142,6 @@ def check_filled(self, partition: int) -> bool: class BigTableKeyCache: - _filled_partitions: Set[int] = set() _keys: Set[bytes] = set() def fill(self, table: Table, partition: int): @@ -150,7 +149,6 @@ def fill(self, table: Table, partition: int): end_key = (partition + 1).to_bytes(1, "little") for row in table.read_rows(start_key, end_key): self.add(row.row_key) - self._filled_partitions.add(partition) def add(self, key: bytes): self._keys.add(key) @@ -161,9 +159,6 @@ def discard(self, key: bytes): def exists(self, key: bytes) -> bool: return key in self._keys - def check_filled(self, partition: int) -> bool: - return partition in self._filled_partitions - def _register_partition(func): def inner(self, bt_key: bytes, *args): @@ -242,17 +237,38 @@ def contains(self, bt_key: bytes) -> Optional[bool]: if user_key in self._partition_cache.keys(): return True if self._key_cache is not None: - if self._key_cache.exists(bt_key): - return True - else: - return False - if self._value_cache is not None: - if bt_key in self._value_cache.keys(): - return True + return self._key_cache.exists(bt_key) if self._mutation_buffer is not None: value = self._mutation_buffer.rows.get(bt_key, (None, None))[1] if value is not None: return True + if self._value_cache is not None: + if ( + isinstance(self._value_cache, BigtableStartupCache) + and self._value_cache.ttl_over is False + ): + return bt_key in self._value_cache.keys() + else: + if bt_key in self._value_cache.keys(): + return True + return None + + def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: + definately_not_found = [] + for key in key_set: + found = self.contains(key) + if found is True: + return True + elif found is None: + definately_not_found.append(False) + else: + definately_not_found.append(True) + + if all(definately_not_found): + # Now we can be sure that the key does not exist + return False + # No assumption possible + return None def get_key_iterator_if_exists(self) -> Optional[Iterable]: if ( @@ -667,16 +683,17 @@ def _contains(self, key: bytes) -> bool: found = self._bigtable_get(key_with_partition) is not None return found else: - searched_keys = set() + keys_to_search = set() for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( key, partition=partition ) - if self._cache.contains(key_with_partition): - return True - searched_keys.add(key_with_partition) + keys_to_search.add(key_with_partition) - return self._bigtable_get_range(searched_keys)[1] is not None + found = self._cache.contains_any(keys_to_search) + if found is None: + found = self._bigtable_get_range(keys_to_search)[1] is not None + return found except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " From fbc945310d33616454d3eb8c2f7958e486b26cee Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 12:15:09 +0100 Subject: [PATCH 199/616] added set operations for faster contains calls --- faust/stores/bigtable.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index df6d6fdfe..48e078ebd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -97,7 +97,6 @@ def __init__(self, ttl: Optional[int]) -> None: self.ttl = ttl self.ttl_over = False self.init_ts = int(time.time()) - self._filled_partitions: Set[int] = set() def __len__(self): return len(self.data) @@ -135,10 +134,6 @@ def fill(self, table: Table, partition: int) -> None: for row in table.read_rows(start_key, end_key): row_val = BigTableStore.bigtable_exrtact_row_data(row) self.data[row.row_key] = row_val - self._filled_partitions.add(partition) - - def check_filled(self, partition: int) -> bool: - return partition in self._filled_partitions class BigTableKeyCache: @@ -180,6 +175,7 @@ class BigTableCacheManager: _key_cache: Optional[BigTableKeyCache] def __init__(self, app, options: Dict, bt_table: Table) -> None: + self.log = logging.getLogger(__name__) self._registered_partitions = set() self.bt_table = bt_table self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) @@ -188,7 +184,6 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._init_mutation_buffer(options, bt_table) def _fill_caches(self, partition: int): - log = logging.getLogger(__name__) start = time.time() if isinstance(self._value_cache, BigtableStartupCache): self._value_cache.fill(self.bt_table, partition) @@ -198,7 +193,7 @@ def _fill_caches(self, partition: int): if self._key_cache is not None: self._key_cache.fill(self.bt_table, partition) td = time.time() - start - log.info( + self.log.info( f"BigTableStore: filled cache for {self.bt_table.table_id}:" f"{partition} in {td}s" ) @@ -254,6 +249,17 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: + partitions = {k[0] for k in key_set} + # Check if all partitions are already registered + if self._registered_partitions.issuperset(partitions): + if self._key_cache is not None: + return not self._key_cache._keys.isdisjoint(key_set) + if ( + isinstance(self._value_cache, BigtableStartupCache) + and not self._value_cache.ttl_over + ): + return not self._value_cache.keys().isdisjoint(key_set) + definately_not_found = [] for key in key_set: found = self.contains(key) @@ -692,7 +698,9 @@ def _contains(self, key: bytes) -> bool: found = self._cache.contains_any(keys_to_search) if found is None: - found = self._bigtable_get_range(keys_to_search)[1] is not None + found = ( + self._bigtable_get_range(keys_to_search)[1] is not None + ) return found except Exception as ex: self.log.error( From 46ece5fde1e9a44876630015505a773d372621b9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 13:17:14 +0100 Subject: [PATCH 200/616] faster contains --- faust/stores/bigtable.py | 59 ++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 48e078ebd..e58980d88 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -128,23 +128,10 @@ def _maybe_ttl_clear(self): def keys(self): return self.data.keys() - def fill(self, table: Table, partition: int) -> None: - start_key = partition.to_bytes(1, "little") - end_key = (partition + 1).to_bytes(1, "little") - for row in table.read_rows(start_key, end_key): - row_val = BigTableStore.bigtable_exrtact_row_data(row) - self.data[row.row_key] = row_val - class BigTableKeyCache: _keys: Set[bytes] = set() - def fill(self, table: Table, partition: int): - start_key = partition.to_bytes(1, "little") - end_key = (partition + 1).to_bytes(1, "little") - for row in table.read_rows(start_key, end_key): - self.add(row.row_key) - def add(self, key: bytes): self._keys.add(key) @@ -159,8 +146,7 @@ def _register_partition(func): def inner(self, bt_key: bytes, *args): partition = bt_key[0] if partition not in self._registered_partitions: - self._fill_caches(partition) - self._registered_partitions.add(partition) + self._fill_caches({partition}) return func(self, bt_key, *args) return inner @@ -183,19 +169,28 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._init_key_cache(options) self._init_mutation_buffer(options, bt_table) - def _fill_caches(self, partition: int): + def _fill_caches(self, partitions: Set[int]): + partitions = self._registered_partitions.difference(partitions) + if len(partitions) == 0: + return # Nothing todo + row_set = RowSet() + for p in partitions: + row_set.add_row_range_with_prefix( + p.to_bytes(1, byteorder="little") + ) + start = time.time() - if isinstance(self._value_cache, BigtableStartupCache): - self._value_cache.fill(self.bt_table, partition) + for row in self.bt_table.read_rows(row_set=row_set): + if isinstance(self._value_cache, BigtableStartupCache): + row_val = BigTableStore.bigtable_exrtact_row_data(row) + self._value_cache.data[row.row_key] = row_val if self._key_cache is not None: - self._key_cache._keys = set(self._value_cache.keys()) - else: - if self._key_cache is not None: - self._key_cache.fill(self.bt_table, partition) + self._key_cache.add(row.row_key) + self._registered_partitions.update(partitions) td = time.time() - start self.log.info( f"BigTableStore: filled cache for {self.bt_table.table_id}:" - f"{partition} in {td}s" + f"{partitions=} in {td}s" ) @_register_partition @@ -250,15 +245,15 @@ def contains(self, bt_key: bytes) -> Optional[bool]: def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: partitions = {k[0] for k in key_set} - # Check if all partitions are already registered - if self._registered_partitions.issuperset(partitions): - if self._key_cache is not None: - return not self._key_cache._keys.isdisjoint(key_set) - if ( - isinstance(self._value_cache, BigtableStartupCache) - and not self._value_cache.ttl_over - ): - return not self._value_cache.keys().isdisjoint(key_set) + self._fill_caches(partitions) + + if self._key_cache is not None: + return not self._key_cache._keys.isdisjoint(key_set) + if ( + isinstance(self._value_cache, BigtableStartupCache) + and not self._value_cache.ttl_over + ): + return not self._value_cache.keys().isdisjoint(key_set) definately_not_found = [] for key in key_set: From bef2a22a35122161e92a6324651b1087b8ff4ddc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 13:27:01 +0100 Subject: [PATCH 201/616] fixed contains --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e58980d88..51662cd9c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -172,7 +172,7 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: def _fill_caches(self, partitions: Set[int]): partitions = self._registered_partitions.difference(partitions) if len(partitions) == 0: - return # Nothing todo + return # Nothing todo row_set = RowSet() for p in partitions: row_set.add_row_range_with_prefix( From 6bfa0329554110b0a9e578ec8dd096184199010f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 14:24:18 +0100 Subject: [PATCH 202/616] added higher deadline for faster responses --- faust/stores/bigtable.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 51662cd9c..5277352eb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -18,8 +18,10 @@ from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row import DirectRow +from google.cloud.bigtable.row_data import DEFAULT_RETRY_READ_ROWS from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet +from google.api_core.retry import Retry from google.cloud.bigtable.table import Table from mode.utils.collections import LRUCache from yarl import URL @@ -170,17 +172,21 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._init_mutation_buffer(options, bt_table) def _fill_caches(self, partitions: Set[int]): + partitions = self._registered_partitions.difference(partitions) if len(partitions) == 0: return # Nothing todo + start = time.time() row_set = RowSet() for p in partitions: row_set.add_row_range_with_prefix( p.to_bytes(1, byteorder="little") ) - start = time.time() - for row in self.bt_table.read_rows(row_set=row_set): + for row in self.bt_table.read_rows( + row_set=row_set, filter_=CellsColumnLimitFilter(1), + retry=DEFAULT_RETRY_READ_ROWS.with_deadline(10*60) # High deadline cause slow + ): if isinstance(self._value_cache, BigtableStartupCache): row_val = BigTableStore.bigtable_exrtact_row_data(row) self._value_cache.data[row.row_key] = row_val From ed81377f74b7db32e9b372d90f75b90bb1ea8694 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 14:56:00 +0100 Subject: [PATCH 203/616] fixed fill caches --- faust/stores/bigtable.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5277352eb..654fbc528 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -147,8 +147,7 @@ def exists(self, key: bytes) -> bool: def _register_partition(func): def inner(self, bt_key: bytes, *args): partition = bt_key[0] - if partition not in self._registered_partitions: - self._fill_caches({partition}) + self._fill_caches({partition}) return func(self, bt_key, *args) return inner @@ -173,7 +172,7 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: def _fill_caches(self, partitions: Set[int]): - partitions = self._registered_partitions.difference(partitions) + partitions = partitions.difference(self._registered_partitions) if len(partitions) == 0: return # Nothing todo start = time.time() @@ -184,8 +183,11 @@ def _fill_caches(self, partitions: Set[int]): ) for row in self.bt_table.read_rows( - row_set=row_set, filter_=CellsColumnLimitFilter(1), - retry=DEFAULT_RETRY_READ_ROWS.with_deadline(10*60) # High deadline cause slow + row_set=row_set, + filter_=CellsColumnLimitFilter(1), + retry=DEFAULT_RETRY_READ_ROWS.with_deadline( + 10 * 60 + ), # High deadline cause slow ): if isinstance(self._value_cache, BigtableStartupCache): row_val = BigTableStore.bigtable_exrtact_row_data(row) From 14d192245be831ae32468106106e0277f89d6a82 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 15:30:43 +0100 Subject: [PATCH 204/616] fix to bytes call --- faust/stores/bigtable.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 654fbc528..c54737660 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -178,9 +178,7 @@ def _fill_caches(self, partitions: Set[int]): start = time.time() row_set = RowSet() for p in partitions: - row_set.add_row_range_with_prefix( - p.to_bytes(1, byteorder="little") - ) + row_set.add_row_range_with_prefix(bytes([p])) for row in self.bt_table.read_rows( row_set=row_set, From 4433a80dc484f271f45681df25b0ea5c472c26c5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 3 Nov 2022 15:48:45 +0100 Subject: [PATCH 205/616] fixed fill caches again --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c54737660..3d798a46c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -178,7 +178,7 @@ def _fill_caches(self, partitions: Set[int]): start = time.time() row_set = RowSet() for p in partitions: - row_set.add_row_range_with_prefix(bytes([p])) + row_set.add_row_range_with_prefix(chr(p)) for row in self.bt_table.read_rows( row_set=row_set, From be84184128f962abc152244d9ad04ea45da911fa Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 09:21:07 +0100 Subject: [PATCH 206/616] fixed key cache access --- faust/stores/bigtable.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3d798a46c..27cefb7e0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -21,7 +21,6 @@ from google.cloud.bigtable.row_data import DEFAULT_RETRY_READ_ROWS from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet -from google.api_core.retry import Retry from google.cloud.bigtable.table import Table from mode.utils.collections import LRUCache from yarl import URL @@ -110,9 +109,10 @@ def __getitem__(self, key): return res def __setitem__(self, key, value) -> None: - self._maybe_ttl_clear() - if self.ttl is not None: - self.data[key] = value + if value is not None: + self._maybe_ttl_clear() + if self.ttl is not None: + self.data[key] = value def __delitem__(self, key): self.data.pop(key, None) @@ -229,11 +229,13 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - user_key = bt_key[1:] - if user_key in self._partition_cache.keys(): - return True if self._key_cache is not None: return self._key_cache.exists(bt_key) + if ( + isinstance(self._value_cache, BigtableStartupCache) + and not self._value_cache.ttl_over + ): + return bt_key in self._value_cache.keys() if self._mutation_buffer is not None: value = self._mutation_buffer.rows.get(bt_key, (None, None))[1] if value is not None: @@ -277,7 +279,7 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: # No assumption possible return None - def get_key_iterator_if_exists(self) -> Optional[Iterable]: + def get_key_iterable_if_exists(self) -> Optional[Iterable]: if ( self._key_cache is not None and len(self._registered_partitions) > 0 @@ -294,7 +296,7 @@ def set( self._value_cache[bt_key] = value if self._mutation_buffer is not None: self._mutation_buffer.submit(row, value) - if self._key_cache: + if self._key_cache is not None: self._key_cache.add(bt_key) def delete(self, bt_key: bytes, row: DirectRow) -> None: @@ -304,7 +306,7 @@ def delete(self, bt_key: bytes, row: DirectRow) -> None: self._mutation_buffer.submit(row, None) if self._value_cache is not None: del self._value_cache[bt_key] - if self._key_cache: + if self._key_cache is not None: self._key_cache.discard(bt_key) def _init_value_cache( @@ -603,7 +605,7 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - cache_iterator = self._cache.get_key_iterator_if_exists() + cache_iterator = self._cache.get_key_iterable_if_exists() if cache_iterator is not None: for key in cache_iterator: yield key[1:] From 6b00ad8c2ee6bb2a59dfb02a81afe8a1e3772a2b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 10:25:38 +0100 Subject: [PATCH 207/616] logging and changes to cache get --- faust/stores/bigtable.py | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 27cefb7e0..999fd549e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -207,7 +207,7 @@ def get( value = None if self._mutation_buffer is not None: row, value = self._mutation_buffer.rows.get(bt_key, (None, None)) - if value is not None: + if row is not None: return row, value if self._value_cache is not None: if bt_key in self._value_cache.keys(): @@ -389,6 +389,7 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) + self._tracked_key = None try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) @@ -452,6 +453,9 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: elif row is not None and value is None: return value else: + self._log_and_maybe_set_tracked_key( + key[1:], "'not found in cache get'" + ) res = self.bt_table.read_row(key, filter_=self.row_filter) row = self.bt_table.direct_row(key) if res is None: @@ -476,10 +480,18 @@ def _bigtable_get_range( for key in keys: rows.add_row_key(key) - for row in self.bt_table.read_rows(row_set=rows): + self._log_and_maybe_set_tracked_key( + keys.pop()[1:], "'not found in cache get (no partition)'" + ) + for row in self.bt_table.read_rows( + row_set=rows, filter_=CellsColumnLimitFilter(1) + ): # First hit will return return row.row_key, BigTableStore.bigtable_exrtact_row_data(row) # Not found + self._log_and_maybe_set_tracked_key( + keys.pop()[1:], "'not found in cache get (no partition)'" + ) return None, None def _bigtable_set( @@ -506,9 +518,8 @@ def _bigtable_set( row.commit() def _bigtable_del(self, key: bytes): - row = self._cache.get(key)[0] - if row is None: - row = self.bt_table.direct_row(key) + row = self.bt_table.direct_row(key) + self._log_and_maybe_set_tracked_key(key[1:], "'deleted'") if self._cache.get_mutation_buffer() is None: row.delete() row.commit() @@ -536,8 +547,16 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: except KeyError: return self._active_partitions() + def _log_and_maybe_set_tracked_key(self, key, msg): + if self._tracked_key is None: + self._tacked_key = key + self.log.info(f"Set tracked key to {key}: {msg}") + else: + self.log.info(f"Tracked {key=}: {msg}") + def _get(self, key: bytes) -> Optional[bytes]: try: + self._log_and_maybe_set_tracked_key(key, "'get'") partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( @@ -575,6 +594,7 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: + self._log_and_maybe_set_tracked_key(key, "'set'") partition = get_current_partition() key_with_partition = self._get_key_with_partition( key, partition=partition @@ -591,6 +611,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: + self._log_and_maybe_set_tracked_key(key, "'del'") for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( key, partition=partition @@ -682,6 +703,7 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: + self._log_and_maybe_set_tracked_key(key, "'contains'") partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( @@ -704,6 +726,14 @@ def _contains(self, key: bytes) -> bool: found = ( self._bigtable_get_range(keys_to_search)[1] is not None ) + elif found is False: + self._log_and_maybe_set_tracked_key( + key, "'not found in cache'" + ) + else: + self._log_and_maybe_set_tracked_key( + key, "'found in cache'" + ) return found except Exception as ex: self.log.error( From ee377c96db61945da9ca0e3f6e08ceddd11f90db Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 10:49:44 +0100 Subject: [PATCH 208/616] fixed tracked key log --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 999fd549e..9d52fdc75 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -551,7 +551,7 @@ def _log_and_maybe_set_tracked_key(self, key, msg): if self._tracked_key is None: self._tacked_key = key self.log.info(f"Set tracked key to {key}: {msg}") - else: + elif key == self._tacked_key: self.log.info(f"Tracked {key=}: {msg}") def _get(self, key: bytes) -> Optional[bytes]: From 47144114af08ffdbd6ed86ad73cc808089ae65c9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 11:15:20 +0100 Subject: [PATCH 209/616] fixed typos --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9d52fdc75..bb00d24cd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -549,9 +549,9 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _log_and_maybe_set_tracked_key(self, key, msg): if self._tracked_key is None: - self._tacked_key = key + self._tracked_key = key self.log.info(f"Set tracked key to {key}: {msg}") - elif key == self._tacked_key: + elif key == self._tracked_key: self.log.info(f"Tracked {key=}: {msg}") def _get(self, key: bytes) -> Optional[bytes]: From 606b8a68baf129e819043e2b86249da252e664e1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 11:55:08 +0100 Subject: [PATCH 210/616] fixed some stuff in contains and mutation buffer --- faust/stores/bigtable.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bb00d24cd..c93a81767 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -124,7 +124,7 @@ def _maybe_ttl_clear(self): self.data = {} self.ttl = None self.log.info( - "BigtableStore: Cleard startupcache because TTL is over" + "BigTableStore: Cleard startupcache because TTL is over" ) def keys(self): @@ -229,6 +229,12 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ + if self._mutation_buffer is not None: + row, value = self._mutation_buffer.rows.get(bt_key, (None, None)) + if row is not None and value is not None: + return True + elif row is not None and value is None: + return False if self._key_cache is not None: return self._key_cache.exists(bt_key) if ( @@ -236,10 +242,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: and not self._value_cache.ttl_over ): return bt_key in self._value_cache.keys() - if self._mutation_buffer is not None: - value = self._mutation_buffer.rows.get(bt_key, (None, None))[1] - if value is not None: - return True if self._value_cache is not None: if ( isinstance(self._value_cache, BigtableStartupCache) @@ -255,6 +257,13 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: partitions = {k[0] for k in key_set} self._fill_caches(partitions) + if self._mutation_buffer is not None: + for k in key_set: + row, value = self._mutation_buffer.rows.get(k, (None, None)) + if row is not None and value is not None: + return True + elif row is not None and value is None: + return False if self._key_cache is not None: return not self._key_cache._keys.isdisjoint(key_set) if ( @@ -262,20 +271,6 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: and not self._value_cache.ttl_over ): return not self._value_cache.keys().isdisjoint(key_set) - - definately_not_found = [] - for key in key_set: - found = self.contains(key) - if found is True: - return True - elif found is None: - definately_not_found.append(False) - else: - definately_not_found.append(True) - - if all(definately_not_found): - # Now we can be sure that the key does not exist - return False # No assumption possible return None From 2b553af1e651295000028884e5e7796534951512 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 12:58:05 +0100 Subject: [PATCH 211/616] static tracked key --- faust/stores/bigtable.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c93a81767..e94754760 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -384,7 +384,7 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) - self._tracked_key = None + self._tracked_key = b'635b961bac0a961c562f61bf' try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) @@ -543,10 +543,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: return self._active_partitions() def _log_and_maybe_set_tracked_key(self, key, msg): - if self._tracked_key is None: - self._tracked_key = key - self.log.info(f"Set tracked key to {key}: {msg}") - elif key == self._tracked_key: + if self._tracked_key in key: self.log.info(f"Tracked {key=}: {msg}") def _get(self, key: bytes) -> Optional[bytes]: From c954361e7dda2c3d93c911dbe916b6dd68d63f81 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 12:59:38 +0100 Subject: [PATCH 212/616] better logging --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e94754760..5156587e2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -544,7 +544,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _log_and_maybe_set_tracked_key(self, key, msg): if self._tracked_key in key: - self.log.info(f"Tracked {key=}: {msg}") + self.log.info(f"Tracked {self._tracked_key=}: {msg}") def _get(self, key: bytes) -> Optional[bytes]: try: From 282f8d1d87df1ad1d2d1726fd0f1cff72e5768d2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 13:39:24 +0100 Subject: [PATCH 213/616] fixed contains method --- faust/stores/bigtable.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5156587e2..45641f099 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -229,12 +229,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - if self._mutation_buffer is not None: - row, value = self._mutation_buffer.rows.get(bt_key, (None, None)) - if row is not None and value is not None: - return True - elif row is not None and value is None: - return False if self._key_cache is not None: return self._key_cache.exists(bt_key) if ( @@ -242,15 +236,13 @@ def contains(self, bt_key: bytes) -> Optional[bool]: and not self._value_cache.ttl_over ): return bt_key in self._value_cache.keys() - if self._value_cache is not None: - if ( - isinstance(self._value_cache, BigtableStartupCache) - and self._value_cache.ttl_over is False - ): - return bt_key in self._value_cache.keys() - else: - if bt_key in self._value_cache.keys(): + if self._mutation_buffer is not None: + if bt_key in self._mutation_buffer.rows.keys(): + row, value = self._mutation_buffer.rows[bt_key] + if row is not None and value is not None: return True + elif row is not None and value is None: + return False return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: From 725f26d551fcd5e1613c2e392bdb89639a74cbc2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 4 Nov 2022 13:51:31 +0100 Subject: [PATCH 214/616] faster mutation buffer get --- faust/stores/bigtable.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 45641f099..33febe250 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -248,14 +248,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: partitions = {k[0] for k in key_set} self._fill_caches(partitions) - - if self._mutation_buffer is not None: - for k in key_set: - row, value = self._mutation_buffer.rows.get(k, (None, None)) - if row is not None and value is not None: - return True - elif row is not None and value is None: - return False if self._key_cache is not None: return not self._key_cache._keys.isdisjoint(key_set) if ( @@ -263,6 +255,16 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: and not self._value_cache.ttl_over ): return not self._value_cache.keys().isdisjoint(key_set) + + if self._mutation_buffer is not None: + keys_in_buffer = key_set.intersection(self._mutation_buffer.rows.keys()) + if len(keys_in_buffer) == 1: + k = keys_in_buffer.pop() + row, value = self._mutation_buffer.rows[k] + if row is not None and value is not None: + return True + elif row is not None and value is None: + return False # No assumption possible return None From cb8f877627692413a97dcf356bb00082429a6e9b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 7 Nov 2022 10:50:50 +0100 Subject: [PATCH 215/616] removed logs --- faust/stores/bigtable.py | 82 ++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 50 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 33febe250..ff4d094df 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -257,7 +257,9 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: return not self._value_cache.keys().isdisjoint(key_set) if self._mutation_buffer is not None: - keys_in_buffer = key_set.intersection(self._mutation_buffer.rows.keys()) + keys_in_buffer = key_set.intersection( + self._mutation_buffer.rows.keys() + ) if len(keys_in_buffer) == 1: k = keys_in_buffer.pop() row, value = self._mutation_buffer.rows[k] @@ -268,7 +270,10 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: # No assumption possible return None - def get_key_iterable_if_exists(self) -> Optional[Iterable]: + def get_key_iterable_if_exists( + self, partitions: Set[int] + ) -> Optional[Iterable]: + self._fill_caches(partitions=partitions) if ( self._key_cache is not None and len(self._registered_partitions) > 0 @@ -378,7 +383,7 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) - self._tracked_key = b'635b961bac0a961c562f61bf' + self._tracked_key = b"635b961bac0a961c562f61bf" try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) @@ -442,9 +447,6 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: elif row is not None and value is None: return value else: - self._log_and_maybe_set_tracked_key( - key[1:], "'not found in cache get'" - ) res = self.bt_table.read_row(key, filter_=self.row_filter) row = self.bt_table.direct_row(key) if res is None: @@ -469,18 +471,12 @@ def _bigtable_get_range( for key in keys: rows.add_row_key(key) - self._log_and_maybe_set_tracked_key( - keys.pop()[1:], "'not found in cache get (no partition)'" - ) for row in self.bt_table.read_rows( row_set=rows, filter_=CellsColumnLimitFilter(1) ): # First hit will return return row.row_key, BigTableStore.bigtable_exrtact_row_data(row) # Not found - self._log_and_maybe_set_tracked_key( - keys.pop()[1:], "'not found in cache get (no partition)'" - ) return None, None def _bigtable_set( @@ -508,7 +504,6 @@ def _bigtable_set( def _bigtable_del(self, key: bytes): row = self.bt_table.direct_row(key) - self._log_and_maybe_set_tracked_key(key[1:], "'deleted'") if self._cache.get_mutation_buffer() is None: row.delete() row.commit() @@ -536,13 +531,8 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: except KeyError: return self._active_partitions() - def _log_and_maybe_set_tracked_key(self, key, msg): - if self._tracked_key in key: - self.log.info(f"Tracked {self._tracked_key=}: {msg}") - def _get(self, key: bytes) -> Optional[bytes]: try: - self._log_and_maybe_set_tracked_key(key, "'get'") partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( @@ -580,7 +570,6 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - self._log_and_maybe_set_tracked_key(key, "'set'") partition = get_current_partition() key_with_partition = self._get_key_with_partition( key, partition=partition @@ -597,7 +586,6 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: - self._log_and_maybe_set_tracked_key(key, "'del'") for partition in self._partitions_for_key(key): key_with_partition = self._get_key_with_partition( key, partition=partition @@ -612,7 +600,9 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - cache_iterator = self._cache.get_key_iterable_if_exists() + cache_iterator = self._cache.get_key_iterable_if_exists( + set(self._active_partitions()) + ) if cache_iterator is not None: for key in cache_iterator: yield key[1:] @@ -650,31 +640,32 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: + row_set = RowSet() for partition in self._active_partitions(): partition_prefix = partition.to_bytes(1, "little") start_key = b"".join([partition_prefix, self.bt_start_key]) end_key = b"".join([partition_prefix, self.bt_end_key]) - - for row in self.bt_table.read_rows( - start_key=start_key, - end_key=end_key, - ): - mutation_buffer = self._cache.get_mutation_buffer() - if mutation_buffer is not None: - # We want to yield the mutation if any is buffered - mut_row, value = mutation_buffer.rows.get( - row.row_key, (None, None) - ) - if value is not None: - yield (row.row_key[1:], value) - continue - elif mut_row is not None: - # This means that row will be deleted - continue - yield ( - row.row_key[1:], - self.bigtable_exrtact_row_data(row), + row_set.add_row_range_from_keys(start_key, end_key) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + mutation_buffer = self._cache.get_mutation_buffer() + if mutation_buffer is not None: + # We want to yield the mutation if any is buffered + mut_row, value = mutation_buffer.rows.get( + row.row_key, (None, None) ) + if value is not None: + yield (row.row_key[1:], value) + continue + elif mut_row is not None: + # This means that row will be deleted + continue + yield ( + row.row_key[1:], + self.bigtable_exrtact_row_data(row), + ) except Exception as ex: self.log.error( f"FaustBigtableException Error " @@ -689,7 +680,6 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - self._log_and_maybe_set_tracked_key(key, "'contains'") partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( @@ -712,14 +702,6 @@ def _contains(self, key: bytes) -> bool: found = ( self._bigtable_get_range(keys_to_search)[1] is not None ) - elif found is False: - self._log_and_maybe_set_tracked_key( - key, "'not found in cache'" - ) - else: - self._log_and_maybe_set_tracked_key( - key, "'found in cache'" - ) return found except Exception as ex: self.log.error( From 0fb464c952cef5c3b5ce6d70ee7f2d30978261f4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 7 Nov 2022 10:51:05 +0100 Subject: [PATCH 216/616] removed tracked key --- faust/stores/bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ff4d094df..58c245bf3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -383,7 +383,6 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) - self._tracked_key = b"635b961bac0a961c562f61bf" try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) From e2cc2b30deccc47037f9dede4514dc657ca140b6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 7 Nov 2022 11:33:46 +0100 Subject: [PATCH 217/616] removed log when flushing mutation buffer --- faust/stores/bigtable.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 58c245bf3..8e50d68dd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -755,11 +755,6 @@ def set_persisted_offset( mutation_buffer = self._cache.get_mutation_buffer() if mutation_buffer is not None and not recovery: if mutation_buffer.check_flush(): - num_mutations = len(mutation_buffer.rows) - self.log.info( - f"Will flush BigtableMutationBuffer with {num_mutations} " - f"mutations for table {self.table_name}..." - ) mutation_buffer.flush() offset_key = self.get_offset_key(tp).encode() self._bigtable_set( From 021a695ec2ea0e4a798bb4d332bbbd85e0dac803 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 7 Nov 2022 12:50:32 +0100 Subject: [PATCH 218/616] only do iterkeys from real table --- faust/stores/bigtable.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8e50d68dd..576a18fb0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -599,15 +599,21 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - cache_iterator = self._cache.get_key_iterable_if_exists( - set(self._active_partitions()) + #cache_iterator = self._cache.get_key_iterable_if_exists( + #set(self._active_partitions()) + #) + #if cache_iterator is not None: + #for key in cache_iterator: + #yield key[1:] + #else: + start = time.time() + for row in self._iteritems(): + yield row[0] + + end = time.time() + self.log.info( + f"Called iterkeys for {self.bt_table_name} took {end - start}" ) - if cache_iterator is not None: - for key in cache_iterator: - yield key[1:] - else: - for row in self._iteritems(): - yield row[0] except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " From 718fffb6a8c19f3b894dac22becaf09105241e93 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 7 Nov 2022 13:53:20 +0100 Subject: [PATCH 219/616] try to fix ityerkeys call --- faust/stores/bigtable.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 576a18fb0..f15c6dd54 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -270,18 +270,6 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: # No assumption possible return None - def get_key_iterable_if_exists( - self, partitions: Set[int] - ) -> Optional[Iterable]: - self._fill_caches(partitions=partitions) - if ( - self._key_cache is not None - and len(self._registered_partitions) > 0 - ): - return self._key_cache._keys - else: - return None - @_register_partition def set( self, bt_key: bytes, row: DirectRow, value: Optional[bytes] @@ -599,16 +587,15 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - #cache_iterator = self._cache.get_key_iterable_if_exists( - #set(self._active_partitions()) - #) - #if cache_iterator is not None: - #for key in cache_iterator: - #yield key[1:] - #else: start = time.time() - for row in self._iteritems(): - yield row[0] + # First check if all keys are cached! + if self._cache._key_cache is not None: + self._cache._fill_caches(set(self._active_partitions())) + for key in self._cache._key_cache._keys: + yield key[1:] + else: + for row in self._iteritems(): + yield row[0] end = time.time() self.log.info( From b8b81d83edc74fa017f5305cd67c38c8a91c48fd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 7 Nov 2022 14:33:23 +0100 Subject: [PATCH 220/616] always yield keys from table directly --- faust/stores/bigtable.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f15c6dd54..c432e00fc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -587,20 +587,25 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - start = time.time() - # First check if all keys are cached! - if self._cache._key_cache is not None: - self._cache._fill_caches(set(self._active_partitions())) - for key in self._cache._key_cache._keys: - yield key[1:] - else: - for row in self._iteritems(): - yield row[0] + row_set = RowSet() + for partition in self._active_partitions(): + row_set.add_row_range_with_prefix(chr(partition)) - end = time.time() - self.log.info( - f"Called iterkeys for {self.bt_table_name} took {end - start}" - ) + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + if self._cache._mutation_buffer is not None: + # We want to yield the mutation if any is buffered + mut_row, value = self._cache._mutation_buffer.rows.get( + row.row_key, (None, None) + ) + if mut_row is not None and value is not None: + yield mut_row.row_key[1:] + continue + elif mut_row is not None and value is None: + # This means that row will be deleted on flush + continue + yield row.row_key[1:] except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " From dde59bcb8aee4b897ae7b870a10a15de80636ede Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 7 Nov 2022 15:29:15 +0100 Subject: [PATCH 221/616] set ttl over, and take value cache for iterkeys --- faust/stores/bigtable.py | 78 ++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c432e00fc..83142f601 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -126,6 +126,7 @@ def _maybe_ttl_clear(self): self.log.info( "BigTableStore: Cleard startupcache because TTL is over" ) + self.ttl_over = True def keys(self): return self.data.keys() @@ -587,25 +588,8 @@ def _del(self, key: bytes) -> None: def _iterkeys(self) -> Iterator[bytes]: try: - row_set = RowSet() - for partition in self._active_partitions(): - row_set.add_row_range_with_prefix(chr(partition)) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - if self._cache._mutation_buffer is not None: - # We want to yield the mutation if any is buffered - mut_row, value = self._cache._mutation_buffer.rows.get( - row.row_key, (None, None) - ) - if mut_row is not None and value is not None: - yield mut_row.row_key[1:] - continue - elif mut_row is not None and value is None: - # This means that row will be deleted on flush - continue - yield row.row_key[1:] + for row in self._iteritems(): + yield row[0] except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -637,32 +621,40 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - row_set = RowSet() - for partition in self._active_partitions(): - partition_prefix = partition.to_bytes(1, "little") - start_key = b"".join([partition_prefix, self.bt_start_key]) - end_key = b"".join([partition_prefix, self.bt_end_key]) - row_set.add_row_range_from_keys(start_key, end_key) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter + if ( + isinstance(self._cache._value_cache, BigtableStartupCache) + and self._cache._value_cache.ttl_over is False ): - mutation_buffer = self._cache.get_mutation_buffer() - if mutation_buffer is not None: - # We want to yield the mutation if any is buffered - mut_row, value = mutation_buffer.rows.get( - row.row_key, (None, None) + self._cache._fill_caches(set(self._active_partitions())) + for k, v in self._cache._value_cache.data.items(): + yield k[1:], v + else: + row_set = RowSet() + for partition in self._active_partitions(): + partition_prefix = partition.to_bytes(1, "little") + start_key = b"".join([partition_prefix, self.bt_start_key]) + end_key = b"".join([partition_prefix, self.bt_end_key]) + row_set.add_row_range_from_keys(start_key, end_key) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + mutation_buffer = self._cache.get_mutation_buffer() + if mutation_buffer is not None: + # We want to yield the mutation if any is buffered + mut_row, value = mutation_buffer.rows.get( + row.row_key, (None, None) + ) + if value is not None: + yield (row.row_key[1:], value) + continue + elif mut_row is not None: + # This means that row will be deleted + continue + yield ( + row.row_key[1:], + self.bigtable_exrtact_row_data(row), ) - if value is not None: - yield (row.row_key[1:], value) - continue - elif mut_row is not None: - # This means that row will be deleted - continue - yield ( - row.row_key[1:], - self.bigtable_exrtact_row_data(row), - ) except Exception as ex: self.log.error( f"FaustBigtableException Error " From 96bc184f149d08f34d74cc1cb3a6f7ce297715b6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 7 Nov 2022 15:43:44 +0100 Subject: [PATCH 222/616] added log yo iterkeys and rearanged some functions --- faust/stores/bigtable.py | 53 +++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 83142f601..c490f827f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -586,29 +586,6 @@ def _del(self, key: bytes) -> None: ) raise ex - def _iterkeys(self) -> Iterator[bytes]: - try: - for row in self._iteritems(): - yield row[0] - except Exception as ex: - self.log.error( - f"FaustBigtableException Error in _iterkeys " - f"for table {self.table_name} exception {ex}" - ) - raise ex - - def _itervalues(self) -> Iterator[bytes]: - try: - for row in self._iteritems(): - yield row[1] - except Exception as ex: - self.log.error( - f"FaustBigtableException Error " - f"in _itervalues for table {self.table_name}" - f" exception {ex}" - ) - raise ex - def _active_partitions(self) -> Iterator[int]: actives = self.app.assignor.assigned_actives() topic = self.table.changelog_topic_name @@ -622,7 +599,7 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: if ( - isinstance(self._cache._value_cache, BigtableStartupCache) + isinstance(self._cache._value_cache, BigtableStartupCache) and self._cache._value_cache.ttl_over is False ): self._cache._fill_caches(set(self._active_partitions())) @@ -637,7 +614,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: row_set.add_row_range_from_keys(start_key, end_key) for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter + row_set=row_set, filter_=self.row_filter ): mutation_buffer = self._cache.get_mutation_buffer() if mutation_buffer is not None: @@ -663,6 +640,32 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: ) raise ex + def _iterkeys(self) -> Iterator[bytes]: + try: + start = time.time() + for row in self._iteritems(): + yield row[0] + end = time.time() + self.log.info(f"Finished iterkeys for {self.table_name} in {end - start}s") + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in _iterkeys " + f"for table {self.table_name} exception {ex}" + ) + raise ex + + def _itervalues(self) -> Iterator[bytes]: + try: + for row in self._iteritems(): + yield row[1] + except Exception as ex: + self.log.error( + f"FaustBigtableException Error " + f"in _itervalues for table {self.table_name}" + f" exception {ex}" + ) + raise ex + def _size(self) -> int: """Always returns 0 for Bigtable.""" return 0 From 3ec3374d9c3fa23c87d84f5300936feae5a0e389 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 8 Nov 2022 13:05:28 +0100 Subject: [PATCH 223/616] removed value_cache from iteritems --- faust/stores/bigtable.py | 55 +++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c490f827f..92076e244 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -598,40 +598,31 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - if ( - isinstance(self._cache._value_cache, BigtableStartupCache) - and self._cache._value_cache.ttl_over is False + row_set = RowSet() + for partition in self._active_partitions(): + partition_prefix = partition.to_bytes(1, "little") + start_key = b"".join([partition_prefix, self.bt_start_key]) + end_key = b"".join([partition_prefix, self.bt_end_key]) + row_set.add_row_range_from_keys(start_key, end_key) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter ): - self._cache._fill_caches(set(self._active_partitions())) - for k, v in self._cache._value_cache.data.items(): - yield k[1:], v - else: - row_set = RowSet() - for partition in self._active_partitions(): - partition_prefix = partition.to_bytes(1, "little") - start_key = b"".join([partition_prefix, self.bt_start_key]) - end_key = b"".join([partition_prefix, self.bt_end_key]) - row_set.add_row_range_from_keys(start_key, end_key) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - mutation_buffer = self._cache.get_mutation_buffer() - if mutation_buffer is not None: - # We want to yield the mutation if any is buffered - mut_row, value = mutation_buffer.rows.get( - row.row_key, (None, None) - ) - if value is not None: - yield (row.row_key[1:], value) - continue - elif mut_row is not None: - # This means that row will be deleted - continue - yield ( - row.row_key[1:], - self.bigtable_exrtact_row_data(row), + if self._cache._mutation_buffer is not None: + # We want to yield the mutation if any is buffered + mut_row, value = self._cache._mutation_buffer.rows.get( + row.row_key, (None, None) ) + if value is not None: + yield (row.row_key[1:], value) + continue + elif mut_row is not None: + # This means that row will be deleted + continue + yield ( + row.row_key[1:], + self.bigtable_exrtact_row_data(row), + ) except Exception as ex: self.log.error( f"FaustBigtableException Error " From 9f61a993b743d1faf6f6a4d09c442d8b18fbe516 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 8 Nov 2022 14:29:24 +0100 Subject: [PATCH 224/616] fast return of contains if store_check_exists is false --- faust/stores/bigtable.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 92076e244..9c7b67500 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -634,8 +634,11 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: def _iterkeys(self) -> Iterator[bytes]: try: start = time.time() - for row in self._iteritems(): - yield row[0] + if self._cache._key_cache is not None: + yield from self._cache._key_cache._keys + else: + for row in self._iteritems(): + yield row[0] end = time.time() self.log.info(f"Finished iterkeys for {self.table_name} in {end - start}s") except Exception as ex: @@ -663,6 +666,8 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: + if not self.app.conf.store_check_exists: + return True partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( From 7776c8dfee2c97d24203a45f396475d97ba742d2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 8 Nov 2022 15:13:40 +0100 Subject: [PATCH 225/616] added missing log --- faust/tables/table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/tables/table.py b/faust/tables/table.py index a38d32859..548712709 100644 --- a/faust/tables/table.py +++ b/faust/tables/table.py @@ -52,6 +52,7 @@ def tumbling( ) def __missing__(self, key: KT) -> VT: + self.log.info(f"BigTableStore: __missing__ called for {key=}") if self.default is not None: return self.default() raise KeyError(key) From 9efc854c51427595a61bd6bf5e134c6b59b4ca9d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 8 Nov 2022 15:22:39 +0100 Subject: [PATCH 226/616] return key error on not found --- faust/stores/bigtable.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9c7b67500..24f336181 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -433,16 +433,14 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: if value is not None: return value elif row is not None and value is None: - return value + raise KeyError(f"{key=} not found in {self.table_name}") else: res = self.bt_table.read_row(key, filter_=self.row_filter) row = self.bt_table.direct_row(key) if res is None: - self.log.info(f"{key=} not found in {self.table_name}") - value = None + raise KeyError(f"{key=} not found in {self.table_name}") else: - value = self.bigtable_exrtact_row_data(res) - return value + return self.bigtable_exrtact_row_data(res) def _bigtable_get_range( self, keys: Set[bytes] @@ -465,7 +463,7 @@ def _bigtable_get_range( # First hit will return return row.row_key, BigTableStore.bigtable_exrtact_row_data(row) # Not found - return None, None + raise KeyError def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False @@ -675,7 +673,10 @@ def _contains(self, key: bytes) -> bool: ) found = self._cache.contains(key_with_partition) if found is None: - found = self._bigtable_get(key_with_partition) is not None + try: + found = self._bigtable_get(key_with_partition) is not None + except KeyError: + found = False return found else: keys_to_search = set() @@ -687,9 +688,12 @@ def _contains(self, key: bytes) -> bool: found = self._cache.contains_any(keys_to_search) if found is None: - found = ( - self._bigtable_get_range(keys_to_search)[1] is not None - ) + try: + found = ( + self._bigtable_get_range(keys_to_search)[1] is not None + ) + except KeyError: + found = False return found except Exception as ex: self.log.error( From e4b2766f4750ed9d2648878420b200ff308b91cb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 8 Nov 2022 15:46:19 +0100 Subject: [PATCH 227/616] Revert "return key error on not found" This reverts commit 9efc854c51427595a61bd6bf5e134c6b59b4ca9d. --- faust/stores/bigtable.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 24f336181..9c7b67500 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -433,14 +433,16 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: if value is not None: return value elif row is not None and value is None: - raise KeyError(f"{key=} not found in {self.table_name}") + return value else: res = self.bt_table.read_row(key, filter_=self.row_filter) row = self.bt_table.direct_row(key) if res is None: - raise KeyError(f"{key=} not found in {self.table_name}") + self.log.info(f"{key=} not found in {self.table_name}") + value = None else: - return self.bigtable_exrtact_row_data(res) + value = self.bigtable_exrtact_row_data(res) + return value def _bigtable_get_range( self, keys: Set[bytes] @@ -463,7 +465,7 @@ def _bigtable_get_range( # First hit will return return row.row_key, BigTableStore.bigtable_exrtact_row_data(row) # Not found - raise KeyError + return None, None def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False @@ -673,10 +675,7 @@ def _contains(self, key: bytes) -> bool: ) found = self._cache.contains(key_with_partition) if found is None: - try: - found = self._bigtable_get(key_with_partition) is not None - except KeyError: - found = False + found = self._bigtable_get(key_with_partition) is not None return found else: keys_to_search = set() @@ -688,12 +687,9 @@ def _contains(self, key: bytes) -> bool: found = self._cache.contains_any(keys_to_search) if found is None: - try: - found = ( - self._bigtable_get_range(keys_to_search)[1] is not None - ) - except KeyError: - found = False + found = ( + self._bigtable_get_range(keys_to_search)[1] is not None + ) return found except Exception as ex: self.log.error( From 0c19b9e77ec934c2d8832b0fab2539a7e8c81a47 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 8 Nov 2022 16:06:05 +0100 Subject: [PATCH 228/616] fixed iterkeys --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9c7b67500..75a864c73 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -635,7 +635,8 @@ def _iterkeys(self) -> Iterator[bytes]: try: start = time.time() if self._cache._key_cache is not None: - yield from self._cache._key_cache._keys + for k in self._cache._key_cache._keys: + yield k[1:] else: for row in self._iteritems(): yield row[0] From 88c5e6dcd86afaed40a5fcb0de1273ecb1cf7107 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 8 Nov 2022 16:41:33 +0100 Subject: [PATCH 229/616] iterkeys from real table --- faust/stores/bigtable.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 75a864c73..8770aac6d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -634,12 +634,28 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: def _iterkeys(self) -> Iterator[bytes]: try: start = time.time() - if self._cache._key_cache is not None: - for k in self._cache._key_cache._keys: - yield k[1:] - else: - for row in self._iteritems(): - yield row[0] + row_set = RowSet() + for partition in self._active_partitions(): + partition_prefix = partition.to_bytes(1, "little") + start_key = b"".join([partition_prefix, self.bt_start_key]) + end_key = b"".join([partition_prefix, self.bt_end_key]) + row_set.add_row_range_from_keys(start_key, end_key) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + if self._cache._mutation_buffer is not None: + # We want to yield the mutation if any is buffered + mut_row, value = self._cache._mutation_buffer.rows.get( + row.row_key, (None, None) + ) + if value is not None: + yield row.row_key[1:] + continue + elif mut_row is not None: + # This means that row will be deleted + continue + yield row.row_key[1:] end = time.time() self.log.info(f"Finished iterkeys for {self.table_name} in {end - start}s") except Exception as ex: From 7dc7ee7ee0d8c9ab2131b5fc845a00a8347e46a1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 11 Nov 2022 11:07:32 +0100 Subject: [PATCH 230/616] try a hack for tables with default values --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8770aac6d..3b8a837e2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -685,6 +685,8 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True + if self.table.default is not None: + return True partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( From f31774b6d4510646b0480c105e14656f45c6ae23 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 11 Nov 2022 11:10:47 +0100 Subject: [PATCH 231/616] removed border keys --- faust/stores/bigtable.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3b8a837e2..b3fd5265a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -355,7 +355,6 @@ class BigTableStore(base.SerializedStore): BT_MUTATION_BUFFER_LIMIT_KEY = "bt_mutation_buffer_limit_key" BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_PROJECT_KEY = "bt_project_key" - BT_READ_ROWS_BORDERS_KEY = "bt_read_rows_borders_key" BT_ROW_FILTERS_KEY = "bt_row_filter_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" KEY_CACHE_ENABLE_KEY = "key_cache_enable_key" @@ -384,9 +383,6 @@ def _set_options(self, options) -> None: self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) - self.bt_start_key, self.bt_end_key = options.get( - BigTableStore.BT_READ_ROWS_BORDERS_KEY, [b"", b""] - ) self.column_name = options.get( BigTableStore.BT_COLUMN_NAME_KEY, "DATA" ) @@ -600,10 +596,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: row_set = RowSet() for partition in self._active_partitions(): - partition_prefix = partition.to_bytes(1, "little") - start_key = b"".join([partition_prefix, self.bt_start_key]) - end_key = b"".join([partition_prefix, self.bt_end_key]) - row_set.add_row_range_from_keys(start_key, end_key) + row_set.add_row_range_from_keys(partition, partition+1) for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter @@ -636,10 +629,7 @@ def _iterkeys(self) -> Iterator[bytes]: start = time.time() row_set = RowSet() for partition in self._active_partitions(): - partition_prefix = partition.to_bytes(1, "little") - start_key = b"".join([partition_prefix, self.bt_start_key]) - end_key = b"".join([partition_prefix, self.bt_end_key]) - row_set.add_row_range_from_keys(start_key, end_key) + row_set.add_row_range_from_keys(partition, partition+1) for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter From 1c70559e70ced5693332d4813f5e20c01e16d7e1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 11 Nov 2022 11:39:15 +0100 Subject: [PATCH 232/616] removed MutationBuffer --- faust/stores/bigtable.py | 191 +++++---------------------------------- 1 file changed, 21 insertions(+), 170 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b3fd5265a..7e611cd65 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -36,56 +36,6 @@ def get_current_partition(): return event.message.partition -class BigtableMutationBuffer: - rows: Dict[bytes, Tuple[DirectRow, Optional[bytes]]] - mutation_limit: int - - def __init__( - self, bigtable_table: Table, mutation_freq: int, mutation_limit: int - ) -> None: - - self.mutation_freq: int = mutation_freq - self.last_flush = int(time.time()) # set to now - self.mutation_limit: int = mutation_limit - self.bigtable_table: Table = bigtable_table - self.log = logging.getLogger(self.__class__.__name__) - self.rows = {} - - def flush(self) -> None: - mutated_rows = [] - rows_to_flush = self.rows.copy().values() - for row, val in rows_to_flush: - if val is None: - row.delete() - else: - row.set_cell( - "FaustColumnFamily", - "DATA", - val, - ) - mutated_rows.append(row) - response = self.bigtable_table.mutate_rows(mutated_rows) - for (status, row) in zip(response, rows_to_flush): - if status.code != 0: - self.log.error( - "BigTableStore: BigtableMutationBuffer, " - f"Row {row[0].row_key} failed to write" - ) - else: - # Remove only rows that were successfully written - self.rows.pop(row[0].row_key, None) - - self.last_flush = int(time.time()) # set to now - - def check_flush(self) -> bool: - limit_reached = len(self.rows) >= self.mutation_limit - time_exceeded = self.last_flush + self.mutation_freq < int(time.time()) - return limit_reached or time_exceeded - - def submit(self, row: DirectRow, value: Optional[bytes] = None): - self.rows[row.row_key] = row, value - - class BigtableStartupCache: """ This is a dictionary which is only filled once, after that, every @@ -159,7 +109,6 @@ class BigTableCacheManager: _value_cache: Optional[ Union[LRUCache[bytes, Union[bytes, None]], BigtableStartupCache] ] - _mutation_buffer: Optional[BigtableMutationBuffer] _key_cache: Optional[BigTableKeyCache] def __init__(self, app, options: Dict, bt_table: Table) -> None: @@ -169,7 +118,6 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_key_cache(options) - self._init_mutation_buffer(options, bt_table) def _fill_caches(self, partitions: Set[int]): @@ -203,17 +151,12 @@ def _fill_caches(self, partitions: Set[int]): @_register_partition def get( self, bt_key: bytes - ) -> Tuple[Optional[DirectRow], Optional[bytes]]: - row = None + ) -> Optional[bytes]: value = None - if self._mutation_buffer is not None: - row, value = self._mutation_buffer.rows.get(bt_key, (None, None)) - if row is not None: - return row, value if self._value_cache is not None: if bt_key in self._value_cache.keys(): value = self._value_cache[bt_key] - return row, value + return value def get_partition(self, user_key: bytes) -> int: return self._partition_cache[user_key] @@ -221,9 +164,6 @@ def get_partition(self, user_key: bytes) -> int: def set_partition(self, user_key: bytes, partition: int): self._partition_cache[user_key] = partition - def get_mutation_buffer(self) -> Optional[BigtableMutationBuffer]: - return self._mutation_buffer - @_register_partition def contains(self, bt_key: bytes) -> Optional[bool]: """ @@ -237,13 +177,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: and not self._value_cache.ttl_over ): return bt_key in self._value_cache.keys() - if self._mutation_buffer is not None: - if bt_key in self._mutation_buffer.rows.keys(): - row, value = self._mutation_buffer.rows[bt_key] - if row is not None and value is not None: - return True - elif row is not None and value is None: - return False return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: @@ -257,36 +190,21 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: ): return not self._value_cache.keys().isdisjoint(key_set) - if self._mutation_buffer is not None: - keys_in_buffer = key_set.intersection( - self._mutation_buffer.rows.keys() - ) - if len(keys_in_buffer) == 1: - k = keys_in_buffer.pop() - row, value = self._mutation_buffer.rows[k] - if row is not None and value is not None: - return True - elif row is not None and value is None: - return False # No assumption possible return None @_register_partition def set( - self, bt_key: bytes, row: DirectRow, value: Optional[bytes] + self, bt_key: bytes, value: Optional[bytes] ) -> None: if self._value_cache is not None: self._value_cache[bt_key] = value - if self._mutation_buffer is not None: - self._mutation_buffer.submit(row, value) if self._key_cache is not None: self._key_cache.add(bt_key) - def delete(self, bt_key: bytes, row: DirectRow) -> None: + def delete(self, bt_key: bytes) -> None: user_key = bt_key[1:] self._partition_cache.pop(user_key, None) - if self._mutation_buffer is not None: - self._mutation_buffer.submit(row, None) if self._value_cache is not None: del self._value_cache[bt_key] if self._key_cache is not None: @@ -322,23 +240,6 @@ def _init_key_cache(self, options: Dict): else: self._key_cache = None - def _init_mutation_buffer(self, options: Dict, bt_table: Table): - mutation_buffer_enabled = options.get( - BigTableStore.BT_ENABLE_MUTATION_BUFFER_KEY, False - ) - if mutation_buffer_enabled: - limit = options.get( - BigTableStore.BT_MUTATION_BUFFER_LIMIT_KEY, 100 - ) - freq = options.get( - BigTableStore.BT_MUTATION_BUFFER_FREQ_KEY, 30 * 60 - ) - self._mutation_buffer = BigtableMutationBuffer( - bt_table, freq, limit - ) - else: - self._mutation_buffer = None - class BigTableStore(base.SerializedStore): """Bigtable table storage.""" @@ -349,10 +250,7 @@ class BigTableStore(base.SerializedStore): _cache: BigTableCacheManager BT_COLUMN_NAME_KEY = "bt_column_name_key" - BT_ENABLE_MUTATION_BUFFER_KEY = "bt_enable_mutation_buffer_key" BT_INSTANCE_KEY = "bt_instance_key" - BT_MUTATION_BUFFER_FREQ_KEY = "bt_mutation_buffer_freq_key" - BT_MUTATION_BUFFER_LIMIT_KEY = "bt_mutation_buffer_limit_key" BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_PROJECT_KEY = "bt_project_key" BT_ROW_FILTERS_KEY = "bt_row_filter_key" @@ -425,14 +323,11 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes) -> Optional[bytes]: - row, value = self._cache.get(key) + value = self._cache.get(key) if value is not None: return value - elif row is not None and value is None: - return value else: res = self.bt_table.read_row(key, filter_=self.row_filter) - row = self.bt_table.direct_row(key) if res is None: self.log.info(f"{key=} not found in {self.table_name}") value = None @@ -445,11 +340,9 @@ def _bigtable_get_range( ) -> Tuple[Optional[bytes], Optional[bytes]]: # first search cache: for key in keys: - row, value = self._cache.get(key) + value = self._cache.get(key) if value is not None: return key, value - elif row is not None and value is None: - return key, value rows = RowSet() for key in keys: @@ -466,32 +359,21 @@ def _bigtable_get_range( def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False ): + row = self.bt_table.direct_row(key) + row.set_cell( + self.column_family_id, + self.column_name, + value, + ) + row.commit() if not persist_offset: - row = self._cache.get(key)[0] - if row is None: - row = self.bt_table.direct_row(key) - if self._cache.get_mutation_buffer() is None: - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) - self._cache.set(key, row, value) - else: - row = self.bt_table.direct_row(key) - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) - row.commit() + self._cache.set(key, value) def _bigtable_del(self, key: bytes): row = self.bt_table.direct_row(key) - if self._cache.get_mutation_buffer() is None: - row.delete() - row.commit() - self._cache.delete(key, row) + self._cache.delete(key) + row.delete() + row.commit() def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -601,17 +483,6 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): - if self._cache._mutation_buffer is not None: - # We want to yield the mutation if any is buffered - mut_row, value = self._cache._mutation_buffer.rows.get( - row.row_key, (None, None) - ) - if value is not None: - yield (row.row_key[1:], value) - continue - elif mut_row is not None: - # This means that row will be deleted - continue yield ( row.row_key[1:], self.bigtable_exrtact_row_data(row), @@ -634,17 +505,6 @@ def _iterkeys(self) -> Iterator[bytes]: for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): - if self._cache._mutation_buffer is not None: - # We want to yield the mutation if any is buffered - mut_row, value = self._cache._mutation_buffer.rows.get( - row.row_key, (None, None) - ) - if value is not None: - yield row.row_key[1:] - continue - elif mut_row is not None: - # This means that row will be deleted - continue yield row.row_key[1:] end = time.time() self.log.info(f"Finished iterkeys for {self.table_name} in {end - start}s") @@ -750,19 +610,10 @@ def set_persisted_offset( we were not an active replica. """ try: - mutation_buffer = self._cache.get_mutation_buffer() - if mutation_buffer is not None and not recovery: - if mutation_buffer.check_flush(): - mutation_buffer.flush() - offset_key = self.get_offset_key(tp).encode() - self._bigtable_set( - offset_key, str(offset).encode(), persist_offset=True - ) - else: - offset_key = self.get_offset_key(tp).encode() - self._bigtable_set( - offset_key, str(offset).encode(), persist_offset=True - ) + offset_key = self.get_offset_key(tp).encode() + self._bigtable_set( + offset_key, str(offset).encode(), persist_offset=True + ) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From a2f47c28fc30b262b938b84cefa995ebd7e56120 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 11 Nov 2022 13:07:22 +0100 Subject: [PATCH 233/616] fixed partition prefix --- faust/stores/bigtable.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7e611cd65..6392dd927 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -2,17 +2,7 @@ import logging import time import traceback -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - Optional, - Set, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -118,6 +108,7 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_key_cache(options) + self.partition_prefixes: Dict[int, bytes] def _fill_caches(self, partitions: Set[int]): @@ -203,8 +194,6 @@ def set( self._key_cache.add(bt_key) def delete(self, bt_key: bytes) -> None: - user_key = bt_key[1:] - self._partition_cache.pop(user_key, None) if self._value_cache is not None: del self._value_cache[bt_key] if self._key_cache is not None: @@ -248,6 +237,7 @@ class BigTableStore(base.SerializedStore): instance: Instance bt_table: Table _cache: BigTableCacheManager + partition_prefix = b"__" BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_INSTANCE_KEY = "bt_instance_key" @@ -386,9 +376,12 @@ def _maybe_get_partition_from_message(self) -> Optional[int]: else: return None + def _remove_partition_prefix(self, key: bytes) -> bytes: + slice_from = key.find(self.partition_prefix) + len(self.partition_prefix) + return key[slice_from:] + def _get_key_with_partition(self, key: bytes, partition): - partition_prefix = partition.to_bytes(1, "little") - key = b"".join([partition_prefix, key]) + key = b"".join([partition, self.partition_prefix , key]) return key def _partitions_for_key(self, key: bytes) -> Iterable[int]: @@ -417,10 +410,10 @@ def _get(self, key: bytes) -> Optional[bytes]: ) keys.add(key_with_partition) - key, value = self._bigtable_get_range(keys) + key_with_partition, value = self._bigtable_get_range(keys) if value is not None: partition = key[0] - self._cache.set_partition(key[1:], partition) + self._cache.set_partition(key, partition) return value return None except KeyError as ke: @@ -457,6 +450,7 @@ def _del(self, key: bytes) -> None: key, partition=partition ) self._bigtable_del(key_with_partition) + self._cache._partition_cache.pop(key, None) except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " @@ -478,13 +472,13 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: row_set = RowSet() for partition in self._active_partitions(): - row_set.add_row_range_from_keys(partition, partition+1) + row_set.add_row_range_from_keys(partition, partition + 1) for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): yield ( - row.row_key[1:], + self._remove_partition_prefix(row.row_key), self.bigtable_exrtact_row_data(row), ) except Exception as ex: @@ -500,12 +494,12 @@ def _iterkeys(self) -> Iterator[bytes]: start = time.time() row_set = RowSet() for partition in self._active_partitions(): - row_set.add_row_range_from_keys(partition, partition+1) + row_set.add_row_range_from_keys(partition, partition + 1) for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): - yield row.row_key[1:] + yield self._remove_partition_prefix(row.row_key) end = time.time() self.log.info(f"Finished iterkeys for {self.table_name} in {end - start}s") except Exception as ex: From b1268025945b04512846543fa36dc13c9a55aca0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 11 Nov 2022 13:11:51 +0100 Subject: [PATCH 234/616] fixed read rows with new partition prefix --- faust/stores/bigtable.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6392dd927..23cda8b61 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -376,6 +376,9 @@ def _maybe_get_partition_from_message(self) -> Optional[int]: else: return None + def _get_partition_prefix(self, partition: int) -> bytes: + return b"".join([partition, self.partition_prefix]) + def _remove_partition_prefix(self, key: bytes) -> bytes: slice_from = key.find(self.partition_prefix) + len(self.partition_prefix) return key[slice_from:] @@ -472,7 +475,8 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: row_set = RowSet() for partition in self._active_partitions(): - row_set.add_row_range_from_keys(partition, partition + 1) + prefix = self._get_partition_prefix(partition) + row_set.add_row_range_with_prefix(prefix) for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter @@ -494,7 +498,8 @@ def _iterkeys(self) -> Iterator[bytes]: start = time.time() row_set = RowSet() for partition in self._active_partitions(): - row_set.add_row_range_from_keys(partition, partition + 1) + prefix = self._get_partition_prefix(partition) + row_set.add_row_range_with_prefix(prefix) for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter From 6d4f671523b6946caaacb2c1f231ed25fc4d223d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 11 Nov 2022 15:09:05 +0100 Subject: [PATCH 235/616] fixed get partition call --- faust/stores/bigtable.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 23cda8b61..d05ceb749 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -377,14 +377,16 @@ def _maybe_get_partition_from_message(self) -> Optional[int]: return None def _get_partition_prefix(self, partition: int) -> bytes: - return b"".join([partition, self.partition_prefix]) + partition_bytes = partition.to_bytes(1, "little") + return b"".join([partition_bytes, self.partition_prefix]) def _remove_partition_prefix(self, key: bytes) -> bytes: slice_from = key.find(self.partition_prefix) + len(self.partition_prefix) return key[slice_from:] - def _get_key_with_partition(self, key: bytes, partition): - key = b"".join([partition, self.partition_prefix , key]) + def _get_key_with_partition(self, key: bytes, partition: int) -> bytes: + prefix = self._get_partition_prefix(partition) + key = b"".join([prefix, key]) return key def _partitions_for_key(self, key: bytes) -> Iterable[int]: From 0aa53f725cdca8923dbeefda80f239dff3fa99db Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 11 Nov 2022 16:07:39 +0100 Subject: [PATCH 236/616] fix rowsets --- faust/stores/bigtable.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d05ceb749..571664bdb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -477,8 +477,9 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: row_set = RowSet() for partition in self._active_partitions(): - prefix = self._get_partition_prefix(partition) - row_set.add_row_range_with_prefix(prefix) + prefix_start = self._get_partition_prefix(partition) + prefix_end = self._get_partition_prefix(partition + 1) + row_set.add_row_range_from_keys(prefix_start, prefix_end) for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter @@ -500,8 +501,9 @@ def _iterkeys(self) -> Iterator[bytes]: start = time.time() row_set = RowSet() for partition in self._active_partitions(): - prefix = self._get_partition_prefix(partition) - row_set.add_row_range_with_prefix(prefix) + prefix_start = self._get_partition_prefix(partition) + prefix_end = self._get_partition_prefix(partition + 1) + row_set.add_row_range_from_keys(prefix_start, prefix_end) for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter From b1253bec14da5932192442a99ff355eda63231d4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 11 Nov 2022 16:27:39 +0100 Subject: [PATCH 237/616] removed log in missing --- faust/tables/table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/tables/table.py b/faust/tables/table.py index 548712709..a38d32859 100644 --- a/faust/tables/table.py +++ b/faust/tables/table.py @@ -52,7 +52,6 @@ def tumbling( ) def __missing__(self, key: KT) -> VT: - self.log.info(f"BigTableStore: __missing__ called for {key=}") if self.default is not None: return self.default() raise KeyError(key) From 04f7a0d3c035819778db03cfb3bb3312507785d8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 07:51:27 +0100 Subject: [PATCH 238/616] removed direct rows --- faust/stores/bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 571664bdb..16ed7c67c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -7,7 +7,6 @@ from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance -from google.cloud.bigtable.row import DirectRow from google.cloud.bigtable.row_data import DEFAULT_RETRY_READ_ROWS from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet From 08a8d1634972a8ce3202238396b74b5cb7646c03 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 11:05:36 +0100 Subject: [PATCH 239/616] fixed wrong changelog read --- faust/stores/bigtable.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 16ed7c67c..ad60e9b17 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -656,9 +656,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - key: bytes = msg.key - partition_bytes = tp.partition.to_bytes(1, "little") - offset_key = b"".join([partition_bytes, key]) + offset_key = self._get_key_with_partition(msg.key, partition=tp.partition) row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() From 6ad9a7ef90e15c765b21ed22c39bbe9a7ba68b1f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 13:44:01 +0100 Subject: [PATCH 240/616] fixed get if keyerror --- faust/stores/bigtable.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ad60e9b17..ce14b6615 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -416,7 +416,7 @@ def _get(self, key: bytes) -> Optional[bytes]: key_with_partition, value = self._bigtable_get_range(keys) if value is not None: - partition = key[0] + partition = key_with_partition[0] self._cache.set_partition(key, partition) return value return None @@ -537,8 +537,6 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - if self.table.default is not None: - return True partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( From 6fac7e3d5574c45ba55c1f6cfa5f87309de871e0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 14:49:04 +0100 Subject: [PATCH 241/616] try to fix contains --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ce14b6615..f7a628a1c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -556,6 +556,8 @@ def _contains(self, key: bytes) -> bool: found = self._cache.contains_any(keys_to_search) if found is None: + if self.table.default is not None: + return True found = ( self._bigtable_get_range(keys_to_search)[1] is not None ) From 37cebf049022a5702f80e3281b1049ea25afacc4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 14:50:44 +0100 Subject: [PATCH 242/616] add default handling in contains --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f7a628a1c..b3552d155 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -544,6 +544,8 @@ def _contains(self, key: bytes) -> bool: ) found = self._cache.contains(key_with_partition) if found is None: + if self.table.default is not None: + return True found = self._bigtable_get(key_with_partition) is not None return found else: From 27160e139dea2ec43822102be225906a277d1a46 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 14:57:45 +0100 Subject: [PATCH 243/616] print in missing --- faust/tables/table.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/tables/table.py b/faust/tables/table.py index a38d32859..970ea1423 100644 --- a/faust/tables/table.py +++ b/faust/tables/table.py @@ -53,7 +53,9 @@ def tumbling( def __missing__(self, key: KT) -> VT: if self.default is not None: + self.log.info(f"BigTableStore: Will return default in __missing__ -> {key=}") return self.default() + self.log.info(f"BigTableStore: Will raise KeyError in __missing__ -> {key=}") raise KeyError(key) def _has_key(self, key: KT) -> bool: From 0a14f0458907d11dc14e5561116bcf3f719858bf Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 15:33:37 +0100 Subject: [PATCH 244/616] different _contains abroach --- faust/stores/bigtable.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b3552d155..d4e201494 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -535,7 +535,7 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - if not self.app.conf.store_check_exists: + if not self.app.conf.store_check_exists or self.table.default is not None: return True partition = self._maybe_get_partition_from_message() if partition is not None: @@ -544,8 +544,6 @@ def _contains(self, key: bytes) -> bool: ) found = self._cache.contains(key_with_partition) if found is None: - if self.table.default is not None: - return True found = self._bigtable_get(key_with_partition) is not None return found else: @@ -558,8 +556,6 @@ def _contains(self, key: bytes) -> bool: found = self._cache.contains_any(keys_to_search) if found is None: - if self.table.default is not None: - return True found = ( self._bigtable_get_range(keys_to_search)[1] is not None ) From c07e5827963a68ae0c79265cc7d731b89c002e7b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 15:52:00 +0100 Subject: [PATCH 245/616] only search bigtable in contains --- faust/stores/bigtable.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d4e201494..5a46ad5f8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -535,14 +535,15 @@ def _size(self) -> int: def _contains(self, key: bytes) -> bool: try: - if not self.app.conf.store_check_exists or self.table.default is not None: + if not self.app.conf.store_check_exists: return True partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_key_with_partition( key, partition=partition ) - found = self._cache.contains(key_with_partition) + # found = self._cache.contains(key_with_partition) + found = None if found is None: found = self._bigtable_get(key_with_partition) is not None return found @@ -554,7 +555,8 @@ def _contains(self, key: bytes) -> bool: ) keys_to_search.add(key_with_partition) - found = self._cache.contains_any(keys_to_search) + # found = self._cache.contains_any(keys_to_search) + found = None if found is None: found = ( self._bigtable_get_range(keys_to_search)[1] is not None From 2c23d83af5d11300e3c31df38912574bc2071b45 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 16:27:05 +0100 Subject: [PATCH 246/616] enabel key cache but only return true if found --- faust/stores/bigtable.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5a46ad5f8..77d93752a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -542,9 +542,8 @@ def _contains(self, key: bytes) -> bool: key_with_partition = self._get_key_with_partition( key, partition=partition ) - # found = self._cache.contains(key_with_partition) - found = None - if found is None: + found = self._cache.contains(key_with_partition) + if found is not True: found = self._bigtable_get(key_with_partition) is not None return found else: @@ -555,9 +554,8 @@ def _contains(self, key: bytes) -> bool: ) keys_to_search.add(key_with_partition) - # found = self._cache.contains_any(keys_to_search) - found = None - if found is None: + found = self._cache.contains_any(keys_to_search) + if found is not True: found = ( self._bigtable_get_range(keys_to_search)[1] is not None ) From 1332cdd22bc19ca3989eed98eeb86017faf83f2f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 14 Nov 2022 16:46:04 +0100 Subject: [PATCH 247/616] removed logs from missing --- faust/tables/table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/faust/tables/table.py b/faust/tables/table.py index 970ea1423..a38d32859 100644 --- a/faust/tables/table.py +++ b/faust/tables/table.py @@ -53,9 +53,7 @@ def tumbling( def __missing__(self, key: KT) -> VT: if self.default is not None: - self.log.info(f"BigTableStore: Will return default in __missing__ -> {key=}") return self.default() - self.log.info(f"BigTableStore: Will raise KeyError in __missing__ -> {key=}") raise KeyError(key) def _has_key(self, key: KT) -> bool: From 168408483712fda433d10ad2734274aac0f6ecdb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 07:39:27 +0100 Subject: [PATCH 248/616] added logs --- faust/stores/bigtable.py | 4 ++++ faust/tables/table.py | 1 + 2 files changed, 5 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 77d93752a..f5e4ab476 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -543,8 +543,10 @@ def _contains(self, key: bytes) -> bool: key, partition=partition ) found = self._cache.contains(key_with_partition) + self.log.info(f" [{self.table_name}] Key {key} in keycache: {found}") if found is not True: found = self._bigtable_get(key_with_partition) is not None + self.log.info(f" [{self.table_name}] Key {key} in table: {found}") return found else: keys_to_search = set() @@ -555,10 +557,12 @@ def _contains(self, key: bytes) -> bool: keys_to_search.add(key_with_partition) found = self._cache.contains_any(keys_to_search) + self.log.info(f" [{self.table_name}] Key {key} in keycache: {found} (ALL PARTITIONS)") if found is not True: found = ( self._bigtable_get_range(keys_to_search)[1] is not None ) + self.log.info(f" [{self.table_name}] Key {key} in table: {found} (ALL PARTITIONS)") return found except Exception as ex: self.log.error( diff --git a/faust/tables/table.py b/faust/tables/table.py index a38d32859..52a293f93 100644 --- a/faust/tables/table.py +++ b/faust/tables/table.py @@ -53,6 +53,7 @@ def tumbling( def __missing__(self, key: KT) -> VT: if self.default is not None: + self.log.info(f" Key {key} will return default in __missing__") return self.default() raise KeyError(key) From 1ddf095fe2aba68c331e139b28e43efa603cb90f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 09:19:33 +0100 Subject: [PATCH 249/616] add missing keys to key_cache --- faust/stores/bigtable.py | 105 ++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 41 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f5e4ab476..140ef579f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -2,7 +2,17 @@ import logging import time import traceback -from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + Optional, + Set, + Tuple, + Union, +) from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -72,16 +82,24 @@ def keys(self): class BigTableKeyCache: - _keys: Set[bytes] = set() + existing_keys: Set[bytes] = set() + missing_keys: Set[bytes] = set() def add(self, key: bytes): - self._keys.add(key) + self.existing_keys.add(key) + self.missing_keys.discard(key) def discard(self, key: bytes): - self._keys.discard(key) - - def exists(self, key: bytes) -> bool: - return key in self._keys + self.existing_keys.discard(key) + self.missing_keys.add(key) + + def exists(self, key: bytes) -> Optional[bool]: + if key in self.existing_keys: + return True + elif key in self.missing_keys: + return False + else: + return None def _register_partition(func): @@ -109,7 +127,9 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._init_key_cache(options) self.partition_prefixes: Dict[int, bytes] - def _fill_caches(self, partitions: Set[int]): + def _fill_value_cache(self, partitions: Set[int]): + if self._value_cache is None: + return partitions = partitions.difference(self._registered_partitions) if len(partitions) == 0: @@ -129,8 +149,6 @@ def _fill_caches(self, partitions: Set[int]): if isinstance(self._value_cache, BigtableStartupCache): row_val = BigTableStore.bigtable_exrtact_row_data(row) self._value_cache.data[row.row_key] = row_val - if self._key_cache is not None: - self._key_cache.add(row.row_key) self._registered_partitions.update(partitions) td = time.time() - start self.log.info( @@ -139,9 +157,7 @@ def _fill_caches(self, partitions: Set[int]): ) @_register_partition - def get( - self, bt_key: bytes - ) -> Optional[bytes]: + def get(self, bt_key: bytes) -> Optional[bytes]: value = None if self._value_cache is not None: if bt_key in self._value_cache.keys(): @@ -154,7 +170,6 @@ def get_partition(self, user_key: bytes) -> int: def set_partition(self, user_key: bytes, partition: int): self._partition_cache[user_key] = partition - @_register_partition def contains(self, bt_key: bytes) -> Optional[bool]: """ If we return None here, this means, that no assumption @@ -162,31 +177,28 @@ def contains(self, bt_key: bytes) -> Optional[bool]: """ if self._key_cache is not None: return self._key_cache.exists(bt_key) - if ( - isinstance(self._value_cache, BigtableStartupCache) - and not self._value_cache.ttl_over - ): - return bt_key in self._value_cache.keys() return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: - partitions = {k[0] for k in key_set} - self._fill_caches(partitions) if self._key_cache is not None: - return not self._key_cache._keys.isdisjoint(key_set) - if ( - isinstance(self._value_cache, BigtableStartupCache) - and not self._value_cache.ttl_over - ): - return not self._value_cache.keys().isdisjoint(key_set) + if not self._key_cache.existing_keys.isdisjoint(key_set): + return True + if not self._key_cache.missing_keys.isdisjoint(key_set): + return False # No assumption possible return None + def add_key(self, bt_key: bytes): + if self._key_cache is not None: + self._key_cache.add(bt_key) + + def discard_key(self, bt_key: bytes): + if self._key_cache is not None: + self._key_cache.discard(bt_key) + @_register_partition - def set( - self, bt_key: bytes, value: Optional[bytes] - ) -> None: + def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: self._value_cache[bt_key] = value if self._key_cache is not None: @@ -380,7 +392,9 @@ def _get_partition_prefix(self, partition: int) -> bytes: return b"".join([partition_bytes, self.partition_prefix]) def _remove_partition_prefix(self, key: bytes) -> bytes: - slice_from = key.find(self.partition_prefix) + len(self.partition_prefix) + slice_from = key.find(self.partition_prefix) + len( + self.partition_prefix + ) return key[slice_from:] def _get_key_with_partition(self, key: bytes, partition: int) -> bytes: @@ -509,7 +523,9 @@ def _iterkeys(self) -> Iterator[bytes]: ): yield self._remove_partition_prefix(row.row_key) end = time.time() - self.log.info(f"Finished iterkeys for {self.table_name} in {end - start}s") + self.log.info( + f"Finished iterkeys for {self.table_name} in {end - start}s" + ) except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -543,10 +559,12 @@ def _contains(self, key: bytes) -> bool: key, partition=partition ) found = self._cache.contains(key_with_partition) - self.log.info(f" [{self.table_name}] Key {key} in keycache: {found}") - if found is not True: + if found is None: found = self._bigtable_get(key_with_partition) is not None - self.log.info(f" [{self.table_name}] Key {key} in table: {found}") + if found is True: + self._cache.add_key(key_with_partition) + else: + self._cache.discard_key(key_with_partition) return found else: keys_to_search = set() @@ -557,12 +575,15 @@ def _contains(self, key: bytes) -> bool: keys_to_search.add(key_with_partition) found = self._cache.contains_any(keys_to_search) - self.log.info(f" [{self.table_name}] Key {key} in keycache: {found} (ALL PARTITIONS)") if found is not True: - found = ( - self._bigtable_get_range(keys_to_search)[1] is not None - ) - self.log.info(f" [{self.table_name}] Key {key} in table: {found} (ALL PARTITIONS)") + bt_key, value = self._bigtable_get_range(keys_to_search) + if value is not None: + self._cache.add_key(bt_key) + found = True + else: + for k in keys_to_search: + self._cache.discard_key(k) + found = False return found except Exception as ex: self.log.error( @@ -658,7 +679,9 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - offset_key = self._get_key_with_partition(msg.key, partition=tp.partition) + offset_key = self._get_key_with_partition( + msg.key, partition=tp.partition + ) row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() From 22fa51264b1ee65a959bfccb3a8906851a636933 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 09:20:05 +0100 Subject: [PATCH 250/616] removed log --- faust/tables/table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/tables/table.py b/faust/tables/table.py index 52a293f93..a38d32859 100644 --- a/faust/tables/table.py +++ b/faust/tables/table.py @@ -53,7 +53,6 @@ def tumbling( def __missing__(self, key: KT) -> VT: if self.default is not None: - self.log.info(f" Key {key} will return default in __missing__") return self.default() raise KeyError(key) From b1b9b60417cb030aa56efa3fff9939443eac71a0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 09:41:55 +0100 Subject: [PATCH 251/616] fixed wrapper --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 140ef579f..2721bff9f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -105,7 +105,7 @@ def exists(self, key: bytes) -> Optional[bool]: def _register_partition(func): def inner(self, bt_key: bytes, *args): partition = bt_key[0] - self._fill_caches({partition}) + self._fill_value_cache({partition}) return func(self, bt_key, *args) return inner From 2d759862cb555fc5b3480d1a87df5dd56b5340b7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 10:54:14 +0100 Subject: [PATCH 252/616] adjusted caches --- faust/stores/bigtable.py | 129 +++++++++++++++------------------------ 1 file changed, 50 insertions(+), 79 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2721bff9f..963c4f673 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -35,15 +35,19 @@ def get_current_partition(): return event.message.partition -class BigtableStartupCache: +class BigTableValueCache: """ This is a dictionary which is only filled once, after that, every successful access to a key, will remove it. """ + data: Union[Dict, LRUCache] - def __init__(self, ttl: Optional[int]) -> None: + def __init__(self, ttl: Optional[int], size: Optional[int]) -> None: self.log = logging.getLogger(self.__class__.__name__) - self.data: Dict = {} + if size is not None: + self.data = LRUCache(limit=size) + else: + self.data = dict() self.ttl = ttl self.ttl_over = False self.init_ts = int(time.time()) @@ -52,7 +56,7 @@ def __len__(self): return len(self.data) def __getitem__(self, key): - if self.ttl is not None: + if self.ttl is not None or self.ttl == -1: res = self.data[key] self._maybe_ttl_clear() return res @@ -67,7 +71,7 @@ def __delitem__(self, key): self.data.pop(key, None) def _maybe_ttl_clear(self): - if self.ttl is not None: + if self.ttl is not None and self.ttl != -1: now = int(time.time()) if now > self.init_ts + self.ttl: self.data = {} @@ -102,61 +106,21 @@ def exists(self, key: bytes) -> Optional[bool]: return None -def _register_partition(func): - def inner(self, bt_key: bytes, *args): - partition = bt_key[0] - self._fill_value_cache({partition}) - return func(self, bt_key, *args) - - return inner - - class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[ - Union[LRUCache[bytes, Union[bytes, None]], BigtableStartupCache] + Union[LRUCache[bytes, Union[bytes, None]], BigTableValueCache] ] _key_cache: Optional[BigTableKeyCache] def __init__(self, app, options: Dict, bt_table: Table) -> None: self.log = logging.getLogger(__name__) - self._registered_partitions = set() self.bt_table = bt_table self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_key_cache(options) self.partition_prefixes: Dict[int, bytes] - def _fill_value_cache(self, partitions: Set[int]): - if self._value_cache is None: - return - - partitions = partitions.difference(self._registered_partitions) - if len(partitions) == 0: - return # Nothing todo - start = time.time() - row_set = RowSet() - for p in partitions: - row_set.add_row_range_with_prefix(chr(p)) - - for row in self.bt_table.read_rows( - row_set=row_set, - filter_=CellsColumnLimitFilter(1), - retry=DEFAULT_RETRY_READ_ROWS.with_deadline( - 10 * 60 - ), # High deadline cause slow - ): - if isinstance(self._value_cache, BigtableStartupCache): - row_val = BigTableStore.bigtable_exrtact_row_data(row) - self._value_cache.data[row.row_key] = row_val - self._registered_partitions.update(partitions) - td = time.time() - start - self.log.info( - f"BigTableStore: filled cache for {self.bt_table.table_id}:" - f"{partitions=} in {td}s" - ) - - @_register_partition def get(self, bt_key: bytes) -> Optional[bytes]: value = None if self._value_cache is not None: @@ -164,6 +128,18 @@ def get(self, bt_key: bytes) -> Optional[bytes]: value = self._value_cache[bt_key] return value + def set(self, bt_key: bytes, value: Optional[bytes]) -> None: + if self._value_cache is not None: + self._value_cache[bt_key] = value + if self._key_cache is not None: + self._key_cache.add(bt_key) + + def delete(self, bt_key: bytes) -> None: + if self._value_cache is not None: + del self._value_cache[bt_key] + if self._key_cache is not None: + self._key_cache.discard(bt_key) + def get_partition(self, user_key: bytes) -> int: return self._partition_cache[user_key] @@ -185,7 +161,6 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: return True if not self._key_cache.missing_keys.isdisjoint(key_set): return False - # No assumption possible return None @@ -196,23 +171,12 @@ def add_key(self, bt_key: bytes): def discard_key(self, bt_key: bytes): if self._key_cache is not None: self._key_cache.discard(bt_key) - - @_register_partition - def set(self, bt_key: bytes, value: Optional[bytes]) -> None: - if self._value_cache is not None: - self._value_cache[bt_key] = value - if self._key_cache is not None: - self._key_cache.add(bt_key) - - def delete(self, bt_key: bytes) -> None: if self._value_cache is not None: del self._value_cache[bt_key] - if self._key_cache is not None: - self._key_cache.discard(bt_key) def _init_value_cache( self, options - ) -> Optional[Union[LRUCache, BigtableStartupCache]]: + ) -> Optional[Union[LRUCache, BigTableValueCache]]: value_cache_type = options.get( BigTableStore.VALUE_CACHE_TYPE_KEY, None ) @@ -221,7 +185,7 @@ def _init_value_cache( startup_cache_ttl = options.get( BigTableStore.STARTUPCACHE_TTL_KEY, None ) - self._value_cache = BigtableStartupCache(startup_cache_ttl) + self._value_cache = BigTableValueCache(startup_cache_ttl) elif value_cache_type == "forever": value_cache_size = options.get( BigTableStore.VALUE_CACHE_SIZE_KEY, 1_000 @@ -336,6 +300,30 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: value = self.bigtable_exrtact_row_data(res) return value + def _bigtable_contains(self, key: bytes) -> bool: + cache_res = self._cache.contains(key) + if cache_res is not None: + return cache_res + row = self.bt_table.read_row(key, filter_=self.row_filter) + if row is not None: + self._cache.set(key, self.bigtable_exrtact_row_data(row)) + return True + self._cache.discard_key(key) + return False + + def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: + rows = RowSet() + for key in keys: + rows.add_row_key(key) + + for row in self.bt_table.read_rows( + row_set=rows, filter_=CellsColumnLimitFilter(1) + ): + # First hit will return + self._cache.set(row.row_key, self.bigtable_exrtact_row_data(row)) + return True + return False + def _bigtable_get_range( self, keys: Set[bytes] ) -> Tuple[Optional[bytes], Optional[bytes]]: @@ -558,14 +546,7 @@ def _contains(self, key: bytes) -> bool: key_with_partition = self._get_key_with_partition( key, partition=partition ) - found = self._cache.contains(key_with_partition) - if found is None: - found = self._bigtable_get(key_with_partition) is not None - if found is True: - self._cache.add_key(key_with_partition) - else: - self._cache.discard_key(key_with_partition) - return found + return self._bigtable_contains(key_with_partition) else: keys_to_search = set() for partition in self._partitions_for_key(key): @@ -574,17 +555,7 @@ def _contains(self, key: bytes) -> bool: ) keys_to_search.add(key_with_partition) - found = self._cache.contains_any(keys_to_search) - if found is not True: - bt_key, value = self._bigtable_get_range(keys_to_search) - if value is not None: - self._cache.add_key(bt_key) - found = True - else: - for k in keys_to_search: - self._cache.discard_key(k) - found = False - return found + return self._bigtable_contains_any(keys_to_search) except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " From 47a68168f3c05f6e03fd236001ec23768f0fae72 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 11:03:30 +0100 Subject: [PATCH 253/616] adjusted value cache --- faust/stores/bigtable.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 963c4f673..613e0532b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -40,14 +40,15 @@ class BigTableValueCache: This is a dictionary which is only filled once, after that, every successful access to a key, will remove it. """ + data: Union[Dict, LRUCache] - def __init__(self, ttl: Optional[int], size: Optional[int]) -> None: + def __init__(self, ttl=-1, size: Optional[int] = None) -> None: self.log = logging.getLogger(self.__class__.__name__) if size is not None: self.data = LRUCache(limit=size) else: - self.data = dict() + self.data = {} self.ttl = ttl self.ttl_over = False self.init_ts = int(time.time()) @@ -56,7 +57,7 @@ def __len__(self): return len(self.data) def __getitem__(self, key): - if self.ttl is not None or self.ttl == -1: + if not self.ttl_over: res = self.data[key] self._maybe_ttl_clear() return res @@ -64,18 +65,17 @@ def __getitem__(self, key): def __setitem__(self, key, value) -> None: if value is not None: self._maybe_ttl_clear() - if self.ttl is not None: + if not self.ttl_over: self.data[key] = value def __delitem__(self, key): self.data.pop(key, None) def _maybe_ttl_clear(self): - if self.ttl is not None and self.ttl != -1: + if self.ttl != -1: now = int(time.time()) if now > self.init_ts + self.ttl: self.data = {} - self.ttl = None self.log.info( "BigTableStore: Cleard startupcache because TTL is over" ) @@ -164,16 +164,6 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: # No assumption possible return None - def add_key(self, bt_key: bytes): - if self._key_cache is not None: - self._key_cache.add(bt_key) - - def discard_key(self, bt_key: bytes): - if self._key_cache is not None: - self._key_cache.discard(bt_key) - if self._value_cache is not None: - del self._value_cache[bt_key] - def _init_value_cache( self, options ) -> Optional[Union[LRUCache, BigTableValueCache]]: @@ -185,13 +175,15 @@ def _init_value_cache( startup_cache_ttl = options.get( BigTableStore.STARTUPCACHE_TTL_KEY, None ) - self._value_cache = BigTableValueCache(startup_cache_ttl) + self._value_cache = BigTableValueCache(ttl=startup_cache_ttl) elif value_cache_type == "forever": value_cache_size = options.get( BigTableStore.VALUE_CACHE_SIZE_KEY, 1_000 ) - self._value_cache = LRUCache(limit=value_cache_size) + self._value_cache = BigTableValueCache( + ttl=-1, size=value_cache_size + ) else: self._value_cache = None @@ -308,7 +300,7 @@ def _bigtable_contains(self, key: bytes) -> bool: if row is not None: self._cache.set(key, self.bigtable_exrtact_row_data(row)) return True - self._cache.discard_key(key) + self._cache.delete(key) return False def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: From 0c63836c1f612a30c8753349ddb26460d7132ccb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 11:13:25 +0100 Subject: [PATCH 254/616] removed key cache --- faust/stores/bigtable.py | 58 +++++++++++----------------------------- 1 file changed, 15 insertions(+), 43 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 613e0532b..defe49abf 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -85,33 +85,12 @@ def keys(self): return self.data.keys() -class BigTableKeyCache: - existing_keys: Set[bytes] = set() - missing_keys: Set[bytes] = set() - - def add(self, key: bytes): - self.existing_keys.add(key) - self.missing_keys.discard(key) - - def discard(self, key: bytes): - self.existing_keys.discard(key) - self.missing_keys.add(key) - - def exists(self, key: bytes) -> Optional[bool]: - if key in self.existing_keys: - return True - elif key in self.missing_keys: - return False - else: - return None - - class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[ Union[LRUCache[bytes, Union[bytes, None]], BigTableValueCache] ] - _key_cache: Optional[BigTableKeyCache] + _key_cache: Optional[Set] def __init__(self, app, options: Dict, bt_table: Table) -> None: self.log = logging.getLogger(__name__) @@ -152,38 +131,30 @@ def contains(self, bt_key: bytes) -> Optional[bool]: about the current key can be made. """ if self._key_cache is not None: - return self._key_cache.exists(bt_key) + return bt_key in self._key_cache return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: if self._key_cache is not None: - if not self._key_cache.existing_keys.isdisjoint(key_set): + if not self._key_cache.isdisjoint(key_set): return True - if not self._key_cache.missing_keys.isdisjoint(key_set): - return False # No assumption possible return None def _init_value_cache( self, options ) -> Optional[Union[LRUCache, BigTableValueCache]]: - value_cache_type = options.get( - BigTableStore.VALUE_CACHE_TYPE_KEY, None + enable = options.get( + BigTableStore.VALUE_CACHE_ENABLE_KEY, False ) - - if value_cache_type == "startup": - startup_cache_ttl = options.get( - BigTableStore.STARTUPCACHE_TTL_KEY, None + if enable: + ttl = options.get( + BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1 ) - self._value_cache = BigTableValueCache(ttl=startup_cache_ttl) - elif value_cache_type == "forever": - value_cache_size = options.get( - BigTableStore.VALUE_CACHE_SIZE_KEY, 1_000 - ) - - self._value_cache = BigTableValueCache( - ttl=-1, size=value_cache_size + size = options.get( + BigTableStore.VALUE_CACHE_SIZE_KEY, None ) + self._value_cache = BigTableValueCache(ttl=ttl, size=size) else: self._value_cache = None @@ -192,7 +163,7 @@ def _init_key_cache(self, options: Dict): BigTableStore.KEY_CACHE_ENABLE_KEY, False ) if key_cache_enabled: - self._key_cache = BigTableKeyCache() + self._key_cache = set() else: self._key_cache = None @@ -213,9 +184,9 @@ class BigTableStore(base.SerializedStore): BT_ROW_FILTERS_KEY = "bt_row_filter_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" KEY_CACHE_ENABLE_KEY = "key_cache_enable_key" - STARTUPCACHE_TTL_KEY = "startupcache_ttl_key" + VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" - VALUE_CACHE_TYPE_KEY = "value_cache_type_key" + VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" def __init__( self, @@ -300,6 +271,7 @@ def _bigtable_contains(self, key: bytes) -> bool: if row is not None: self._cache.set(key, self.bigtable_exrtact_row_data(row)) return True + # Just to be sure self._cache.delete(key) return False From c4f3c7dd22e4b2d855cd30e118a18f348b43028e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 11:17:40 +0100 Subject: [PATCH 255/616] fixed caching --- faust/stores/bigtable.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index defe49abf..8196ac7b5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -144,16 +144,12 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: def _init_value_cache( self, options ) -> Optional[Union[LRUCache, BigTableValueCache]]: - enable = options.get( - BigTableStore.VALUE_CACHE_ENABLE_KEY, False - ) + enable = options.get(BigTableStore.VALUE_CACHE_ENABLE_KEY, False) if enable: ttl = options.get( BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1 ) - size = options.get( - BigTableStore.VALUE_CACHE_SIZE_KEY, None - ) + size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) self._value_cache = BigTableValueCache(ttl=ttl, size=size) else: self._value_cache = None @@ -181,7 +177,6 @@ class BigTableStore(base.SerializedStore): BT_INSTANCE_KEY = "bt_instance_key" BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_PROJECT_KEY = "bt_project_key" - BT_ROW_FILTERS_KEY = "bt_row_filter_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" KEY_CACHE_ENABLE_KEY = "key_cache_enable_key" VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" @@ -212,9 +207,7 @@ def _set_options(self, options) -> None: self.column_name = options.get( BigTableStore.BT_COLUMN_NAME_KEY, "DATA" ) - self.row_filter = options.get( - BigTableStore.BT_ROW_FILTERS_KEY, CellsColumnLimitFilter(1) - ) + self.row_filter = CellsColumnLimitFilter(1) self.offset_key_prefix = options.get( BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" ) @@ -261,6 +254,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: value = None else: value = self.bigtable_exrtact_row_data(res) + self._cache.set(key, value) return value def _bigtable_contains(self, key: bytes) -> bool: @@ -305,7 +299,10 @@ def _bigtable_get_range( row_set=rows, filter_=CellsColumnLimitFilter(1) ): # First hit will return - return row.row_key, BigTableStore.bigtable_exrtact_row_data(row) + val = self.bigtable_exrtact_row_data(row) + self._cache.set(row.row_key, val) + return row.row_key, val + # Not found return None, None From cf6f55cbe766bc11209ef437d4084425d393d119 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 12:24:21 +0100 Subject: [PATCH 256/616] insert in cache in iterkeys --- faust/stores/bigtable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8196ac7b5..6948bd942 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -258,9 +258,6 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: return value def _bigtable_contains(self, key: bytes) -> bool: - cache_res = self._cache.contains(key) - if cache_res is not None: - return cache_res row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: self._cache.set(key, self.bigtable_exrtact_row_data(row)) @@ -470,6 +467,8 @@ def _iterkeys(self) -> Iterator[bytes]: for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): + if self._cache._value_cache is not None: + self._cache.set(row.row_key, self.bigtable_exrtact_row_data(row)) yield self._remove_partition_prefix(row.row_key) end = time.time() self.log.info( From 2f4e117fd54fb97ad00d5cf719e020928ea783b2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 12:37:16 +0100 Subject: [PATCH 257/616] add contains again --- faust/stores/bigtable.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6948bd942..dc27d6d43 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -131,13 +131,20 @@ def contains(self, bt_key: bytes) -> Optional[bool]: about the current key can be made. """ if self._key_cache is not None: - return bt_key in self._key_cache + if bt_key in self._key_cache: + return True + if self._value_cache is not None: + if bt_key in self._value_cache: + return True return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: if self._key_cache is not None: if not self._key_cache.isdisjoint(key_set): return True + if self._value_cache is not None: + if not self._value_cache.keys().isdisjoint(key_set): + return True # No assumption possible return None @@ -158,8 +165,10 @@ def _init_key_cache(self, options: Dict): key_cache_enabled = options.get( BigTableStore.KEY_CACHE_ENABLE_KEY, False ) - if key_cache_enabled: - self._key_cache = set() + # We don't need a key cache if we use a value cache already + if self._value_cache is None and key_cache_enabled: + if key_cache_enabled: + self._key_cache = set() else: self._key_cache = None @@ -258,6 +267,9 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: return value def _bigtable_contains(self, key: bytes) -> bool: + if self._cache.contains(key) is True: + # Never risk, false negatives + return True row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: self._cache.set(key, self.bigtable_exrtact_row_data(row)) @@ -268,6 +280,9 @@ def _bigtable_contains(self, key: bytes) -> bool: def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: rows = RowSet() + if self._cache.contains_any(keys) is True: + # Never risk, false negatives + return True for key in keys: rows.add_row_key(key) From b099b9752f00a4957d479168a8aab07c73cc51aa Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 15 Nov 2022 13:40:47 +0100 Subject: [PATCH 258/616] fixed wrong cache access --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index dc27d6d43..00fc23357 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -134,7 +134,7 @@ def contains(self, bt_key: bytes) -> Optional[bool]: if bt_key in self._key_cache: return True if self._value_cache is not None: - if bt_key in self._value_cache: + if bt_key in self._value_cache.keys(): return True return None From 90a5d50743f4960bb5f5f2f229d1f84ce8d8f13b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 16 Nov 2022 12:38:50 +0100 Subject: [PATCH 259/616] removed log if not needed --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 00fc23357..f4691b8ba 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -72,7 +72,7 @@ def __delitem__(self, key): self.data.pop(key, None) def _maybe_ttl_clear(self): - if self.ttl != -1: + if self.ttl != -1 and self.ttl_over is False: now = int(time.time()) if now > self.init_ts + self.ttl: self.data = {} From 6dfd2054dceba71203b305a821732a77f232fdd9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 09:58:29 +0100 Subject: [PATCH 260/616] try to use cache only --- faust/stores/bigtable.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f4691b8ba..9b73d4bd8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -17,7 +17,6 @@ from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance -from google.cloud.bigtable.row_data import DEFAULT_RETRY_READ_ROWS from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.table import Table @@ -87,9 +86,7 @@ def keys(self): class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] - _value_cache: Optional[ - Union[LRUCache[bytes, Union[bytes, None]], BigTableValueCache] - ] + _value_cache: Optional[BigTableValueCache] _key_cache: Optional[Set] def __init__(self, app, options: Dict, bt_table: Table) -> None: @@ -158,6 +155,10 @@ def _init_value_cache( ) size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) self._value_cache = BigTableValueCache(ttl=ttl, size=size) + # We should optimize here to load only values of active partitions + for row in self.bt_table.read_rows(b"\x00__", b"\x14__"): + value = BigTableStore.bigtable_exrtact_row_data(row) + self._value_cache[row.row_key] = value else: self._value_cache = None From 3ce2b6d2c6ca6f16040f268b1f1a2e984f7f4e64 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 10:32:15 +0100 Subject: [PATCH 261/616] added cache only mode --- faust/stores/bigtable.py | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9b73d4bd8..07e1d30b8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -96,9 +96,37 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._init_value_cache(options) self._init_key_cache(options) self.partition_prefixes: Dict[int, bytes] + self._filled_partitions: Set[int] = set() + + def _fill_if_empty(self, bt_keys: Set[bytes]): + partitions = set() + for k in bt_keys: + partitions.add(k[0]) + partitions_to_fill = partitions.difference(self._filled_partitions) + + # THIS ONLY WORKS IF THE FIRST BYTE OF THE KEY IS THE PARTITION + row_set = RowSet() + for partition in partitions_to_fill: + row_set.add_row_range_from_keys( + start_key=chr(partition), end_key=chr(partition + 1) + ) + + if self._value_cache is not None: + for row in self.bt_table.read_rows( + row_set=row_set, filter_=CellsColumnLimitFilter(1) + ): + value = BigTableStore.bigtable_exrtact_row_data(row) + self._value_cache[row.row_key] = value + elif self._key_cache is not None: + for row in self.bt_table.read_rows( + row_set=row_set, filter_=CellsColumnLimitFilter(1) + ): + self._key_cache.add(row.row_key) + self._filled_partitions.add(partitions_to_fill) def get(self, bt_key: bytes) -> Optional[bytes]: value = None + self._fill_if_empty({bt_key}) if self._value_cache is not None: if bt_key in self._value_cache.keys(): value = self._value_cache[bt_key] @@ -123,6 +151,7 @@ def set_partition(self, user_key: bytes, partition: int): self._partition_cache[user_key] = partition def contains(self, bt_key: bytes) -> Optional[bool]: + self._fill_if_empty({bt_key}) """ If we return None here, this means, that no assumption about the current key can be made. @@ -136,6 +165,7 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: + self._fill_if_empty(key_set) if self._key_cache is not None: if not self._key_cache.isdisjoint(key_set): return True @@ -155,10 +185,6 @@ def _init_value_cache( ) size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) self._value_cache = BigTableValueCache(ttl=ttl, size=size) - # We should optimize here to load only values of active partitions - for row in self.bt_table.read_rows(b"\x00__", b"\x14__"): - value = BigTableStore.bigtable_exrtact_row_data(row) - self._value_cache[row.row_key] = value else: self._value_cache = None @@ -484,7 +510,9 @@ def _iterkeys(self) -> Iterator[bytes]: row_set=row_set, filter_=self.row_filter ): if self._cache._value_cache is not None: - self._cache.set(row.row_key, self.bigtable_exrtact_row_data(row)) + self._cache.set( + row.row_key, self.bigtable_exrtact_row_data(row) + ) yield self._remove_partition_prefix(row.row_key) end = time.time() self.log.info( From f3563a880c5428d7198da1c129837a4c9a91d41f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 10:32:40 +0100 Subject: [PATCH 262/616] fixed add --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 07e1d30b8..c45945734 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -122,7 +122,7 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): row_set=row_set, filter_=CellsColumnLimitFilter(1) ): self._key_cache.add(row.row_key) - self._filled_partitions.add(partitions_to_fill) + self._filled_partitions.update(partitions_to_fill) def get(self, bt_key: bytes) -> Optional[bytes]: value = None From 0da4df56da2037e572d784329a3e7b4939609e2a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 11:40:08 +0100 Subject: [PATCH 263/616] added logging --- faust/stores/bigtable.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c45945734..f6ec98c09 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -2,17 +2,7 @@ import logging import time import traceback -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - Optional, - Set, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -91,7 +81,7 @@ class BigTableCacheManager: def __init__(self, app, options: Dict, bt_table: Table) -> None: self.log = logging.getLogger(__name__) - self.bt_table = bt_table + self.bt_table: Table = bt_table self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_key_cache(options) @@ -103,14 +93,18 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): for k in bt_keys: partitions.add(k[0]) partitions_to_fill = partitions.difference(self._filled_partitions) + if len(partitions_to_fill) == 0: + return # THIS ONLY WORKS IF THE FIRST BYTE OF THE KEY IS THE PARTITION + row_set = RowSet() for partition in partitions_to_fill: row_set.add_row_range_from_keys( start_key=chr(partition), end_key=chr(partition + 1) ) + self.log.info(f"BigTableStore: Start filling cache for {self.bt_table.name} and partitions {partitions_to_fill}") if self._value_cache is not None: for row in self.bt_table.read_rows( row_set=row_set, filter_=CellsColumnLimitFilter(1) @@ -123,6 +117,7 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): ): self._key_cache.add(row.row_key) self._filled_partitions.update(partitions_to_fill) + self.log.info(f"BigTableStore: Finished filling cache for {self.bt_table.name} and partitions {partitions_to_fill}") def get(self, bt_key: bytes) -> Optional[bytes]: value = None From 3b234bb2fe0f6e016982d6d622c8e8c4b59e4597 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 11:41:12 +0100 Subject: [PATCH 264/616] formatting --- faust/stores/bigtable.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f6ec98c09..da6bc021d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -2,7 +2,17 @@ import logging import time import traceback -from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + Optional, + Set, + Tuple, + Union, +) from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -104,7 +114,10 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): start_key=chr(partition), end_key=chr(partition + 1) ) - self.log.info(f"BigTableStore: Start filling cache for {self.bt_table.name} and partitions {partitions_to_fill}") + self.log.info( + f"BigTableStore: Start filling cache for {self.bt_table.name} " + f"and partitions {partitions_to_fill}" + ) if self._value_cache is not None: for row in self.bt_table.read_rows( row_set=row_set, filter_=CellsColumnLimitFilter(1) @@ -117,7 +130,10 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): ): self._key_cache.add(row.row_key) self._filled_partitions.update(partitions_to_fill) - self.log.info(f"BigTableStore: Finished filling cache for {self.bt_table.name} and partitions {partitions_to_fill}") + self.log.info( + f"BigTableStore: Finished filling cache for {self.bt_table.name} " + f"and partitions {partitions_to_fill}" + ) def get(self, bt_key: bytes) -> Optional[bytes]: value = None From f5a33ff7c87c610502c71fdd9555df5d87100e7d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 12:12:11 +0100 Subject: [PATCH 265/616] faster iterkeys with cache enabled --- faust/stores/bigtable.py | 65 ++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index da6bc021d..4decbb7f4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -167,20 +167,21 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - if self._key_cache is not None: - if bt_key in self._key_cache: - return True if self._value_cache is not None: - if bt_key in self._value_cache.keys(): + return bt_key in self._value_cache.keys() + elif self._key_cache is not None: + # Keycache is not filled so no assumptions about missing keys + if bt_key in self._key_cache: return True return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: self._fill_if_empty(key_set) - if self._key_cache is not None: - if not self._key_cache.isdisjoint(key_set): - return True + if self._value_cache is not None: + return not self._key_cache.isdisjoint(key_set) + elif self._key_cache is not None: + # Keycache is not filled so no assumptions about missing keys if not self._value_cache.keys().isdisjoint(key_set): return True # No assumption possible @@ -305,9 +306,10 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: return value def _bigtable_contains(self, key: bytes) -> bool: - if self._cache.contains(key) is True: - # Never risk, false negatives - return True + cache_contains = self._cache.contains(key) + if cache_contains is not None: + return cache_contains + row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: self._cache.set(key, self.bigtable_exrtact_row_data(row)) @@ -317,10 +319,11 @@ def _bigtable_contains(self, key: bytes) -> bool: return False def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: + cache_contains = self._cache.contains_any(keys) + if cache_contains is not None: + return cache_contains + rows = RowSet() - if self._cache.contains_any(keys) is True: - # Never risk, false negatives - return True for key in keys: rows.add_row_key(key) @@ -511,20 +514,30 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: def _iterkeys(self) -> Iterator[bytes]: try: start = time.time() - row_set = RowSet() - for partition in self._active_partitions(): - prefix_start = self._get_partition_prefix(partition) - prefix_end = self._get_partition_prefix(partition + 1) - row_set.add_row_range_from_keys(prefix_start, prefix_end) + partitions = self._active_partitions() - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - if self._cache._value_cache is not None: - self._cache.set( - row.row_key, self.bigtable_exrtact_row_data(row) - ) - yield self._remove_partition_prefix(row.row_key) + if self._cache._value_cache is not None: + keys = set() + for p in partitions: + keys.add(self._get_partition_prefix(p)) + self._cache._fill_if_empty(keys) + for k in self._cache._value_cache.keys(): + yield self._remove_partition_prefix(k) + else: + row_set = RowSet() + for partition in partitions: + prefix_start = self._get_partition_prefix(partition) + prefix_end = self._get_partition_prefix(partition + 1) + row_set.add_row_range_from_keys(prefix_start, prefix_end) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + if self._cache._value_cache is not None: + self._cache.set( + row.row_key, self.bigtable_exrtact_row_data(row) + ) + yield self._remove_partition_prefix(row.row_key) end = time.time() self.log.info( f"Finished iterkeys for {self.table_name} in {end - start}s" From 3ece04180bf81a769831ebf62c0ba3b59e669bee Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 12:12:32 +0100 Subject: [PATCH 266/616] formatting --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4decbb7f4..702f06841 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -179,7 +179,7 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: self._fill_if_empty(key_set) if self._value_cache is not None: - return not self._key_cache.isdisjoint(key_set) + return not self._key_cache.isdisjoint(key_set) elif self._key_cache is not None: # Keycache is not filled so no assumptions about missing keys if not self._value_cache.keys().isdisjoint(key_set): From cacda8a5b3e9e057c5224bd0f8e1b080358a6029 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 13:08:42 +0100 Subject: [PATCH 267/616] fixed contains --- faust/stores/bigtable.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 702f06841..f40c28cee 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -115,7 +115,7 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): ) self.log.info( - f"BigTableStore: Start filling cache for {self.bt_table.name} " + f"BigTableStore: Filling cache for {self.bt_table.name} " f"and partitions {partitions_to_fill}" ) if self._value_cache is not None: @@ -130,10 +130,6 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): ): self._key_cache.add(row.row_key) self._filled_partitions.update(partitions_to_fill) - self.log.info( - f"BigTableStore: Finished filling cache for {self.bt_table.name} " - f"and partitions {partitions_to_fill}" - ) def get(self, bt_key: bytes) -> Optional[bytes]: value = None @@ -179,10 +175,10 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: self._fill_if_empty(key_set) if self._value_cache is not None: - return not self._key_cache.isdisjoint(key_set) + return not self._value_cache.isdisjoint(key_set) elif self._key_cache is not None: # Keycache is not filled so no assumptions about missing keys - if not self._value_cache.keys().isdisjoint(key_set): + if not self._key_cache.keys().isdisjoint(key_set): return True # No assumption possible return None From 7cfe91def40fe038ed518384a87251a066095fea Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 13:15:25 +0100 Subject: [PATCH 268/616] return None if not found in value cache --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f40c28cee..4270615a5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -289,7 +289,7 @@ def bigtable_exrtact_row_data(row_data): def _bigtable_get(self, key: bytes) -> Optional[bytes]: value = self._cache.get(key) - if value is not None: + if value is not None or self._cache._value_cache is not None: return value else: res = self.bt_table.read_row(key, filter_=self.row_filter) From 8d6e52b81436f3b214c6d67f0b8233ff9315b747 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 13:32:21 +0100 Subject: [PATCH 269/616] fixed contains any again --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4270615a5..fb38b28dc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -175,10 +175,10 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: self._fill_if_empty(key_set) if self._value_cache is not None: - return not self._value_cache.isdisjoint(key_set) + return not self._value_cache.keys().isdisjoint(key_set) elif self._key_cache is not None: # Keycache is not filled so no assumptions about missing keys - if not self._key_cache.keys().isdisjoint(key_set): + if not self._key_cache.isdisjoint(key_set): return True # No assumption possible return None From 5a0a4c4a3ac19af07bb5ac47627239483c370e9e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 18 Nov 2022 13:55:33 +0100 Subject: [PATCH 270/616] introduced is complete flag to determine if contains everything --- faust/stores/bigtable.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fb38b28dc..3a2bfe1bd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -97,6 +97,7 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._init_key_cache(options) self.partition_prefixes: Dict[int, bytes] self._filled_partitions: Set[int] = set() + self.is_complete = False def _fill_if_empty(self, bt_keys: Set[bytes]): partitions = set() @@ -164,7 +165,13 @@ def contains(self, bt_key: bytes) -> Optional[bool]: about the current key can be made. """ if self._value_cache is not None: - return bt_key in self._value_cache.keys() + + res = bt_key in self._value_cache.keys() + if self.is_complete: + return res + elif res is True: + return True + elif self._key_cache is not None: # Keycache is not filled so no assumptions about missing keys if bt_key in self._key_cache: @@ -175,7 +182,11 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: self._fill_if_empty(key_set) if self._value_cache is not None: - return not self._value_cache.keys().isdisjoint(key_set) + res = not self._value_cache.keys().isdisjoint(key_set) + if self.is_complete: + return res + elif res is True: + return True elif self._key_cache is not None: # Keycache is not filled so no assumptions about missing keys if not self._key_cache.isdisjoint(key_set): @@ -193,6 +204,8 @@ def _init_value_cache( ) size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) self._value_cache = BigTableValueCache(ttl=ttl, size=size) + if ttl == -1 and size is None: + self.is_complete = True else: self._value_cache = None @@ -289,7 +302,7 @@ def bigtable_exrtact_row_data(row_data): def _bigtable_get(self, key: bytes) -> Optional[bytes]: value = self._cache.get(key) - if value is not None or self._cache._value_cache is not None: + if value is not None or self._cache.is_complete: return value else: res = self.bt_table.read_row(key, filter_=self.row_filter) @@ -512,7 +525,7 @@ def _iterkeys(self) -> Iterator[bytes]: start = time.time() partitions = self._active_partitions() - if self._cache._value_cache is not None: + if self._cache.is_complete: keys = set() for p in partitions: keys.add(self._get_partition_prefix(p)) @@ -529,10 +542,6 @@ def _iterkeys(self) -> Iterator[bytes]: for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): - if self._cache._value_cache is not None: - self._cache.set( - row.row_key, self.bigtable_exrtact_row_data(row) - ) yield self._remove_partition_prefix(row.row_key) end = time.time() self.log.info( From cf0f51bf05944dcab0407c5703700e7c92933f84 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 25 Nov 2022 09:23:54 +0100 Subject: [PATCH 271/616] added mutation buffer --- faust/stores/bigtable.py | 86 ++++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3a2bfe1bd..2046b53b5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,24 +1,16 @@ """BigTable storage.""" import logging +import random import time import traceback -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - Optional, - Set, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet +from google.cloud.bigtable.row import DirectRow from google.cloud.bigtable.table import Table from mode.utils.collections import LRUCache from yarl import URL @@ -71,7 +63,7 @@ def __delitem__(self, key): self.data.pop(key, None) def _maybe_ttl_clear(self): - if self.ttl != -1 and self.ttl_over is False: + if self.ttl != -1 and not self.ttl_over: now = int(time.time()) if now > self.init_ts + self.ttl: self.data = {} @@ -88,6 +80,7 @@ class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[BigTableValueCache] _key_cache: Optional[Set] + _mutations: Dict[bytes, Tuple[DirectRow, Optional[bytes]]] def __init__(self, app, options: Dict, bt_table: Table) -> None: self.log = logging.getLogger(__name__) @@ -95,11 +88,13 @@ def __init__(self, app, options: Dict, bt_table: Table) -> None: self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_key_cache(options) + self._init_mutation_buffer(options) self.partition_prefixes: Dict[int, bytes] self._filled_partitions: Set[int] = set() self.is_complete = False def _fill_if_empty(self, bt_keys: Set[bytes]): + # THIS ONLY WORKS IF THE FIRST BYTE OF THE KEY IS THE PARTITION partitions = set() for k in bt_keys: partitions.add(k[0]) @@ -107,8 +102,6 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): if len(partitions_to_fill) == 0: return - # THIS ONLY WORKS IF THE FIRST BYTE OF THE KEY IS THE PARTITION - row_set = RowSet() for partition in partitions_to_fill: row_set.add_row_range_from_keys( @@ -145,12 +138,14 @@ def set(self, bt_key: bytes, value: Optional[bytes]) -> None: self._value_cache[bt_key] = value if self._key_cache is not None: self._key_cache.add(bt_key) + self._set_mutation(bt_key, value) def delete(self, bt_key: bytes) -> None: if self._value_cache is not None: del self._value_cache[bt_key] if self._key_cache is not None: self._key_cache.discard(bt_key) + self._set_mutation(bt_key, None) def get_partition(self, user_key: bytes) -> int: return self._partition_cache[user_key] @@ -194,6 +189,35 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: # No assumption possible return None + def flush_if_timer_over(self, tp: TP) -> bool: + now = time.time() + if now >= self._last_flush + self._mut_freq: + mutatations = [ + m[0] for m in self._mutations.values() if tp == m[0].row_key[0] + ] + response = self.bt_table.mutate_rows(mutatations) + for i, status in enumerate(response): + if status.code != 0: + self.log.error("Row number {} failed to write".format(i)) + return False # We don't want to clear the buffer on a failed write + else: + self._mutations.pop(mutatations[i].row_key) + return True + else: + return False + + def _set_mutation(self, bt_key: bytes, value: Optional[bytes]): + if bt_key in self._mutations.keys(): + row = self._mutations[bt_key][0] + else: + row = self.bt_table.direct_row(bt_key) + row.set_cell( + "FaustColumnFamily", # TODO: Define this globally + "DATA", + value, + ) + self._mutations[bt_key] = row, value + def _init_value_cache( self, options ) -> Optional[Union[LRUCache, BigTableValueCache]]: @@ -220,6 +244,13 @@ def _init_key_cache(self, options: Dict): else: self._key_cache = None + def _init_mutation_buffer(self, options): + self._mut_freq = options.get(BigTableStore.BT_MUTATION_FREQ_KEY, 0) + # To prevent that all tables write at the same time + random_start_offset = random.randint(0, self._mut_freq) + self._last_flush = time.time() + self._mut_freq - random_start_offset + self._mutations = {} + class BigTableStore(base.SerializedStore): """Bigtable table storage.""" @@ -235,6 +266,7 @@ class BigTableStore(base.SerializedStore): BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_PROJECT_KEY = "bt_project_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" + BT_MUTATION_FREQ_KEY = "bt_mutation_freq_key" KEY_CACHE_ENABLE_KEY = "key_cache_enable_key" VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" @@ -371,15 +403,17 @@ def _bigtable_get_range( def _bigtable_set( self, key: bytes, value: Optional[bytes], persist_offset=False ): - row = self.bt_table.direct_row(key) - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) - row.commit() if not persist_offset: + # All mutatations set here will be flushed to BT later self._cache.set(key, value) + else: + row = self.bt_table.direct_row(key) + row.set_cell( + self.column_family_id, + self.column_name, + value, + ) + row.commit() def _bigtable_del(self, key: bytes): row = self.bt_table.direct_row(key) @@ -639,10 +673,12 @@ def set_persisted_offset( we were not an active replica. """ try: - offset_key = self.get_offset_key(tp).encode() - self._bigtable_set( - offset_key, str(offset).encode(), persist_offset=True - ) + if recovery or self._cache.flush_if_timer_over(tp): + offset_key = self.get_offset_key(tp).encode() + self._bigtable_set( + offset_key, str(offset).encode(), persist_offset=True + ) + except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From 2fa61013cd6be0bf8bcf946ed6a631c657b46417 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 25 Nov 2022 10:02:49 +0100 Subject: [PATCH 272/616] fixed missing delete --- faust/stores/bigtable.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2046b53b5..df0dda506 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -211,11 +211,15 @@ def _set_mutation(self, bt_key: bytes, value: Optional[bytes]): row = self._mutations[bt_key][0] else: row = self.bt_table.direct_row(bt_key) - row.set_cell( - "FaustColumnFamily", # TODO: Define this globally - "DATA", - value, - ) + + if value is None: + row.delete() + else: + row.set_cell( + "FaustColumnFamily", # TODO: Define this globally + "DATA", + value, + ) self._mutations[bt_key] = row, value def _init_value_cache( From 76c99fff64fc426bce27364d14510b1115dcfdc4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 25 Nov 2022 11:00:53 +0100 Subject: [PATCH 273/616] removed set in contains --- faust/stores/bigtable.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index df0dda506..f2e899768 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -357,7 +357,6 @@ def _bigtable_contains(self, key: bytes) -> bool: row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: - self._cache.set(key, self.bigtable_exrtact_row_data(row)) return True # Just to be sure self._cache.delete(key) @@ -376,7 +375,6 @@ def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: row_set=rows, filter_=CellsColumnLimitFilter(1) ): # First hit will return - self._cache.set(row.row_key, self.bigtable_exrtact_row_data(row)) return True return False From e6b9bbd096e4e60513dbfa8f667513454211fad2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 25 Nov 2022 11:01:20 +0100 Subject: [PATCH 274/616] linter --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f2e899768..4caa05f58 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -371,7 +371,7 @@ def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: for key in keys: rows.add_row_key(key) - for row in self.bt_table.read_rows( + for _row in self.bt_table.read_rows( row_set=rows, filter_=CellsColumnLimitFilter(1) ): # First hit will return From d97482b44144dc23c3336466c9744ad3d8f4dec5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 25 Nov 2022 11:03:10 +0100 Subject: [PATCH 275/616] removed sets not in bigtable set --- faust/stores/bigtable.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4caa05f58..0e5b5e573 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -347,7 +347,6 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: value = None else: value = self.bigtable_exrtact_row_data(res) - self._cache.set(key, value) return value def _bigtable_contains(self, key: bytes) -> bool: @@ -396,7 +395,6 @@ def _bigtable_get_range( ): # First hit will return val = self.bigtable_exrtact_row_data(row) - self._cache.set(row.row_key, val) return row.row_key, val # Not found From a37471429ec8c8fce1adf7377f0ea7046a3ae4c8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 25 Nov 2022 11:51:22 +0100 Subject: [PATCH 276/616] removed race condition --- faust/stores/bigtable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0e5b5e573..7de4ddc32 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -4,6 +4,7 @@ import time import traceback from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union +from copy import deepcopy from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -192,8 +193,9 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: def flush_if_timer_over(self, tp: TP) -> bool: now = time.time() if now >= self._last_flush + self._mut_freq: + mutatations_copy = deepcopy(self._mutattions) mutatations = [ - m[0] for m in self._mutations.values() if tp == m[0].row_key[0] + m[0] for m in mutatations_copy.values() if tp == m[0].row_key[0] ] response = self.bt_table.mutate_rows(mutatations) for i, status in enumerate(response): From 7f2bbcf02ad804ef8d6fd3ea71ce4a7ca9507c87 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 25 Nov 2022 12:58:36 +0100 Subject: [PATCH 277/616] fixed typo --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7de4ddc32..4f303a440 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -193,7 +193,7 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: def flush_if_timer_over(self, tp: TP) -> bool: now = time.time() if now >= self._last_flush + self._mut_freq: - mutatations_copy = deepcopy(self._mutattions) + mutatations_copy = deepcopy(self._mutations) mutatations = [ m[0] for m in mutatations_copy.values() if tp == m[0].row_key[0] ] From 3a7760482e1f5d263b5c9588bb45615534851825 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 25 Nov 2022 13:33:08 +0100 Subject: [PATCH 278/616] removed deepcopy --- faust/stores/bigtable.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4f303a440..f51fb5b25 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -4,7 +4,6 @@ import time import traceback from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union -from copy import deepcopy from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -193,7 +192,7 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: def flush_if_timer_over(self, tp: TP) -> bool: now = time.time() if now >= self._last_flush + self._mut_freq: - mutatations_copy = deepcopy(self._mutations) + mutatations_copy = self._mutations.copy() mutatations = [ m[0] for m in mutatations_copy.values() if tp == m[0].row_key[0] ] From 226ad11fb87568bec71dbb9c4d4831cdb157f731 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 29 Nov 2022 09:48:43 +0100 Subject: [PATCH 279/616] added log --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f51fb5b25..bbb64efdd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -199,10 +199,10 @@ def flush_if_timer_over(self, tp: TP) -> bool: response = self.bt_table.mutate_rows(mutatations) for i, status in enumerate(response): if status.code != 0: - self.log.error("Row number {} failed to write".format(i)) - return False # We don't want to clear the buffer on a failed write + self.log.error(f"Row number {i} failed to write") else: self._mutations.pop(mutatations[i].row_key) + self.log.info(f"BigTableStore: flushed {len(mutatations)} rows") return True else: return False From c12f94a7a018fa47cfffad115f8899a4a7e7bff3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 29 Nov 2022 10:44:32 +0100 Subject: [PATCH 280/616] flush on mutations only --- faust/stores/bigtable.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bbb64efdd..e44cb94be 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -191,21 +191,23 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: def flush_if_timer_over(self, tp: TP) -> bool: now = time.time() + flushed = False if now >= self._last_flush + self._mut_freq: mutatations_copy = self._mutations.copy() mutatations = [ m[0] for m in mutatations_copy.values() if tp == m[0].row_key[0] ] - response = self.bt_table.mutate_rows(mutatations) - for i, status in enumerate(response): - if status.code != 0: - self.log.error(f"Row number {i} failed to write") - else: - self._mutations.pop(mutatations[i].row_key) - self.log.info(f"BigTableStore: flushed {len(mutatations)} rows") - return True - else: - return False + if len(mutatations) > 0: + response = self.bt_table.mutate_rows(mutatations) + for i, status in enumerate(response): + if status.code != 0: + self.log.error(f"Row number {i} failed to write") + else: + self._mutations.pop(mutatations[i].row_key) + self.log.info(f"BigTableStore: flushed {len(mutatations)} rows") + flushed = True + self._last_flush = now + return flushed def _set_mutation(self, bt_key: bytes, value: Optional[bytes]): if bt_key in self._mutations.keys(): From 4f7b6ec284a6e40c5b38a67ad8e6d1a450b275af Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 29 Nov 2022 11:18:49 +0100 Subject: [PATCH 281/616] fixed flush again --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e44cb94be..16a8d5ec9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -195,7 +195,7 @@ def flush_if_timer_over(self, tp: TP) -> bool: if now >= self._last_flush + self._mut_freq: mutatations_copy = self._mutations.copy() mutatations = [ - m[0] for m in mutatations_copy.values() if tp == m[0].row_key[0] + m[0] for m in mutatations_copy.values() if tp.partition == m[0].row_key[0] ] if len(mutatations) > 0: response = self.bt_table.mutate_rows(mutatations) From f48d332fa563e89abf1bb4c66d974f17828747df Mon Sep 17 00:00:00 2001 From: aoberegg Date: Wed, 30 Nov 2022 14:57:36 +0100 Subject: [PATCH 282/616] pin mode as long we didn't switch to newest faust version that supports python 3.11 --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index fc78ba658..e2e21d41e 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,7 +1,7 @@ aiohttp>=3.5.2,<4.0 aiohttp_cors>=0.7,<2.0 click>=6.7,<8.1 -mode-streaming>=0.2.0 +mode-streaming==0.2.1 aiokafka @ git+https://github.com/smaxtec/aiokafka-1@master#egg=aiokafka #aiokafka>=0.7.1,<0.8.0 opentracing>=1.3.0,<=2.4.0 From 3eadec1554cf6b4a9179ff70643d2551717dc5e3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 2 Dec 2022 08:38:46 +0100 Subject: [PATCH 283/616] removed log --- faust/stores/bigtable.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 16a8d5ec9..69bb21a48 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -8,9 +8,9 @@ from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance +from google.cloud.bigtable.row import DirectRow from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet -from google.cloud.bigtable.row import DirectRow from google.cloud.bigtable.table import Table from mode.utils.collections import LRUCache from yarl import URL @@ -204,7 +204,6 @@ def flush_if_timer_over(self, tp: TP) -> bool: self.log.error(f"Row number {i} failed to write") else: self._mutations.pop(mutatations[i].row_key) - self.log.info(f"BigTableStore: flushed {len(mutatations)} rows") flushed = True self._last_flush = now return flushed From e88ee4c90115f0a915787069f95cf28e089fd3ea Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 2 Dec 2022 14:55:43 +0100 Subject: [PATCH 284/616] added first tests (current status 35% coverage) --- tests/unit/stores/test_bigtable.py | 325 +++++++++++++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 tests/unit/stores/test_bigtable.py diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py new file mode 100644 index 000000000..e59ef4358 --- /dev/null +++ b/tests/unit/stores/test_bigtable.py @@ -0,0 +1,325 @@ +import sys +from unittest.mock import MagicMock, patch + +import pytest + +import faust +from faust.stores.bigtable import ( + BigTableCacheManager, + BigTableStore, + BigTableValueCache, +) + + +class TestBigTableStore: + TEST_KEY1 = b"TEST_KEY1" + TEST_KEY2 = b"TEST_KEY2" + TEST_KEY3 = b"TEST_KEY3" + + @pytest.fixture() + def bt_imports(self): + # We will mock rowsets in a way that it is just a + # list with all requested keys, so that we then just call + # read_row of the mocked bigtable multiple times + class RowSetMock(): + def __init__(self) -> None: + self.keys = set() + self.add_row_key = MagicMock(wraps=self._add_row_key) + + def _add_row_key(self, key): + self.keys.add(key) + + with patch("faust.stores.bigtable.BT") as bt: + bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") + bt.column_family.MaxVersionsGCRule = MagicMock( + return_value="a_rule" + ) + bt.RowSet = MagicMock(return_value=RowSetMock()) + yield bt + + @pytest.mark.asyncio + async def test_bigtable_set_options_default(self, bt_imports): + self_mock = MagicMock() + bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") + + BigTableStore._set_options(self_mock, options={}) + assert self_mock.column_name == "DATA" + assert self_mock.offset_key_prefix == "offset_partitiion:" + assert self_mock.row_filter == "a_filter" + + @pytest.mark.asyncio + async def test_bigtable_set_options(self, bt_imports): + self_mock = MagicMock() + bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") + bt_imports.column_family = MagicMock(return_value=MagicMock()) + name_lambda = lambda x: print(x) # noqa + options = { + BigTableStore.BT_TABLE_NAME_GENERATOR_KEY: name_lambda, + BigTableStore.BT_OFFSET_KEY_PREFIX: "offset_test", + BigTableStore.BT_COLUMN_NAME_KEY: "name_test", + } + BigTableStore._set_options(self_mock, options) + assert self_mock.column_name == "name_test" + assert self_mock.offset_key_prefix == "offset_test" + assert self_mock.row_filter == "a_filter" + assert self_mock.table_name_generator == name_lambda + + @pytest.mark.asyncio + async def test_bigtable_setup(self, bt_imports): + self_mock = MagicMock() + + faust_table_mock = MagicMock() + faust_table_mock.name = MagicMock(return_value="ABC") + + def table_name_gen(table): + return table.name[::-1] + + self_mock.table_name_generator = table_name_gen + self_mock.bt_table_name = self_mock.table_name_generator( + faust_table_mock + ) + + client_mock = MagicMock() + instance_mock = MagicMock() + table_mock = MagicMock() + + client_mock.instance = MagicMock(return_value=instance_mock) + instance_mock.table = MagicMock(return_value=table_mock) + table_mock.exists = MagicMock(return_value=True) + table_mock.create = MagicMock() + + bt_imports.Client = MagicMock(return_value=client_mock) + options = {} + options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" + options[BigTableStore.BT_PROJECT_KEY] = "bt_project" + + return_value = BigTableStore._bigtable_setup( + self_mock, faust_table_mock, options + ) + bt_imports.Client.assert_called_once_with( + options[BigTableStore.BT_PROJECT_KEY], admin=True + ) + client_mock.instance.assert_called_once_with( + options[BigTableStore.BT_INSTANCE_KEY] + ) + + instance_mock.table.assert_called_once_with(self_mock.bt_table_name) + table_mock.create.assert_not_called() + assert self_mock.column_family_id == "FaustColumnFamily" + assert return_value is None + + # Test with no existing table + self_mock.reset_mock() + self_mock.table_name_generator = table_name_gen + self_mock.bt_table_name = self_mock.table_name_generator( + faust_table_mock + ) + table_mock.exists = MagicMock(return_value=False) + return_value = BigTableStore._bigtable_setup( + self_mock, faust_table_mock, options + ) + instance_mock.table.assert_called_once_with(self_mock.bt_table_name) + table_mock.create.assert_called_once_with( + column_families={self_mock.column_family_id: "a_rule"} + ) + assert self_mock.column_family_id == "FaustColumnFamily" + assert return_value is None + + @pytest.fixture() + def store(self, bt_imports): + class BigTableMock: + def __init__(self) -> None: + self.data = {} + self.read_row = MagicMock(wraps=self._read_row) + self.read_rows = MagicMock(wraps=self._read_rows) + + def _read_row(self, key: bytes, **kwargs): + res = self.data.get(key, None) + cell_wrapper = MagicMock() + cell_wrapper.value = res + row_wrapper = [cell_wrapper] + if res is None: + return res + row = MagicMock() + row.row_key = key + row.to_dict = MagicMock(return_value={"x": row_wrapper}) + return row + + def _read_rows(self, row_set, **kwargs): + for k in row_set.keys: + res = self._read_row(k) + if res is None: + continue + else: + yield res + + def add_test_data(self, keys): + for k in keys: + self.data[k] = k + + with patch("faust.stores.bigtable.BT", bt_imports): + options = {} + options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" + options[BigTableStore.BT_PROJECT_KEY] = "bt_project" + store = BigTableStore( + "bigtable://", MagicMock(), MagicMock(), options=options + ) + store.bt_table = BigTableMock() + return store + + def test_bigtable_bigtable_get_on_empty(self, store): + store._cache.get = MagicMock(return_value=None) + return_value = store._bigtable_get(self.TEST_KEY1) + store.bt_table.read_row.assert_called_once_with( + self.TEST_KEY1, filter_="a_filter" + ) + store._cache.get.assert_called_once_with(self.TEST_KEY1) + assert return_value is None + + def test_bigtable_bigtable_get_cache_miss(self, store): + store._cache.get = MagicMock(return_value=None) + store.bt_table.add_test_data([self.TEST_KEY1]) + return_value = store._bigtable_get(self.TEST_KEY1) + store._cache.get.assert_called_once_with(self.TEST_KEY1) + store.bt_table.read_row.assert_called_once_with( + self.TEST_KEY1, filter_="a_filter" + ) + assert return_value == self.TEST_KEY1 + + def test_bigtable_bigtable_get_cache_hit(self, store): + store.bt_table.add_test_data([self.TEST_KEY1]) + store._cache.get = MagicMock(return_value=b"cache_res") + return_value = store._bigtable_get(self.TEST_KEY1) + store._cache.get.assert_called_once_with(self.TEST_KEY1) + store.bt_table.read_row.assert_not_called() + assert return_value == b"cache_res" + + def test_bigtable_get_range_cache_miss(self, store): + store._cache.get = MagicMock(return_value=None) + + test_keys_in = [self.TEST_KEY1, self.TEST_KEY3] # order is important + test_keys_not_in = {self.TEST_KEY2, } + + return_value = store._bigtable_get_range(test_keys_not_in) + store.bt_table.read_rows.assert_called() + store.bt_table.read_rows.reset_mock() + assert return_value == (None, None) + + store.bt_table.add_test_data(test_keys_in) + return_value = store._bigtable_get_range(test_keys_in) + store.bt_table.read_rows.assert_called() + store.bt_table.read_rows.reset_mock() + assert return_value == (self.TEST_KEY1, self.TEST_KEY1) + + def test_bigtable_get_range_cache_hit(self, store): + store._cache.get = MagicMock(return_value="cache_res") + result_value = store._bigtable_get_range([self.TEST_KEY1, self.TEST_KEY3]) + store.bt_table.read_rows.assert_not_called + assert result_value == (self.TEST_KEY1, "cache_res") + + def test_bigtable_contains(self, store): + store._cache.contains = MagicMock(return_value=None) + store._cache.delete = MagicMock(return_value=None) + + store.bt_table.add_test_data([self.TEST_KEY1]) + return_value = store._bigtable_contains(self.TEST_KEY1) + store.bt_table.read_row.assert_called_with(self.TEST_KEY1, filter_="a_filter") + store._cache.delete.assert_not_called() + assert return_value is True + + return_value = store._bigtable_contains(self.TEST_KEY2) + store.bt_table.read_row.assert_called_with(self.TEST_KEY2, filter_="a_filter") + store._cache.delete.assert_called_with(self.TEST_KEY2) + + store._cache.delete.reset_mock() + store.bt_table.read_row.reset_mock() + + store._cache.contains = MagicMock(return_value=True) + return_value = store._bigtable_contains(self.TEST_KEY1) + store.bt_table.read_row.assert_not_called() + store._cache.delete.assert_not_called() + assert return_value is True + + store._cache.contains = MagicMock(return_value=False) + return_value = store._bigtable_contains(self.TEST_KEY1) + store.bt_table.read_row.assert_not_called() + store._cache.delete.assert_not_called() + assert return_value is False + + def test_bigtable_contains_any(self, store): + store.bt_table.add_test_data([self.TEST_KEY1]) + store._cache.contains_any = MagicMock(return_value=None) + + test_keys_in = {self.TEST_KEY1, self.TEST_KEY3} + test_keys_not_in = {self.TEST_KEY2, } + + return_value = store._bigtable_contains_any(test_keys_not_in) + store.bt_table.read_rows.assert_called() + store.bt_table.read_rows.reset_mock() + assert return_value is False + + return_value = store._bigtable_contains_any(test_keys_in) + store.bt_table.read_rows.assert_called() + store.bt_table.read_rows.reset_mock() + assert return_value is True + + store._cache.contains_any = MagicMock(return_value=True) + return_value = store._bigtable_contains_any(test_keys_not_in) + store.bt_table.read_rows.assert_not_called() + assert return_value == store._cache.contains_any() + + def test_bigtable_delete(self, store): + row_mock = MagicMock() + row_mock.commit = MagicMock() + row_mock.delete = MagicMock() + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store._cache.delete = MagicMock(return_value=None) + + store._bigtable_del(self.TEST_KEY1) + + store.bt_table.direct_row.assert_called_once_with(self.TEST_KEY1) + store._cache.delete.assert_called_once_with(self.TEST_KEY1) + row_mock.delete.assert_called_once() + row_mock.commit.assert_called_once() + + def test_bigtable_set(self, store): + row_mock = MagicMock() + row_mock.set_cell = MagicMock() + row_mock.commit = MagicMock() + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store._cache.set = MagicMock(return_value=None) + + store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) + store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1, persist_offset=True) + + store.bt_table.direct_row.assert_called_once_with(self.TEST_KEY1) + store._cache.set.assert_called_once_with(self.TEST_KEY1, self.TEST_KEY1) + row_mock.set_cell.assert_called_once_with( + store.column_family_id, + store.column_name, + self.TEST_KEY1, + ) + row_mock.commit.assert_called_once() + + def test_maybe_get_partition_from_message(self, store): + event_mock = MagicMock() + event_mock.message = MagicMock() + event_mock.message.partition = 69 + current_event_mock = MagicMock(return_value=event_mock) + + store.table.is_global = False + store.table.use_partitioner = False + with patch("faust.stores.bigtable.current_event", current_event_mock): + return_value = store._maybe_get_partition_from_message() + assert return_value == 69 + + store.table.is_global = True + with patch("faust.stores.bigtable.current_event", current_event_mock): + return_value = store._maybe_get_partition_from_message() + assert return_value is None + + store.table.is_global = False + current_event_mock = MagicMock(return_value=None) + with patch("faust.stores.bigtable.current_event", current_event_mock): + return_value = store._maybe_get_partition_from_message() + assert return_value is None From ad0c10031a8c2c6df6e65c33f2924b00b6ff3f4f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 2 Dec 2022 15:27:57 +0100 Subject: [PATCH 285/616] adjusted imports to faust standard --- faust/stores/bigtable.py | 69 ++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 69bb21a48..59b7763b6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -5,13 +5,28 @@ import traceback from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union -from google.cloud.bigtable import column_family -from google.cloud.bigtable.client import Client -from google.cloud.bigtable.instance import Instance -from google.cloud.bigtable.row import DirectRow -from google.cloud.bigtable.row_filters import CellsColumnLimitFilter -from google.cloud.bigtable.row_set import RowSet -from google.cloud.bigtable.table import Table +try: # pragma: no cover + from google.cloud.bigtable import column_family + from google.cloud.bigtable.client import Client + from google.cloud.bigtable.instance import Instance + from google.cloud.bigtable.row import DirectRow + from google.cloud.bigtable.row_filters import CellsColumnLimitFilter + from google.cloud.bigtable.row_set import RowSet + from google.cloud.bigtable.table import Table + + # Make one container for all imported functions + # This is needed for testing and controlling the imports + class BT: + column_family = column_family + Client = Client + Instance = Instance + DirectRow = DirectRow + CellsColumnLimitFilter = CellsColumnLimitFilter + RowSet = RowSet + Table = Table +except ImportError: # pragma: no cover + BT = None # noqa + from mode.utils.collections import LRUCache from yarl import URL @@ -80,11 +95,11 @@ class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[BigTableValueCache] _key_cache: Optional[Set] - _mutations: Dict[bytes, Tuple[DirectRow, Optional[bytes]]] + _mutations: Dict[bytes, Tuple[BT.DirectRow, Optional[bytes]]] - def __init__(self, app, options: Dict, bt_table: Table) -> None: + def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) - self.bt_table: Table = bt_table + self.bt_table: BT.Table = bt_table self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_key_cache(options) @@ -102,7 +117,7 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): if len(partitions_to_fill) == 0: return - row_set = RowSet() + row_set = BT.RowSet() for partition in partitions_to_fill: row_set.add_row_range_from_keys( start_key=chr(partition), end_key=chr(partition + 1) @@ -114,13 +129,13 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): ) if self._value_cache is not None: for row in self.bt_table.read_rows( - row_set=row_set, filter_=CellsColumnLimitFilter(1) + row_set=row_set, filter_=BT.CellsColumnLimitFilter(1) ): value = BigTableStore.bigtable_exrtact_row_data(row) self._value_cache[row.row_key] = value elif self._key_cache is not None: for row in self.bt_table.read_rows( - row_set=row_set, filter_=CellsColumnLimitFilter(1) + row_set=row_set, filter_=BT.CellsColumnLimitFilter(1) ): self._key_cache.add(row.row_key) self._filled_partitions.update(partitions_to_fill) @@ -261,9 +276,9 @@ def _init_mutation_buffer(self, options): class BigTableStore(base.SerializedStore): """Bigtable table storage.""" - client: Client - instance: Instance - bt_table: Table + client: BT.Client + instance: BT.Instance + bt_table: BT.Table _cache: BigTableCacheManager partition_prefix = b"__" @@ -302,21 +317,21 @@ def _set_options(self, options) -> None: self.column_name = options.get( BigTableStore.BT_COLUMN_NAME_KEY, "DATA" ) - self.row_filter = CellsColumnLimitFilter(1) + self.row_filter = BT.CellsColumnLimitFilter(1) self.offset_key_prefix = options.get( BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" ) def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) - self.client: Client = Client( + self.client: BT.Client = BT.Client( options.get(BigTableStore.BT_PROJECT_KEY), admin=True, ) - self.instance: Instance = self.client.instance( + self.instance: BT.Instance = self.client.instance( options.get(BigTableStore.BT_INSTANCE_KEY) ) - self.bt_table: Table = self.instance.table(self.bt_table_name) + self.bt_table: BT.Table = self.instance.table(self.bt_table_name) self.column_family_id = "FaustColumnFamily" if not self.bt_table.exists(): logging.getLogger(__name__).info( @@ -325,7 +340,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): ) self.bt_table.create( column_families={ - self.column_family_id: column_family.MaxVersionsGCRule(1) + self.column_family_id: BT.column_family.MaxVersionsGCRule(1) } ) else: @@ -368,12 +383,12 @@ def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: if cache_contains is not None: return cache_contains - rows = RowSet() + rows = BT.RowSet() for key in keys: rows.add_row_key(key) for _row in self.bt_table.read_rows( - row_set=rows, filter_=CellsColumnLimitFilter(1) + row_set=rows, filter_=BT.CellsColumnLimitFilter(1) ): # First hit will return return True @@ -388,12 +403,12 @@ def _bigtable_get_range( if value is not None: return key, value - rows = RowSet() + rows = BT.RowSet() for key in keys: rows.add_row_key(key) for row in self.bt_table.read_rows( - row_set=rows, filter_=CellsColumnLimitFilter(1) + row_set=rows, filter_=BT.CellsColumnLimitFilter(1) ): # First hit will return val = self.bigtable_exrtact_row_data(row) @@ -535,7 +550,7 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - row_set = RowSet() + row_set = BT.RowSet() for partition in self._active_partitions(): prefix_start = self._get_partition_prefix(partition) prefix_end = self._get_partition_prefix(partition + 1) @@ -569,7 +584,7 @@ def _iterkeys(self) -> Iterator[bytes]: for k in self._cache._value_cache.keys(): yield self._remove_partition_prefix(k) else: - row_set = RowSet() + row_set = BT.RowSet() for partition in partitions: prefix_start = self._get_partition_prefix(partition) prefix_end = self._get_partition_prefix(partition + 1) From 2012390db048bf32743364c5959ded3e72c08588 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 5 Dec 2022 16:27:00 +0100 Subject: [PATCH 286/616] implemented all tests for bigtable --- tests/unit/stores/test_bigtable.py | 439 ++++++++++++++++++++++++++++- 1 file changed, 424 insertions(+), 15 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index e59ef4358..78b99f984 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1,5 +1,4 @@ -import sys -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch import pytest @@ -9,6 +8,8 @@ BigTableStore, BigTableValueCache, ) +from faust.types.events import EventT +from faust.types.tuples import TP class TestBigTableStore: @@ -21,14 +22,20 @@ def bt_imports(self): # We will mock rowsets in a way that it is just a # list with all requested keys, so that we then just call # read_row of the mocked bigtable multiple times - class RowSetMock(): + class RowSetMock: def __init__(self) -> None: self.keys = set() self.add_row_key = MagicMock(wraps=self._add_row_key) + self.add_row_range_from_keys = MagicMock( + wraps=self._add_row_range_from_keys + ) def _add_row_key(self, key): self.keys.add(key) + def _add_row_range_from_keys(self, start: bytes, end: bytes): + self.keys.add(b"".join([start, b"_*_", end])) + with patch("faust.stores.bigtable.BT") as bt: bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") bt.column_family.MaxVersionsGCRule = MagicMock( @@ -147,11 +154,19 @@ def _read_row(self, key: bytes, **kwargs): def _read_rows(self, row_set, **kwargs): for k in row_set.keys: - res = self._read_row(k) - if res is None: + res = None + if b"_*_" in k: + for key in self.data.keys(): + start, end = k.split(b"_*_") + if start <= key < end: + yield self._read_row(key) continue else: - yield res + res = self._read_row(k) + if res is None: + continue + else: + yield res def add_test_data(self, keys): for k in keys: @@ -197,8 +212,10 @@ def test_bigtable_bigtable_get_cache_hit(self, store): def test_bigtable_get_range_cache_miss(self, store): store._cache.get = MagicMock(return_value=None) - test_keys_in = [self.TEST_KEY1, self.TEST_KEY3] # order is important - test_keys_not_in = {self.TEST_KEY2, } + test_keys_in = [self.TEST_KEY1, self.TEST_KEY3] # order is important + test_keys_not_in = { + self.TEST_KEY2, + } return_value = store._bigtable_get_range(test_keys_not_in) store.bt_table.read_rows.assert_called() @@ -209,11 +226,16 @@ def test_bigtable_get_range_cache_miss(self, store): return_value = store._bigtable_get_range(test_keys_in) store.bt_table.read_rows.assert_called() store.bt_table.read_rows.reset_mock() - assert return_value == (self.TEST_KEY1, self.TEST_KEY1) + assert return_value == ( + self.TEST_KEY1, + self.TEST_KEY1, + ) or return_value == ((self.TEST_KEY3, self.TEST_KEY3)) def test_bigtable_get_range_cache_hit(self, store): store._cache.get = MagicMock(return_value="cache_res") - result_value = store._bigtable_get_range([self.TEST_KEY1, self.TEST_KEY3]) + result_value = store._bigtable_get_range( + [self.TEST_KEY1, self.TEST_KEY3] + ) store.bt_table.read_rows.assert_not_called assert result_value == (self.TEST_KEY1, "cache_res") @@ -223,12 +245,16 @@ def test_bigtable_contains(self, store): store.bt_table.add_test_data([self.TEST_KEY1]) return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_called_with(self.TEST_KEY1, filter_="a_filter") + store.bt_table.read_row.assert_called_with( + self.TEST_KEY1, filter_="a_filter" + ) store._cache.delete.assert_not_called() assert return_value is True return_value = store._bigtable_contains(self.TEST_KEY2) - store.bt_table.read_row.assert_called_with(self.TEST_KEY2, filter_="a_filter") + store.bt_table.read_row.assert_called_with( + self.TEST_KEY2, filter_="a_filter" + ) store._cache.delete.assert_called_with(self.TEST_KEY2) store._cache.delete.reset_mock() @@ -251,7 +277,9 @@ def test_bigtable_contains_any(self, store): store._cache.contains_any = MagicMock(return_value=None) test_keys_in = {self.TEST_KEY1, self.TEST_KEY3} - test_keys_not_in = {self.TEST_KEY2, } + test_keys_not_in = { + self.TEST_KEY2, + } return_value = store._bigtable_contains_any(test_keys_not_in) store.bt_table.read_rows.assert_called() @@ -290,10 +318,14 @@ def test_bigtable_set(self, store): store._cache.set = MagicMock(return_value=None) store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) - store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1, persist_offset=True) + store._bigtable_set( + self.TEST_KEY1, self.TEST_KEY1, persist_offset=True + ) store.bt_table.direct_row.assert_called_once_with(self.TEST_KEY1) - store._cache.set.assert_called_once_with(self.TEST_KEY1, self.TEST_KEY1) + store._cache.set.assert_called_once_with( + self.TEST_KEY1, self.TEST_KEY1 + ) row_mock.set_cell.assert_called_once_with( store.column_family_id, store.column_name, @@ -323,3 +355,380 @@ def test_maybe_get_partition_from_message(self, store): with patch("faust.stores.bigtable.current_event", current_event_mock): return_value = store._maybe_get_partition_from_message() assert return_value is None + + def test_get_partition_prefix(self, store): + partition = 0 + res = store._get_partition_prefix(partition) + assert res[0] == partition + assert res[1:] == store.partition_prefix + + partition = 19 + res = store._get_partition_prefix(partition) + assert res[0] == partition + assert res[1:] == store.partition_prefix + + def test_remove_partition_prefix(self, store): + store.partition_prefix = b"abc" + key_with_partition = b"abcTHEACTUALKEY" + res = store._remove_partition_prefix(key_with_partition) + assert res == b"THEACTUALKEY" + + def test_get_key_with_partition(self, store): + partition = 19 + res = store._get_key_with_partition(self.TEST_KEY1, partition) + assert res[0] == partition + assert store._remove_partition_prefix(res) == self.TEST_KEY1 + + def test_partitions_for_key(self, store): + store._cache.get_partition = MagicMock(return_value=19) + res = store._partitions_for_key(self.TEST_KEY1) + store._cache.get_partition.assert_called_once_with(self.TEST_KEY1) + assert res == [19] + + store._cache.get_partition = MagicMock(side_effect=KeyError) + store._active_partitions = MagicMock(return_value=[1, 2, 3]) + res = store._partitions_for_key(self.TEST_KEY2) + store._cache.get_partition.assert_called_once_with(self.TEST_KEY2) + assert res == [1, 2, 3] + + def test_get_with_known_partition(self, store): + partition = 19 + store._maybe_get_partition_from_message = MagicMock( + return_value=partition + ) + store._cache.set_partition = MagicMock() + # Scenario: Found + store._bigtable_get = MagicMock(return_value=b"a_value") + res = store._get(self.TEST_KEY1) + key_with_partition = store._get_key_with_partition( + self.TEST_KEY1, partition + ) + store._bigtable_get.assert_called_once_with(key_with_partition) + store._cache.set_partition.assert_called_once_with( + self.TEST_KEY1, partition + ) + assert res == b"a_value" + + store._cache.set_partition.reset_mock() + # Scenario: Not Found + store._bigtable_get = MagicMock(return_value=None) + res = store._get(self.TEST_KEY1) + key_with_partition = store._get_key_with_partition( + self.TEST_KEY1, partition + ) + store._bigtable_get.assert_called_once_with(key_with_partition) + store._cache.set_partition.assert_not_called() + assert res is None + + def test_get_with_unknown_partition(self, store): + store._maybe_get_partition_from_message = MagicMock(return_value=None) + store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) + store._cache.set_partition = MagicMock() + keys_searched = set() + keys_searched.add(store._get_key_with_partition(self.TEST_KEY1, 1)) + keys_searched.add(store._get_key_with_partition(self.TEST_KEY1, 3)) + keys_searched.add(store._get_key_with_partition(self.TEST_KEY1, 19)) + + # Scenario: Found + key_of_value = store._get_key_with_partition(self.TEST_KEY1, 19) + store._bigtable_get_range = MagicMock( + return_value=(key_of_value, b"a_value") + ) + res = store._get(self.TEST_KEY1) + store._partitions_for_key.assert_called_once_with(self.TEST_KEY1) + store._bigtable_get_range.assert_called_once_with(keys_searched) + store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, 19) + assert res == b"a_value" + + store._cache.set_partition.reset_mock() + # Scenario: Not Found + store._bigtable_get_range = MagicMock(return_value=(None, None)) + res = store._get(self.TEST_KEY1) + store._bigtable_get_range.assert_called_once_with(keys_searched) + store._cache.set_partition.assert_not_called() + assert res is None + + def test_set(self, store): + partition = 19 + faust.stores.bigtable.get_current_partition = MagicMock( + return_value=partition + ) + store._bigtable_set = MagicMock() + store._cache.set_partition = MagicMock() + store._set(self.TEST_KEY1, b"a_value") + key_with_partition = store._get_key_with_partition( + self.TEST_KEY1, partition + ) + store._bigtable_set.assert_called_once_with( + key_with_partition, b"a_value" + ) + store._cache.set_partition.assert_called_once_with( + self.TEST_KEY1, partition + ) + + def test_del(self, store): + store._cache._partition_cache = {self.TEST_KEY1: 19} + store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) + store._bigtable_del = MagicMock() + store._del(self.TEST_KEY1) + calls = [ + call(store._get_key_with_partition(self.TEST_KEY1, 1)), + call(store._get_key_with_partition(self.TEST_KEY1, 3)), + call(store._get_key_with_partition(self.TEST_KEY1, 19)), + ] + store._bigtable_del.assert_has_calls(calls) + assert store._cache._partition_cache == {} + + def test_active_partitions(self, store): + active_topics = [ + TP("a_changelogtopic", 19), + TP("a_different_chaneglogtopic", 19), + ] + store.app.assignor.assigned_actives = MagicMock( + return_value=active_topics + ) + store.app.conf.topic_partitions = 20 + store.table.changelog_topic_name = "a_changelogtopic" + store.table.is_global = False + + # Scenario: No global table + res = store._active_partitions() + all_res = list(res) + assert all_res == [19] + + # Scenario: Global table + store.table.is_global = True + res = store._active_partitions() + all_res = list(res) + assert list(range(store.app.conf.topic_partitions)) == all_res + + def test_iteritems(self, store): + keys_in_store = [] + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + + store.bt_table.add_test_data(keys_in_store) + store._active_partitions = MagicMock(return_value=[1, 3]) + all_res = sorted(store._iteritems()) + assert all_res == [ + (self.TEST_KEY1, keys_in_store[0]), + (self.TEST_KEY3, keys_in_store[2]), + ] + + def test_iterkeys_with_complete_cache(self, store): + store._cache.is_complete = True + store._active_partitions = MagicMock(return_value=[1, 3]) + store._cache._fill_if_empty = MagicMock() + + keys_in_cache = [] + keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY1, 1)) + keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY2, 2)) + keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + + store._cache._value_cache = MagicMock() + store._cache._value_cache.keys = MagicMock(return_value=keys_in_cache) + all_res = sorted(store._iterkeys()) + store._cache._fill_if_empty.assert_called_once_with( + { + store._get_partition_prefix(1), + store._get_partition_prefix(3), + } + ) + assert all_res == [ + self.TEST_KEY1, + self.TEST_KEY2, + self.TEST_KEY3, + ] + + def test_iterkeys_with_no_complete_cache(self, store): + store._cache.is_complete = False + store._active_partitions = MagicMock(return_value=[1, 3]) + store._cache._fill_if_empty = MagicMock() + keys_in_store = [] + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + store.bt_table.add_test_data(keys_in_store) + + all_res = sorted(store._iterkeys()) + store._cache._fill_if_empty.assert_not_called() + assert all_res == [ + self.TEST_KEY1, + self.TEST_KEY3, + ] + + def test_iteritems(self, store): + keys_in_store = [] + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + + store.bt_table.add_test_data(keys_in_store) + store._active_partitions = MagicMock(return_value=[1, 3]) + all_res = sorted(store._itervalues()) + assert all_res == [keys_in_store[0], keys_in_store[2]] + + def test_size(self, store): + assert 0 == store._size() + + def test_contains_without_store_check_exists(self, store): + store._bigtable_contains = MagicMock() + store._bigtable_contains_any = MagicMock() + store.app.conf.store_check_exists = False + + res = store._contains(self.TEST_KEY1) + + assert res is True + store._bigtable_contains_any.assert_not_called() + store._bigtable_contains.assert_not_called() + + def test_contains_with_known_partition(self, store): + store.app.conf.store_check_exists = True + store._bigtable_contains_any = MagicMock() + store._maybe_get_partition_from_message = MagicMock(return_value=19) + + # Scenario1: Found + store._bigtable_contains = MagicMock(return_value="TRUE_OR_FALSE") + key_w_partition = store._get_key_with_partition(self.TEST_KEY1, 19) + res = store._contains(self.TEST_KEY1) + store._bigtable_contains.assert_called_once_with(key_w_partition) + assert res == "TRUE_OR_FALSE" + + def test_contains_with_unknown_partition(self, store): + store.app.conf.store_check_exists = True + store._bigtable_contains_any = MagicMock() + store._maybe_get_partition_from_message = MagicMock(return_value=None) + store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) + + store._bigtable_contains_any = MagicMock(return_value="TRUE_OR_FALSE") + keys_to_search = set() + keys_to_search.add(store._get_key_with_partition(self.TEST_KEY1, 1)) + keys_to_search.add(store._get_key_with_partition(self.TEST_KEY1, 3)) + keys_to_search.add(store._get_key_with_partition(self.TEST_KEY1, 19)) + + res = store._contains(self.TEST_KEY1) + + store._bigtable_contains_any.assert_called_once_with(keys_to_search) + assert res == "TRUE_OR_FALSE" + + def test_get_offset_key(self, store): + tp = TP("AAAA", 19) + assert store.get_offset_key(tp)[-2:] == "19" + + def test_persisted_offset(self, store): + tp = TP("AAAA", 19) + store.get_offset_key = MagicMock(return_value=123) + store.bt_table.add_test_data([123]) + assert store.persisted_offset(tp) == 123 + + def test_set_persisted_offset(self, store): + tp = TP("a_topic", 19) + + store._bigtable_set = MagicMock() + + # Scenario 0: No recovery && no flush + recovery = False + store._cache.flush_if_timer_over = MagicMock(return_value=False) + expected_offset_key = store.get_offset_key(tp).encode() + store.set_persisted_offset(tp, 123, recovery=recovery) + store._bigtable_set.assert_not_called() + + # Scenario 1: Recovery + recovery = True + store._cache.flush_if_timer_over = MagicMock(return_value=False) + expected_offset_key = store.get_offset_key(tp).encode() + store.set_persisted_offset(tp, 123, recovery=recovery) + store._bigtable_set.assert_called_once_with( + expected_offset_key, str(123).encode(), persist_offset=True + ) + + # Scenario 2: Mutattion buffer flush + recovery = False + store._cache.flush_if_timer_over = MagicMock(return_value=True) + expected_offset_key = store.get_offset_key(tp).encode() + store.set_persisted_offset(tp, 123, recovery=recovery) + store._bigtable_set.assert_called_with( + expected_offset_key, str(123).encode(), persist_offset=True + ) + + def test_persist_changelog_batch(self, store): + class TestResponse: + def __init__(self, code) -> None: + self.code = code + + # Scenario 1: no failure + store.bt_table.mutate_rows = MagicMock( + return_value=[TestResponse(0)] * 10 + ) + store.log = MagicMock() + store.log.error = MagicMock() + store.set_persisted_offset = MagicMock() + tp1 = TP("offset1", 10) + tp2 = TP("offset2", 10) + tp3 = TP("offset3", 10) + offset_batch = { + tp1: 111, + tp2: 222, + tp3: 333, + } + store._persist_changelog_batch( + ["row1", "row2", "etc..."], offset_batch + ) + store.bt_table.mutate_rows.assert_called_with( + ["row1", "row2", "etc..."] + ) + + assert store.set_persisted_offset.call_count == len(offset_batch) + store.set_persisted_offset.assert_called_with(tp3, 333, recovery=True) + store.log.error.assert_not_called() + + # Scenario 2: all failure + store.set_persisted_offset.reset_mock() + store.bt_table.mutate_rows.reset_mock() + store.bt_table.mutate_rows = MagicMock( + return_value=[TestResponse(404)] + ) + store._persist_changelog_batch( + ["row1", "row2", "etc..."], offset_batch + ) + # FIXME: I'm not sure if we want that behaviour. + # Question: What should happen on a failed mutated row in recovery. + store.set_persisted_offset.assert_called() + store.log.error.assert_called() + + def test_apply_changelog_batch(self, store): + row_mock = MagicMock() + row_mock.delete = MagicMock() + row_mock.set_cell = MagicMock() + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store.bt_table.mutate_rows = MagicMock() + store._persist_changelog_batch = MagicMock() + + class TestMessage: + def __init__(self, value, key, tp, offset): + self.value = value + self.key = key + self.tp = tp + self.offset = offset + + class TestEvent: + def __init__(self, message): + self.message = message + + tp = TP("a", 19) + tp2 = TP("b", 19) + messages = [ + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 0)), + TestEvent(TestMessage(None, self.TEST_KEY1, tp, 1)), # Delete + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 3)), # Out of order + TestEvent(TestMessage("b", self.TEST_KEY2, tp2, 4)), + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 2)), + ] + store.apply_changelog_batch(messages, lambda x: x, lambda x: x) + assert store.bt_table.direct_row.call_count == 5 + row_mock.delete.assert_called_once() + assert row_mock.set_cell.call_count == 4 + store._persist_changelog_batch.assert_called_once() + tp_offsets = store._persist_changelog_batch.call_args_list[0].args[1] + assert tp_offsets == {tp: 3, tp2: 4} From 908bf6bb09eff772b4d58afaad58da0a77d98b7c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 5 Dec 2022 17:02:42 +0100 Subject: [PATCH 287/616] added tests for value cache --- tests/unit/stores/test_bigtable.py | 63 +++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 78b99f984..6dd4de38e 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1,4 +1,6 @@ +import time from unittest.mock import MagicMock, call, patch +from mode.utils.collections import LRUCache import pytest @@ -8,10 +10,68 @@ BigTableStore, BigTableValueCache, ) -from faust.types.events import EventT from faust.types.tuples import TP +class TestBigTableValueCache: + def test_init(self): + # Test defaults + cache = BigTableValueCache() + assert cache.data == {} + assert cache.ttl == -1 + assert cache.ttl_over is False + + # Test with custom size + cache = BigTableValueCache(size=123) + assert isinstance(cache.data, LRUCache) + assert cache.data.limit == 123 + + def test__set_del_len_and_getitem(self): + cache = BigTableValueCache() + # Scenario ttl not over and no clear + cache._maybe_ttl_clear = MagicMock() + assert len(cache) == 0 + cache["123"] = 123 + assert cache._maybe_ttl_clear.call_count == 1 + assert len(cache) == 1 + assert cache["123"] == 123 + assert cache._maybe_ttl_clear.call_count == 2 + del cache["123"] + assert cache._maybe_ttl_clear.call_count == 2 + assert len(cache) == 0 + + def test__set_del_len_and_getitem_after_tttl(self): + cache = BigTableValueCache() + # Scenario ttl over and clear + cache._maybe_ttl_clear = MagicMock() + cache.ttl_over = True + assert len(cache) == 0 + cache["123"] = 123 + assert cache._maybe_ttl_clear.call_count == 1 + assert len(cache) == 0 + assert "123" not in cache.keys() + assert cache._maybe_ttl_clear.call_count == 1 + del cache["123"] + assert cache._maybe_ttl_clear.call_count == 1 + assert len(cache) == 0 + + def test_maybe_ttl_clear(self): + time.time = MagicMock(return_value=0) + cache = BigTableValueCache(ttl=5) + assert cache.init_ts == 0 + + cache._maybe_ttl_clear() + assert cache.ttl_over is False # Nothing cleared + + time.time.return_value = 5 + cache._maybe_ttl_clear() + assert cache.ttl_over is False # Nothing cleared, edge case + + time.time.return_value = 6 + cache._maybe_ttl_clear() + assert cache.ttl_over is True # Nothing cleared, edge case + + class TestBigTableStore: TEST_KEY1 = b"TEST_KEY1" TEST_KEY2 = b"TEST_KEY2" @@ -176,6 +236,7 @@ def add_test_data(self, keys): options = {} options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" options[BigTableStore.BT_PROJECT_KEY] = "bt_project" + options[BigTableStore.VALUE_CACHE_ENABLE_KEY] = True store = BigTableStore( "bigtable://", MagicMock(), MagicMock(), options=options ) From 49f8a882ae4b2697944cb2d9ec01645220ec1877 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 6 Dec 2022 14:31:03 +0100 Subject: [PATCH 288/616] removed is complete in bigtable_get --- faust/stores/bigtable.py | 45 ++++++++-------------------------------- 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 59b7763b6..9827c9ac8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -94,19 +94,16 @@ def keys(self): class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[BigTableValueCache] - _key_cache: Optional[Set] _mutations: Dict[bytes, Tuple[BT.DirectRow, Optional[bytes]]] def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: + self.is_complete = False self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) - self._init_key_cache(options) self._init_mutation_buffer(options) - self.partition_prefixes: Dict[int, bytes] self._filled_partitions: Set[int] = set() - self.is_complete = False def _fill_if_empty(self, bt_keys: Set[bytes]): # THIS ONLY WORKS IF THE FIRST BYTE OF THE KEY IS THE PARTITION @@ -131,13 +128,13 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): for row in self.bt_table.read_rows( row_set=row_set, filter_=BT.CellsColumnLimitFilter(1) ): - value = BigTableStore.bigtable_exrtact_row_data(row) - self._value_cache[row.row_key] = value - elif self._key_cache is not None: - for row in self.bt_table.read_rows( - row_set=row_set, filter_=BT.CellsColumnLimitFilter(1) - ): - self._key_cache.add(row.row_key) + if row.row_key in self._mutations.keys(): + mutation_val = self._mutations[row.row_key][1] + if mutation_val is not None: + self._value_cache[row.row_key] = mutation_val + else: + value = BigTableStore.bigtable_exrtact_row_data(row) + self._value_cache[row.row_key] = value self._filled_partitions.update(partitions_to_fill) def get(self, bt_key: bytes) -> Optional[bytes]: @@ -151,15 +148,11 @@ def get(self, bt_key: bytes) -> Optional[bytes]: def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: self._value_cache[bt_key] = value - if self._key_cache is not None: - self._key_cache.add(bt_key) self._set_mutation(bt_key, value) def delete(self, bt_key: bytes) -> None: if self._value_cache is not None: del self._value_cache[bt_key] - if self._key_cache is not None: - self._key_cache.discard(bt_key) self._set_mutation(bt_key, None) def get_partition(self, user_key: bytes) -> int: @@ -181,11 +174,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return res elif res is True: return True - - elif self._key_cache is not None: - # Keycache is not filled so no assumptions about missing keys - if bt_key in self._key_cache: - return True return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: @@ -197,10 +185,6 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: return res elif res is True: return True - elif self._key_cache is not None: - # Keycache is not filled so no assumptions about missing keys - if not self._key_cache.isdisjoint(key_set): - return True # No assumption possible return None @@ -254,17 +238,6 @@ def _init_value_cache( else: self._value_cache = None - def _init_key_cache(self, options: Dict): - key_cache_enabled = options.get( - BigTableStore.KEY_CACHE_ENABLE_KEY, False - ) - # We don't need a key cache if we use a value cache already - if self._value_cache is None and key_cache_enabled: - if key_cache_enabled: - self._key_cache = set() - else: - self._key_cache = None - def _init_mutation_buffer(self, options): self._mut_freq = options.get(BigTableStore.BT_MUTATION_FREQ_KEY, 0) # To prevent that all tables write at the same time @@ -355,7 +328,7 @@ def bigtable_exrtact_row_data(row_data): def _bigtable_get(self, key: bytes) -> Optional[bytes]: value = self._cache.get(key) - if value is not None or self._cache.is_complete: + if value is not None: return value else: res = self.bt_table.read_row(key, filter_=self.row_filter) From 48c3ad16d79a733dd2e03acee816137809328cf0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 6 Dec 2022 15:01:51 +0100 Subject: [PATCH 289/616] added max mutations setting --- faust/stores/bigtable.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9827c9ac8..519c9b469 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -3,7 +3,17 @@ import random import time import traceback -from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + Optional, + Set, + Tuple, + Union, +) try: # pragma: no cover from google.cloud.bigtable import column_family @@ -24,6 +34,7 @@ class BT: CellsColumnLimitFilter = CellsColumnLimitFilter RowSet = RowSet Table = Table + except ImportError: # pragma: no cover BT = None # noqa @@ -191,10 +202,14 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: def flush_if_timer_over(self, tp: TP) -> bool: now = time.time() flushed = False - if now >= self._last_flush + self._mut_freq: + last_flush = self._last_flush.get(tp.partition, now - self._mut_freq) + max_reached = len(self._mutations) >= self._max_mutations + if now >= last_flush + self._mut_freq or max_reached: mutatations_copy = self._mutations.copy() mutatations = [ - m[0] for m in mutatations_copy.values() if tp.partition == m[0].row_key[0] + m[0] + for m in mutatations_copy.values() + if tp.partition == m[0].row_key[0] ] if len(mutatations) > 0: response = self.bt_table.mutate_rows(mutatations) @@ -203,8 +218,8 @@ def flush_if_timer_over(self, tp: TP) -> bool: self.log.error(f"Row number {i} failed to write") else: self._mutations.pop(mutatations[i].row_key) - flushed = True - self._last_flush = now + flushed = True + self._last_flush[tp.partition] = now return flushed def _set_mutation(self, bt_key: bytes, value: Optional[bytes]): @@ -242,7 +257,12 @@ def _init_mutation_buffer(self, options): self._mut_freq = options.get(BigTableStore.BT_MUTATION_FREQ_KEY, 0) # To prevent that all tables write at the same time random_start_offset = random.randint(0, self._mut_freq) - self._last_flush = time.time() + self._mut_freq - random_start_offset + self._last_flush = ( + {} + ) # time.time() + self._mut_freq - random_start_offset + self._max_mutations = options.get( + BigTableStore.BT_MAX_MUTATIONS, 10000 + ) self._mutations = {} @@ -261,7 +281,7 @@ class BigTableStore(base.SerializedStore): BT_PROJECT_KEY = "bt_project_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" BT_MUTATION_FREQ_KEY = "bt_mutation_freq_key" - KEY_CACHE_ENABLE_KEY = "key_cache_enable_key" + BT_MAX_MUTATIONS = "bt_max_mutations" VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" @@ -313,7 +333,9 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): ) self.bt_table.create( column_families={ - self.column_family_id: BT.column_family.MaxVersionsGCRule(1) + self.column_family_id: BT.column_family.MaxVersionsGCRule( + 1 + ) } ) else: From d6281151d847dc3a2dd78ce0c4136a260c610655 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 6 Dec 2022 15:02:10 +0100 Subject: [PATCH 290/616] added unit tests --- tests/unit/stores/test_bigtable.py | 421 +++++++++++++++++++++++++---- 1 file changed, 362 insertions(+), 59 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 6dd4de38e..9834c6b49 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -13,6 +13,73 @@ from faust.types.tuples import TP +class TestResponse: + def __init__(self, code) -> None: + self.code = code + + +class RowSetMock: + # We will mock rowsets in a way that it is just a + # list with all requested keys, so that we then just call + # read_row of the mocked bigtable multiple times + def __init__(self) -> None: + self.keys = set() + self.add_row_key = MagicMock(wraps=self._add_row_key) + self.add_row_range_from_keys = MagicMock( + wraps=self._add_row_range_from_keys + ) + + def _add_row_key(self, key): + self.keys.add(key) + + def _add_row_range_from_keys(self, start_key: bytes, end_key: bytes): + if isinstance(start_key, str): + start_key = start_key.encode() + if isinstance(end_key, str): + end_key = end_key.encode() + self.keys.add(b"".join([start_key, b"_*_", end_key])) + + +class BigTableMock: + def __init__(self) -> None: + self.data = {} + self.read_row = MagicMock(wraps=self._read_row) + self.read_rows = MagicMock(wraps=self._read_rows) + self.name = "test_bigtable" + + def _read_row(self, key: bytes, **kwargs): + res = self.data.get(key, None) + cell_wrapper = MagicMock() + cell_wrapper.value = res + row_wrapper = [cell_wrapper] + if res is None: + return res + row = MagicMock() + row.row_key = key + row.to_dict = MagicMock(return_value={"x": row_wrapper}) + return row + + def _read_rows(self, row_set, **kwargs): + for k in row_set.keys: + res = None + if b"_*_" in k: + for key in self.data.keys(): + start, end = k.split(b"_*_") + if start <= key < end: + yield self._read_row(key) + continue + else: + res = self._read_row(k) + if res is None: + continue + else: + yield res + + def add_test_data(self, keys): + for k in keys: + self.data[k] = k + + class TestBigTableValueCache: def test_init(self): # Test defaults @@ -72,30 +139,308 @@ def test_maybe_ttl_clear(self): assert cache.ttl_over is True # Nothing cleared, edge case -class TestBigTableStore: - TEST_KEY1 = b"TEST_KEY1" - TEST_KEY2 = b"TEST_KEY2" - TEST_KEY3 = b"TEST_KEY3" +class TestBigTableCacheManager: + def test_default__init__(self): + bigtable_mock = BigTableMock() + app_mock = MagicMock() + app_mock.conf = MagicMock() + app_mock.conf.table_key_index_size = 123 + time.time = MagicMock(return_value=0) + + test_manager = BigTableCacheManager(MagicMock(), {}, bigtable_mock) + assert test_manager.bt_table == bigtable_mock + assert test_manager.is_complete is False + assert test_manager._value_cache is None + assert test_manager._mut_freq == 0 + assert test_manager._last_flush == {} + assert test_manager._mutations == {} + assert test_manager._filled_partitions == set() + + def test_iscomplete__init__(self): + bigtable_mock = BigTableMock() + app_mock = MagicMock() + app_mock.conf = MagicMock() + app_mock.conf.table_key_index_size = 2 + time.time = MagicMock(return_value=0) + options = { + BigTableStore.VALUE_CACHE_ENABLE_KEY: True, + } + + test_manager = BigTableCacheManager( + MagicMock(), options, bigtable_mock + ) + assert test_manager.bt_table == bigtable_mock + assert test_manager.is_complete is True + assert isinstance(test_manager._value_cache, BigTableValueCache) + assert test_manager._mut_freq == 0 + assert test_manager._last_flush == {} + assert test_manager._mutations == {} + assert test_manager._filled_partitions == set() @pytest.fixture() def bt_imports(self): - # We will mock rowsets in a way that it is just a - # list with all requested keys, so that we then just call - # read_row of the mocked bigtable multiple times - class RowSetMock: - def __init__(self) -> None: - self.keys = set() - self.add_row_key = MagicMock(wraps=self._add_row_key) - self.add_row_range_from_keys = MagicMock( - wraps=self._add_row_range_from_keys + with patch("faust.stores.bigtable.BT") as bt: + bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") + bt.column_family.MaxVersionsGCRule = MagicMock( + return_value="a_rule" + ) + bt.RowSet = MagicMock(return_value=RowSetMock()) + yield bt + + @pytest.fixture() + def manager(self, bt_imports): + with patch("faust.stores.bigtable.BT", bt_imports): + with patch( + "faust.stores.bigtable.time.time", MagicMock(return_value=0) + ): + bigtable_mock = BigTableMock() + app_mock = MagicMock() + app_mock.conf = MagicMock() + app_mock.conf.table_key_index_size = 123 + + options = { + BigTableStore.VALUE_CACHE_ENABLE_KEY: True, + BigTableStore.BT_MUTATION_FREQ_KEY: 600, + } + manager = BigTableCacheManager( + MagicMock(), options, bigtable_mock ) + manager._partition_cache = {} + return manager + + def test_fill_if_empty(self, manager): + key = b"\x13AAA" + manager.bt_table.add_test_data({key}) + # Scenario 1: Everything empty + manager._fill_if_empty({key}) + assert manager.bt_table.read_rows.call_count == 1 + assert manager._filled_partitions == {19} + + manager._fill_if_empty({key}) + assert manager.bt_table.read_rows.call_count == 1 + assert manager._filled_partitions == {19} + + manager._fill_if_empty({b"\x10XXX"}) + assert manager.bt_table.read_rows.call_count == 2 + assert manager._filled_partitions == {19, 16} + assert manager.contains(key) + + def test_fill_if_empty_with_mutation(self, manager): + key = b"\x13AAA" + manager.bt_table.add_test_data({key}) + manager._mutations = {key: (MagicMock(), "some_row_mutation")} + manager._fill_if_empty({key}) + assert manager.contains(key) + assert manager.get(key) == "some_row_mutation" + + def test_get(self, manager): + # Adding the key here is sufficient, because the cache gets filled + key_in = b"\x13AAA" + key_not_in = b"\x13BBB" + manager.bt_table.add_test_data({key_in}) + + manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) + + res = manager.get(key_in) + manager._fill_if_empty.assert_called_once_with({key_in}) + assert res == key_in + + res = manager.get(key_not_in) + manager._fill_if_empty.assert_called_with({key_not_in}) + assert res is None - def _add_row_key(self, key): - self.keys.add(key) + manager._value_cache = None + res = manager.get(key_in) + manager._fill_if_empty.assert_called_with({key_in}) + assert res is None - def _add_row_range_from_keys(self, start: bytes, end: bytes): - self.keys.add(b"".join([start, b"_*_", end])) + def test_set(self, manager): + manager._set_mutation = MagicMock() + key_1 = b"\x13AAA" + key_2 = b"\x13ABB" + manager.set(key_1, key_1) + manager._set_mutation.assert_called_once_with(key_1, key_1) + assert manager.contains(key_1) + assert manager.contains(key_2) is False + + manager.set(key_2, key_2) + manager._set_mutation.assert_called_with(key_2, key_2) + assert manager.contains(key_1) + assert manager.contains(key_2) + assert manager.get(key_1) == key_1 + assert manager.get(key_2) == key_2 + + def test_delete(self, manager): + manager._set_mutation = MagicMock() + key_1 = b"\x13AAA" + key_2 = b"\x13ABB" + manager.set(key_1, key_1) + assert manager.contains(key_1) + manager.delete(key_1) + manager._set_mutation.assert_called_with(key_1, None) + assert not manager.contains(key_1) + manager.delete(key_2) + manager._set_mutation.assert_called_with(key_2, None) + + def test_partition_cache(self, manager): + key = b"aaa" + with pytest.raises(KeyError): + manager.get_partition(key) + manager.set_partition(key, 13) + assert manager.get_partition(key) == 13 + manager.set_partition(key, 15) + assert manager.get_partition(key) == 15 + + def test_contains(self, manager): + # Adding the key here is sufficient, because the cache gets filled + key_in = b"\x13AAA" + key_not_in = b"\x13BBB" + manager.bt_table.add_test_data({key_in}) + manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) + + manager.is_complete = True + assert manager.contains(key_in) is True + manager._fill_if_empty.assert_called_with({key_in}) + assert manager.contains(key_not_in) is False + manager._fill_if_empty.assert_called_with({key_not_in}) + + manager.is_complete = False + assert manager.contains(key_in) is True + manager._fill_if_empty.assert_called_with({key_in}) + assert manager.contains(key_not_in) is None + manager._fill_if_empty.assert_called_with({key_not_in}) + + manager._value_cache = None + assert manager.contains(key_in) is None + manager._fill_if_empty.assert_called_with({key_in}) + assert manager.contains(key_not_in) is None + manager._fill_if_empty.assert_called_with({key_not_in}) + + def test_contains_any(self, manager): + # Adding the key here is sufficient, because the cache gets filled + key_in = b"\x13AAA" + key_not_in = b"\x13BBB" + manager.bt_table.add_test_data({key_in}) + manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) + + manager.is_complete = True + assert manager.contains_any({key_in, key_not_in}) is True + manager._fill_if_empty.assert_called_with({key_in, key_not_in}) + assert manager.contains_any({key_not_in}) is False + manager._fill_if_empty.assert_called_with({key_not_in}) + + manager.is_complete = False + assert manager.contains_any({key_in, key_not_in}) is True + manager._fill_if_empty.assert_called_with({key_in, key_not_in}) + assert manager.contains_any({key_not_in}) is None + manager._fill_if_empty.assert_called_with({key_not_in}) + + manager._value_cache = None + assert manager.contains_any({key_in, key_not_in}) is None + manager._fill_if_empty.assert_called_with({key_in, key_not_in}) + assert manager.contains_any({key_not_in}) is None + manager._fill_if_empty.assert_called_with({key_not_in}) + + def test_flush_if_timer_over(self, manager): + tp = TP("a_topic", partition=19) + tp2 = TP("a_topic", partition=0) + time.time = MagicMock(return_value=0) + manager.bt_table.mutate_rows = MagicMock( + return_value=[TestResponse(404)] + ) + row_mock = MagicMock() + row_mock.row_key = b"\x13AAA" + manager._mutations = { + row_mock.row_key: (row_mock, "some_row_mutation") + } + + with patch( + "faust.stores.bigtable.time.time", MagicMock(return_value=0) + ): + assert manager.flush_if_timer_over(tp) is True + assert manager._last_flush == {tp.partition: 0} + assert manager.flush_if_timer_over(tp) is False + + + with patch( + "faust.stores.bigtable.time.time", + MagicMock(return_value=manager._mut_freq), + ): + assert manager.flush_if_timer_over(tp2) is True + assert manager._last_flush == { + tp2.partition: manager._mut_freq, + tp.partition: 0, + } + + assert manager.flush_if_timer_over(tp) is True + assert len(manager._mutations) == 1 # Not dropped, due to ERR. 404 + assert manager._last_flush == { + tp2.partition: manager._mut_freq, + tp.partition: manager._mut_freq, + } + assert manager.flush_if_timer_over(tp) is False + + manager._last_flush = {} + manager.bt_table.mutate_rows = MagicMock( + return_value=[TestResponse(0)] + ) + + with patch( + "faust.stores.bigtable.time.time", + MagicMock(return_value=manager._mut_freq), + ): + assert manager.flush_if_timer_over(tp) is True + assert manager._last_flush == {tp.partition: manager._mut_freq} + assert len(manager._mutations) == 0 + + def test_flush_if_timer_over_on_max_count(self, manager): + tp = TP("a_topic", partition=19) + row_mock = MagicMock() + row_mock.row_key = b"\x13AAA" + manager._mutations = { + row_mock.row_key: (row_mock, "some_row_mutation") + } + manager._max_mutations = 1 + manager._last_flush = {tp.partition: 999999999999} + manager.bt_table.mutate_rows = MagicMock( + return_value=[TestResponse(0)] + ) + with patch( + "faust.stores.bigtable.time.time", MagicMock(return_value=0) + ): + assert manager.flush_if_timer_over(tp) is True + + def test_set_mutation(self, manager): + row_mock = MagicMock() + row_mock.delete = MagicMock() + row_mock.set_cell = MagicMock() + row_mock.row_key = b"\x13AAA" + + manager.bt_table.direct_row = MagicMock(return_value=row_mock) + + assert len(manager._mutations) == 0 + manager._set_mutation(row_mock.row_key, "new_value") + manager.bt_table.direct_row.assert_called_once_with(row_mock.row_key) + row_mock.set_cell.assert_called_once_with( + "FaustColumnFamily", "DATA", "new_value" + ) + assert manager._mutations[row_mock.row_key][1] == "new_value" + assert len(manager._mutations) == 1 + + manager._set_mutation(row_mock.row_key, None) + row_mock.delete.assert_called_once() + assert manager._mutations[row_mock.row_key][1] is None + assert len(manager._mutations) == 1 + + +class TestBigTableStore: + TEST_KEY1 = b"TEST_KEY1" + TEST_KEY2 = b"TEST_KEY2" + TEST_KEY3 = b"TEST_KEY3" + + @pytest.fixture() + def bt_imports(self): with patch("faust.stores.bigtable.BT") as bt: bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") bt.column_family.MaxVersionsGCRule = MagicMock( @@ -194,44 +539,6 @@ def table_name_gen(table): @pytest.fixture() def store(self, bt_imports): - class BigTableMock: - def __init__(self) -> None: - self.data = {} - self.read_row = MagicMock(wraps=self._read_row) - self.read_rows = MagicMock(wraps=self._read_rows) - - def _read_row(self, key: bytes, **kwargs): - res = self.data.get(key, None) - cell_wrapper = MagicMock() - cell_wrapper.value = res - row_wrapper = [cell_wrapper] - if res is None: - return res - row = MagicMock() - row.row_key = key - row.to_dict = MagicMock(return_value={"x": row_wrapper}) - return row - - def _read_rows(self, row_set, **kwargs): - for k in row_set.keys: - res = None - if b"_*_" in k: - for key in self.data.keys(): - start, end = k.split(b"_*_") - if start <= key < end: - yield self._read_row(key) - continue - else: - res = self._read_row(k) - if res is None: - continue - else: - yield res - - def add_test_data(self, keys): - for k in keys: - self.data[k] = k - with patch("faust.stores.bigtable.BT", bt_imports): options = {} options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" @@ -714,10 +1021,6 @@ def test_set_persisted_offset(self, store): ) def test_persist_changelog_batch(self, store): - class TestResponse: - def __init__(self, code) -> None: - self.code = code - # Scenario 1: no failure store.bt_table.mutate_rows = MagicMock( return_value=[TestResponse(0)] * 10 From 217ef41eb1b48ef413204a229dafb09529076bf5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 6 Dec 2022 16:01:22 +0100 Subject: [PATCH 291/616] fixed tests --- tests/unit/stores/test_bigtable.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 9834c6b49..0323fbcba 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -552,15 +552,23 @@ def store(self, bt_imports): def test_bigtable_bigtable_get_on_empty(self, store): store._cache.get = MagicMock(return_value=None) + store._cache.is_complete = True return_value = store._bigtable_get(self.TEST_KEY1) + store._cache.get.assert_called_with(self.TEST_KEY1) + assert return_value is None + store.bt_table.read_row.assert_not_called + + store._cache.is_complete = False + return_value = store._bigtable_get(self.TEST_KEY1) + store._cache.get.assert_called_with(self.TEST_KEY1) store.bt_table.read_row.assert_called_once_with( self.TEST_KEY1, filter_="a_filter" ) - store._cache.get.assert_called_once_with(self.TEST_KEY1) assert return_value is None def test_bigtable_bigtable_get_cache_miss(self, store): store._cache.get = MagicMock(return_value=None) + store._cache.is_complete = False store.bt_table.add_test_data([self.TEST_KEY1]) return_value = store._bigtable_get(self.TEST_KEY1) store._cache.get.assert_called_once_with(self.TEST_KEY1) From b4959c6047684e6de6199e537f4526f30de3d693 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 6 Dec 2022 16:02:54 +0100 Subject: [PATCH 292/616] fixed is_complete in bigtable_get again --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 519c9b469..0e2f156c1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -350,7 +350,7 @@ def bigtable_exrtact_row_data(row_data): def _bigtable_get(self, key: bytes) -> Optional[bytes]: value = self._cache.get(key) - if value is not None: + if value is not None or self._cache.is_complete: return value else: res = self.bt_table.read_row(key, filter_=self.row_filter) From 2bdffa9bc43ab3b0b5bee1b63102fec20eb40dec Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 6 Dec 2022 17:25:14 +0100 Subject: [PATCH 293/616] added tests for iterkeys --- tests/unit/stores/test_bigtable.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 0323fbcba..f3b1d538b 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -362,7 +362,6 @@ def test_flush_if_timer_over(self, manager): assert manager._last_flush == {tp.partition: 0} assert manager.flush_if_timer_over(tp) is False - with patch( "faust.stores.bigtable.time.time", MagicMock(return_value=manager._mut_freq), @@ -433,6 +432,23 @@ def test_set_mutation(self, manager): assert manager._mutations[row_mock.row_key][1] is None assert len(manager._mutations) == 1 + def test_iterkeys(self, manager): + key_in = b"\x13AAA" + manager.bt_table.add_test_data({key_in}) + manager._fill_if_empty_and_yield = MagicMock(wraps=manager._fill_if_empty_and_yield) + + res = list(manager.iterkeys()) + manager._fill_if_empty_and_yield.assert_not_called() + assert res == [] # cache should not be filled yet + + res = list(manager.iterkeys({key_in})) + manager._fill_if_empty_and_yield.assert_called_once_with({key_in}) + assert key_in in res + + res = list(manager.iterkeys()) + assert manager._fill_if_empty_and_yield.call_count == 1 + assert key_in in res + class TestBigTableStore: TEST_KEY1 = b"TEST_KEY1" From 28d744a04fa7a9a4664194c11951b946bba401b2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 6 Dec 2022 17:29:42 +0100 Subject: [PATCH 294/616] fixed testcases --- tests/unit/stores/test_bigtable.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index f3b1d538b..beb98f1f6 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -911,17 +911,14 @@ def test_iteritems(self, store): def test_iterkeys_with_complete_cache(self, store): store._cache.is_complete = True store._active_partitions = MagicMock(return_value=[1, 3]) - store._cache._fill_if_empty = MagicMock() - keys_in_cache = [] keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY1, 1)) keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY2, 2)) keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + store._cache.iterkeys = MagicMock(return_value=keys_in_cache) - store._cache._value_cache = MagicMock() - store._cache._value_cache.keys = MagicMock(return_value=keys_in_cache) all_res = sorted(store._iterkeys()) - store._cache._fill_if_empty.assert_called_once_with( + store._cache.iterkeys.assert_called_once_with( { store._get_partition_prefix(1), store._get_partition_prefix(3), @@ -936,7 +933,7 @@ def test_iterkeys_with_complete_cache(self, store): def test_iterkeys_with_no_complete_cache(self, store): store._cache.is_complete = False store._active_partitions = MagicMock(return_value=[1, 3]) - store._cache._fill_if_empty = MagicMock() + store._cache.iterkeys = MagicMock() keys_in_store = [] keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) @@ -944,7 +941,7 @@ def test_iterkeys_with_no_complete_cache(self, store): store.bt_table.add_test_data(keys_in_store) all_res = sorted(store._iterkeys()) - store._cache._fill_if_empty.assert_not_called() + store._cache.iterkeys.assert_not_called() assert all_res == [ self.TEST_KEY1, self.TEST_KEY3, From b76f4a96adf03db9b41f7fb3d9b02f2da6cd02f0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 6 Dec 2022 17:29:57 +0100 Subject: [PATCH 295/616] faster iterkeys --- faust/stores/bigtable.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0e2f156c1..c62e43566 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,4 +1,5 @@ """BigTable storage.""" +from collections import deque import logging import random import time @@ -116,7 +117,16 @@ def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self._init_mutation_buffer(options) self._filled_partitions: Set[int] = set() - def _fill_if_empty(self, bt_keys: Set[bytes]): + def _fill_if_empty(self, bt_keys): + deque(self._fill_if_empty_and_yield(bt_keys), maxlen=0) + + def iterkeys(self, bt_keys: Optional[Set[bytes]] = None) -> Iterator: + if self._value_cache is not None: + yield from self._value_cache.keys() + if bt_keys is not None: + yield from self._fill_if_empty_and_yield(bt_keys) + + def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): # THIS ONLY WORKS IF THE FIRST BYTE OF THE KEY IS THE PARTITION partitions = set() for k in bt_keys: @@ -146,6 +156,7 @@ def _fill_if_empty(self, bt_keys: Set[bytes]): else: value = BigTableStore.bigtable_exrtact_row_data(row) self._value_cache[row.row_key] = value + yield row.row_key self._filled_partitions.update(partitions_to_fill) def get(self, bt_key: bytes) -> Optional[bytes]: @@ -575,8 +586,7 @@ def _iterkeys(self) -> Iterator[bytes]: keys = set() for p in partitions: keys.add(self._get_partition_prefix(p)) - self._cache._fill_if_empty(keys) - for k in self._cache._value_cache.keys(): + for k in self._cache.iterkeys(keys): yield self._remove_partition_prefix(k) else: row_set = BT.RowSet() From a57335c6f104df502179da8888d5e242f9375fb0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 12 Dec 2022 08:09:26 +0100 Subject: [PATCH 296/616] moved log and added test --- faust/stores/bigtable.py | 8 ++++---- tests/unit/stores/test_bigtable.py | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c62e43566..00bf6e7cd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -141,11 +141,11 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): start_key=chr(partition), end_key=chr(partition + 1) ) - self.log.info( - f"BigTableStore: Filling cache for {self.bt_table.name} " - f"and partitions {partitions_to_fill}" - ) if self._value_cache is not None: + self.log.info( + f"BigTableStore: Filling cache for {self.bt_table.name} " + f"and partitions {partitions_to_fill}" + ) for row in self.bt_table.read_rows( row_set=row_set, filter_=BT.CellsColumnLimitFilter(1) ): diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index beb98f1f6..36345409d 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -449,6 +449,18 @@ def test_iterkeys(self, manager): assert manager._fill_if_empty_and_yield.call_count == 1 assert key_in in res + def test_fill_if_empty_and_yield(self, manager): + manager.bt_table.add_test_data({b"\x13AAA"}) + res = list(manager._fill_if_empty_and_yield({b"\x13AAA"})) + manager.bt_table.read_rows.assert_called() + assert res == [b"\x13AAA"] + + manager._value_cache = None + manager.bt_table.read_rows.reset_mock() + res = list(manager._fill_if_empty_and_yield({b"\x13AAA"})) + assert res == [] + manager.bt_table.read_rows.assert_not_called() + class TestBigTableStore: TEST_KEY1 = b"TEST_KEY1" From abd79d6068015ca69ab449338dce5be6a2a5ed1e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Dec 2022 10:17:35 +0100 Subject: [PATCH 297/616] fixe tests --- tests/unit/stores/test_bigtable.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 36345409d..349bdc244 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -13,7 +13,7 @@ from faust.types.tuples import TP -class TestResponse: +class MyTestResponse: def __init__(self, code) -> None: self.code = code @@ -346,7 +346,7 @@ def test_flush_if_timer_over(self, manager): tp2 = TP("a_topic", partition=0) time.time = MagicMock(return_value=0) manager.bt_table.mutate_rows = MagicMock( - return_value=[TestResponse(404)] + return_value=[MyTestResponse(404)] ) row_mock = MagicMock() @@ -382,7 +382,7 @@ def test_flush_if_timer_over(self, manager): manager._last_flush = {} manager.bt_table.mutate_rows = MagicMock( - return_value=[TestResponse(0)] + return_value=[MyTestResponse(0)] ) with patch( @@ -403,7 +403,7 @@ def test_flush_if_timer_over_on_max_count(self, manager): manager._max_mutations = 1 manager._last_flush = {tp.partition: 999999999999} manager.bt_table.mutate_rows = MagicMock( - return_value=[TestResponse(0)] + return_value=[MyTestResponse(0)] ) with patch( "faust.stores.bigtable.time.time", MagicMock(return_value=0) @@ -451,6 +451,7 @@ def test_iterkeys(self, manager): def test_fill_if_empty_and_yield(self, manager): manager.bt_table.add_test_data({b"\x13AAA"}) + res = list(manager._fill_if_empty_and_yield({b"\x13AAA"})) manager.bt_table.read_rows.assert_called() assert res == [b"\x13AAA"] @@ -959,7 +960,7 @@ def test_iterkeys_with_no_complete_cache(self, store): self.TEST_KEY3, ] - def test_iteritems(self, store): + def test_itervalues(self, store): keys_in_store = [] keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) @@ -1056,7 +1057,7 @@ def test_set_persisted_offset(self, store): def test_persist_changelog_batch(self, store): # Scenario 1: no failure store.bt_table.mutate_rows = MagicMock( - return_value=[TestResponse(0)] * 10 + return_value=[MyTestResponse(0)] * 10 ) store.log = MagicMock() store.log.error = MagicMock() @@ -1084,7 +1085,7 @@ def test_persist_changelog_batch(self, store): store.set_persisted_offset.reset_mock() store.bt_table.mutate_rows.reset_mock() store.bt_table.mutate_rows = MagicMock( - return_value=[TestResponse(404)] + return_value=[MyTestResponse(404)] ) store._persist_changelog_batch( ["row1", "row2", "etc..."], offset_batch From 0beb72a48a30063c5d86e27963fa63e31edaffa2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Dec 2022 11:23:14 +0100 Subject: [PATCH 298/616] added feature for custom lazy loading of key prefixes --- faust/stores/bigtable.py | 90 +++++++++---------- tests/unit/stores/test_bigtable.py | 137 ++++++++++++++++------------- 2 files changed, 118 insertions(+), 109 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 00bf6e7cd..e92358280 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -112,25 +112,26 @@ def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.is_complete = False self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table + self.custom_partitioning = options.get(BigTableStore.CUSTOM_CACHE_PARTITIONING_KEY, None) self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_mutation_buffer(options) - self._filled_partitions: Set[int] = set() + self._filled_partitions: Set[bytes] = set() def _fill_if_empty(self, bt_keys): + # This is a hack, that enables iterating over all results + # without saving them in memory deque(self._fill_if_empty_and_yield(bt_keys), maxlen=0) - def iterkeys(self, bt_keys: Optional[Set[bytes]] = None) -> Iterator: - if self._value_cache is not None: - yield from self._value_cache.keys() - if bt_keys is not None: - yield from self._fill_if_empty_and_yield(bt_keys) + def _partition_from_key(self, bt_key): + if self.custom_partitioning is not None: + return self.custom_partitioning(bt_key) + return bt_key[0].to_bytes(1, "little") def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): - # THIS ONLY WORKS IF THE FIRST BYTE OF THE KEY IS THE PARTITION partitions = set() for k in bt_keys: - partitions.add(k[0]) + partitions.add(self._partition_from_key(bt_key=k)) partitions_to_fill = partitions.difference(self._filled_partitions) if len(partitions_to_fill) == 0: return @@ -138,7 +139,7 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): row_set = BT.RowSet() for partition in partitions_to_fill: row_set.add_row_range_from_keys( - start_key=chr(partition), end_key=chr(partition + 1) + start_key=partition, end_key=partition, end_inclusive=True ) if self._value_cache is not None: @@ -160,12 +161,13 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): self._filled_partitions.update(partitions_to_fill) def get(self, bt_key: bytes) -> Optional[bytes]: - value = None self._fill_if_empty({bt_key}) if self._value_cache is not None: if bt_key in self._value_cache.keys(): - value = self._value_cache[bt_key] - return value + return self._value_cache[bt_key] + else: + return self._mutations.get(bt_key, (None, None))[1] + return None def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: @@ -189,26 +191,19 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - if self._value_cache is not None: - - res = bt_key in self._value_cache.keys() - if self.is_complete: - return res - elif res is True: - return True - return None + if self._value_cache is None: + if bt_key not in self._mutations.keys(): + return False + return self._mutations[bt_key][1] is not None + return bt_key in self._value_cache.keys() def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: self._fill_if_empty(key_set) - if self._value_cache is not None: - res = not self._value_cache.keys().isdisjoint(key_set) - if self.is_complete: - return res - elif res is True: - return True - # No assumption possible - return None + return not self._value_cache.keys().isdisjoint(key_set) + else: + mutations = key_set.intersection(self._mutations.keys()) + return any(mut[1] is not None for mut in mutations) def flush_if_timer_over(self, tp: TP) -> bool: now = time.time() @@ -296,6 +291,7 @@ class BigTableStore(base.SerializedStore): VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" + CUSTOM_CACHE_PARTITIONING_KEY = "custom_cache_partitioning_key" def __init__( self, @@ -360,9 +356,8 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes) -> Optional[bytes]: - value = self._cache.get(key) - if value is not None or self._cache.is_complete: - return value + if self._cache.contains(key): + return self._cache.get(key) else: res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: @@ -582,23 +577,24 @@ def _iterkeys(self) -> Iterator[bytes]: start = time.time() partitions = self._active_partitions() - if self._cache.is_complete: - keys = set() - for p in partitions: - keys.add(self._get_partition_prefix(p)) - for k in self._cache.iterkeys(keys): + row_set = BT.RowSet() + for partition in partitions: + prefix_start = self._get_partition_prefix(partition) + prefix_end = self._get_partition_prefix(partition + 1) + row_set.add_row_range_from_keys(prefix_start, prefix_end) + + found_mutations = set() + for k, mut in self._cache._mutations.items(): + if mut[1] is not None: yield self._remove_partition_prefix(k) - else: - row_set = BT.RowSet() - for partition in partitions: - prefix_start = self._get_partition_prefix(partition) - prefix_end = self._get_partition_prefix(partition + 1) - row_set.add_row_range_from_keys(prefix_start, prefix_end) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - yield self._remove_partition_prefix(row.row_key) + found_mutations.add(k) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + if row.row_key in found_mutations: + continue + yield self._remove_partition_prefix(row.row_key) end = time.time() self.log.info( f"Finished iterkeys for {self.table_name} in {end - start}s" diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 349bdc244..77dc243e9 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -32,12 +32,17 @@ def __init__(self) -> None: def _add_row_key(self, key): self.keys.add(key) - def _add_row_range_from_keys(self, start_key: bytes, end_key: bytes): + def _add_row_range_from_keys( + self, start_key: bytes, end_key: bytes, end_inclusive=False + ): if isinstance(start_key, str): start_key = start_key.encode() if isinstance(end_key, str): end_key = end_key.encode() - self.keys.add(b"".join([start_key, b"_*_", end_key])) + if end_inclusive: + self.keys.add(b"".join([start_key, b"_*ei_", end_key])) + else: + self.keys.add(b"".join([start_key, b"_*_", end_key])) class BigTableMock: @@ -65,7 +70,13 @@ def _read_rows(self, row_set, **kwargs): if b"_*_" in k: for key in self.data.keys(): start, end = k.split(b"_*_") - if start <= key < end: + if start <= key[: len(end)] < end: + yield self._read_row(key) + continue + elif b"_*ei_" in k: + for key in self.data.keys(): + start, end = k.split(b"_*ei_") + if start <= key[: len(end)] <= end: yield self._read_row(key) continue else: @@ -214,15 +225,15 @@ def test_fill_if_empty(self, manager): # Scenario 1: Everything empty manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._filled_partitions == {19} + assert manager._filled_partitions == {b"\x13"} manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._filled_partitions == {19} + assert manager._filled_partitions == {b"\x13"} manager._fill_if_empty({b"\x10XXX"}) assert manager.bt_table.read_rows.call_count == 2 - assert manager._filled_partitions == {19, 16} + assert manager._filled_partitions == {b"\x13", b"\x10"} assert manager.contains(key) def test_fill_if_empty_with_mutation(self, manager): @@ -238,7 +249,6 @@ def test_get(self, manager): key_in = b"\x13AAA" key_not_in = b"\x13BBB" manager.bt_table.add_test_data({key_in}) - manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) res = manager.get(key_in) @@ -307,13 +317,19 @@ def test_contains(self, manager): manager.is_complete = False assert manager.contains(key_in) is True manager._fill_if_empty.assert_called_with({key_in}) - assert manager.contains(key_not_in) is None + assert manager.contains(key_not_in) is False manager._fill_if_empty.assert_called_with({key_not_in}) manager._value_cache = None - assert manager.contains(key_in) is None + assert manager.contains(key_in) is False + manager._fill_if_empty.assert_called_with({key_in}) + assert manager.contains(key_not_in) is False + manager._fill_if_empty.assert_called_with({key_not_in}) + + manager._mutations = {key_in: (key_in, key_in)} + assert manager.contains(key_in) is True manager._fill_if_empty.assert_called_with({key_in}) - assert manager.contains(key_not_in) is None + assert manager.contains(key_not_in) is False manager._fill_if_empty.assert_called_with({key_not_in}) def test_contains_any(self, manager): @@ -332,13 +348,19 @@ def test_contains_any(self, manager): manager.is_complete = False assert manager.contains_any({key_in, key_not_in}) is True manager._fill_if_empty.assert_called_with({key_in, key_not_in}) - assert manager.contains_any({key_not_in}) is None + assert manager.contains_any({key_not_in}) is False manager._fill_if_empty.assert_called_with({key_not_in}) manager._value_cache = None - assert manager.contains_any({key_in, key_not_in}) is None + assert manager.contains_any({key_in, key_not_in}) is False manager._fill_if_empty.assert_called_with({key_in, key_not_in}) - assert manager.contains_any({key_not_in}) is None + assert manager.contains_any({key_not_in}) is False + manager._fill_if_empty.assert_called_with({key_not_in}) + + manager._mutations = {key_in: (key_in, key_in)} + assert manager.contains_any({key_in, key_not_in}) is True + manager._fill_if_empty.assert_called_with({key_in, key_not_in}) + assert manager.contains_any({key_not_in}) is False manager._fill_if_empty.assert_called_with({key_not_in}) def test_flush_if_timer_over(self, manager): @@ -432,22 +454,24 @@ def test_set_mutation(self, manager): assert manager._mutations[row_mock.row_key][1] is None assert len(manager._mutations) == 1 - def test_iterkeys(self, manager): - key_in = b"\x13AAA" - manager.bt_table.add_test_data({key_in}) - manager._fill_if_empty_and_yield = MagicMock(wraps=manager._fill_if_empty_and_yield) + # def test_iterkeys(self, manager): + # key_in = b"\x13AAA" + # manager.bt_table.add_test_data({key_in}) + # manager._fill_if_empty_and_yield = MagicMock( + # wraps=manager._fill_if_empty_and_yield + # ) - res = list(manager.iterkeys()) - manager._fill_if_empty_and_yield.assert_not_called() - assert res == [] # cache should not be filled yet + # res = list(manager.iterkeys()) + # manager._fill_if_empty_and_yield.assert_not_called() + # assert res == [] # cache should not be filled yet - res = list(manager.iterkeys({key_in})) - manager._fill_if_empty_and_yield.assert_called_once_with({key_in}) - assert key_in in res + # res = list(manager.iterkeys({key_in})) + # manager._fill_if_empty_and_yield.assert_called_once_with({key_in}) + # assert key_in in res - res = list(manager.iterkeys()) - assert manager._fill_if_empty_and_yield.call_count == 1 - assert key_in in res + # res = list(manager.iterkeys()) + # assert manager._fill_if_empty_and_yield.call_count == 1 + # assert key_in in res def test_fill_if_empty_and_yield(self, manager): manager.bt_table.add_test_data({b"\x13AAA"}) @@ -581,26 +605,21 @@ def store(self, bt_imports): def test_bigtable_bigtable_get_on_empty(self, store): store._cache.get = MagicMock(return_value=None) - store._cache.is_complete = True - return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_called_with(self.TEST_KEY1) - assert return_value is None - store.bt_table.read_row.assert_not_called - - store._cache.is_complete = False + store._cache.contains = MagicMock(return_value=False) return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_called_with(self.TEST_KEY1) - store.bt_table.read_row.assert_called_once_with( + store._cache.contains.assert_called_with(self.TEST_KEY1) + store._cache.get.assert_not_called() + store.bt_table.read_row.assert_called_with( self.TEST_KEY1, filter_="a_filter" ) assert return_value is None def test_bigtable_bigtable_get_cache_miss(self, store): store._cache.get = MagicMock(return_value=None) - store._cache.is_complete = False + store._cache.contains = MagicMock(return_value=False) store.bt_table.add_test_data([self.TEST_KEY1]) return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_called_once_with(self.TEST_KEY1) + store._cache.get.assert_not_called() store.bt_table.read_row.assert_called_once_with( self.TEST_KEY1, filter_="a_filter" ) @@ -608,6 +627,7 @@ def test_bigtable_bigtable_get_cache_miss(self, store): def test_bigtable_bigtable_get_cache_hit(self, store): store.bt_table.add_test_data([self.TEST_KEY1]) + store._cache.contains = MagicMock(return_value=True) store._cache.get = MagicMock(return_value=b"cache_res") return_value = store._bigtable_get(self.TEST_KEY1) store._cache.get.assert_called_once_with(self.TEST_KEY1) @@ -921,44 +941,37 @@ def test_iteritems(self, store): (self.TEST_KEY3, keys_in_store[2]), ] - def test_iterkeys_with_complete_cache(self, store): - store._cache.is_complete = True + def test_iterkeys(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) - keys_in_cache = [] - keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY1, 1)) - keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY2, 2)) - keys_in_cache.append(store._get_key_with_partition(self.TEST_KEY3, 3)) - store._cache.iterkeys = MagicMock(return_value=keys_in_cache) + keys_in_store = [] + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) + keys_in_store.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + store.bt_table.add_test_data(keys_in_store) all_res = sorted(store._iterkeys()) - store._cache.iterkeys.assert_called_once_with( - { - store._get_partition_prefix(1), - store._get_partition_prefix(3), - } - ) assert all_res == [ self.TEST_KEY1, - self.TEST_KEY2, self.TEST_KEY3, ] - def test_iterkeys_with_no_complete_cache(self, store): - store._cache.is_complete = False + def test_iterkeys_with_mutattions(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) - store._cache.iterkeys = MagicMock() keys_in_store = [] - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + k1 = store._get_key_with_partition(self.TEST_KEY1, 1) + k2 = store._get_key_with_partition(self.TEST_KEY2, 2) + k3 = store._get_key_with_partition(self.TEST_KEY3, 3) + keys_in_store.append(k1) + keys_in_store.append(k2) + keys_in_store.append(k3) store.bt_table.add_test_data(keys_in_store) + store._cache._mutations = { + k1: (k1, None), + k3: (k3, "HAS SOME VALUE"), + } all_res = sorted(store._iterkeys()) - store._cache.iterkeys.assert_not_called() - assert all_res == [ - self.TEST_KEY1, - self.TEST_KEY3, - ] + assert all_res == [self.TEST_KEY3] def test_itervalues(self, store): keys_in_store = [] From be59e01a0bbee1ded439f352acf2405204329220 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Dec 2022 11:36:39 +0100 Subject: [PATCH 299/616] added test for custom partitioning --- tests/unit/stores/test_bigtable.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 77dc243e9..c03070856 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -236,6 +236,29 @@ def test_fill_if_empty(self, manager): assert manager._filled_partitions == {b"\x13", b"\x10"} assert manager.contains(key) + def test_fill_if_empty_with_custom_partitioner(self, manager): + def custom_cache_partitioner(key: bytes): + prefix_len = 1 # bytes + id_len = 2 # bytes + key_partition_len = prefix_len + id_len + return key[:key_partition_len] + manager.custom_partitioning = custom_cache_partitioner + key = b"\x13PPAAAAAAAA" + manager.bt_table.add_test_data({key}) + # Scenario 1: Everything empty + manager._fill_if_empty({key}) + assert manager.bt_table.read_rows.call_count == 1 + assert manager._filled_partitions == {b"\x13PP"} + + manager._fill_if_empty({key}) + assert manager.bt_table.read_rows.call_count == 1 + assert manager._filled_partitions == {b"\x13PP"} + + manager._fill_if_empty({b"\x10XXX"}) + assert manager.bt_table.read_rows.call_count == 2 + assert manager._filled_partitions == {b"\x13PP", b"\x10XX"} + assert manager.contains(key) + def test_fill_if_empty_with_mutation(self, manager): key = b"\x13AAA" manager.bt_table.add_test_data({key}) From 049af84590228a2b00d085a2a1e03e4638476667 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Dec 2022 13:16:36 +0100 Subject: [PATCH 300/616] fixed contains calls in bt --- faust/stores/bigtable.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e92358280..0d92474a3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -143,10 +143,6 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): ) if self._value_cache is not None: - self.log.info( - f"BigTableStore: Filling cache for {self.bt_table.name} " - f"and partitions {partitions_to_fill}" - ) for row in self.bt_table.read_rows( row_set=row_set, filter_=BT.CellsColumnLimitFilter(1) ): @@ -369,7 +365,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_contains(self, key: bytes) -> bool: cache_contains = self._cache.contains(key) - if cache_contains is not None: + if cache_contains is True: return cache_contains row = self.bt_table.read_row(key, filter_=self.row_filter) @@ -381,7 +377,7 @@ def _bigtable_contains(self, key: bytes) -> bool: def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: cache_contains = self._cache.contains_any(keys) - if cache_contains is not None: + if cache_contains is True: return cache_contains rows = BT.RowSet() From 07cfd0fa666fb930857b0b9d22eb1f551332f907 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Dec 2022 13:18:23 +0100 Subject: [PATCH 301/616] fixed tests --- tests/unit/stores/test_bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index c03070856..2886dc01b 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -716,9 +716,9 @@ def test_bigtable_contains(self, store): store._cache.contains = MagicMock(return_value=False) return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_not_called() + store.bt_table.read_row.assert_called_with(self.TEST_KEY1, filter_="a_filter") store._cache.delete.assert_not_called() - assert return_value is False + assert return_value is True def test_bigtable_contains_any(self, store): store.bt_table.add_test_data([self.TEST_KEY1]) From a66b253f20d3f3c1e74962bace33cfbee5528d55 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Dec 2022 13:53:17 +0100 Subject: [PATCH 302/616] added logging --- faust/stores/bigtable.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0d92474a3..4f51ce890 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -136,6 +136,7 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): if len(partitions_to_fill) == 0: return + start = time.time() row_set = BT.RowSet() for partition in partitions_to_fill: row_set.add_row_range_from_keys( @@ -154,6 +155,8 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): value = BigTableStore.bigtable_exrtact_row_data(row) self._value_cache[row.row_key] = value yield row.row_key + end = time.time() + self.log.info(f"Finished fill for table {self.bt_table.name}:{partitions_to_fill} in {end-start}s") self._filled_partitions.update(partitions_to_fill) def get(self, bt_key: bytes) -> Optional[bytes]: @@ -590,7 +593,12 @@ def _iterkeys(self) -> Iterator[bytes]: ): if row.row_key in found_mutations: continue + if self._cache._value_cache is not None: + data = self.bigtable_exrtact_row_data(row) + # We don't want to set mutations here + self._cache._value_cache[row.row_key] = data yield self._remove_partition_prefix(row.row_key) + end = time.time() self.log.info( f"Finished iterkeys for {self.table_name} in {end - start}s" From b11a275ca0dcf1eba6faea14d87a35f5bad88354 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 19 Dec 2022 14:57:54 +0100 Subject: [PATCH 303/616] added cache partition to filled partitions after iterkeys --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4f51ce890..e8b72a5ed 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -597,6 +597,8 @@ def _iterkeys(self) -> Iterator[bytes]: data = self.bigtable_exrtact_row_data(row) # We don't want to set mutations here self._cache._value_cache[row.row_key] = data + cache_partition = self._cache._partition_from_key(row.row_key) + self._cache._filled_partitions.add(cache_partition) yield self._remove_partition_prefix(row.row_key) end = time.time() From 467bafe68318a2ee93d4e48ec16f9d1c4bacf9d1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Dec 2022 09:48:04 +0100 Subject: [PATCH 304/616] added log --- faust/stores/bigtable.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e8b72a5ed..bd3941984 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,9 +1,9 @@ """BigTable storage.""" -from collections import deque import logging import random import time import traceback +from collections import deque from typing import ( Any, Callable, @@ -112,7 +112,9 @@ def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.is_complete = False self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table - self.custom_partitioning = options.get(BigTableStore.CUSTOM_CACHE_PARTITIONING_KEY, None) + self.custom_partitioning = options.get( + BigTableStore.CUSTOM_CACHE_PARTITIONING_KEY, None + ) self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_mutation_buffer(options) @@ -156,7 +158,10 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): self._value_cache[row.row_key] = value yield row.row_key end = time.time() - self.log.info(f"Finished fill for table {self.bt_table.name}:{partitions_to_fill} in {end-start}s") + self.log.info( + "BigTableStore: Finished fill for table" + f"{self.bt_table.name}:{partitions_to_fill} in {end-start}s" + ) self._filled_partitions.update(partitions_to_fill) def get(self, bt_key: bytes) -> Optional[bytes]: @@ -597,7 +602,9 @@ def _iterkeys(self) -> Iterator[bytes]: data = self.bigtable_exrtact_row_data(row) # We don't want to set mutations here self._cache._value_cache[row.row_key] = data - cache_partition = self._cache._partition_from_key(row.row_key) + cache_partition = self._cache._partition_from_key( + row.row_key + ) self._cache._filled_partitions.add(cache_partition) yield self._remove_partition_prefix(row.row_key) From f652e71d23b4c3bd0ae4a670bc14c6d119b3742a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Dec 2022 11:53:26 +0100 Subject: [PATCH 305/616] fixed tests with custom preload filters --- tests/unit/stores/test_bigtable.py | 43 +++++++++--------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 2886dc01b..f58dcf30e 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -160,12 +160,11 @@ def test_default__init__(self): test_manager = BigTableCacheManager(MagicMock(), {}, bigtable_mock) assert test_manager.bt_table == bigtable_mock - assert test_manager.is_complete is False assert test_manager._value_cache is None assert test_manager._mut_freq == 0 assert test_manager._last_flush == {} assert test_manager._mutations == {} - assert test_manager._filled_partitions == set() + assert test_manager._finished_preloads == set() def test_iscomplete__init__(self): bigtable_mock = BigTableMock() @@ -181,12 +180,11 @@ def test_iscomplete__init__(self): MagicMock(), options, bigtable_mock ) assert test_manager.bt_table == bigtable_mock - assert test_manager.is_complete is True assert isinstance(test_manager._value_cache, BigTableValueCache) assert test_manager._mut_freq == 0 assert test_manager._last_flush == {} assert test_manager._mutations == {} - assert test_manager._filled_partitions == set() + assert test_manager._finished_preloads == set() @pytest.fixture() def bt_imports(self): @@ -225,38 +223,35 @@ def test_fill_if_empty(self, manager): # Scenario 1: Everything empty manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._filled_partitions == {b"\x13"} + assert manager._finished_preloads == {b"\x13_***_"} manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._filled_partitions == {b"\x13"} + assert manager._finished_preloads == {b"\x13_***_"} manager._fill_if_empty({b"\x10XXX"}) assert manager.bt_table.read_rows.call_count == 2 - assert manager._filled_partitions == {b"\x13", b"\x10"} + assert manager._finished_preloads == {b"\x13_***_", b"\x10_***_"} assert manager.contains(key) - def test_fill_if_empty_with_custom_partitioner(self, manager): - def custom_cache_partitioner(key: bytes): - prefix_len = 1 # bytes - id_len = 2 # bytes - key_partition_len = prefix_len + id_len - return key[:key_partition_len] - manager.custom_partitioning = custom_cache_partitioner + def test_fill_if_empty_with_pre_and_suffix(self, manager): + manager.preload_prefix = 3 + manager.preload_suffix = 1 + key = b"\x13PPAAAAAAAA" manager.bt_table.add_test_data({key}) # Scenario 1: Everything empty manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._filled_partitions == {b"\x13PP"} + assert manager._finished_preloads == {b"\x13PP_***_A"} manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._filled_partitions == {b"\x13PP"} + assert manager._finished_preloads == {b"\x13PP_***_A"} manager._fill_if_empty({b"\x10XXX"}) assert manager.bt_table.read_rows.call_count == 2 - assert manager._filled_partitions == {b"\x13PP", b"\x10XX"} + assert manager._finished_preloads == {b"\x13PP_***_A", b"\x10XX_***_X"} assert manager.contains(key) def test_fill_if_empty_with_mutation(self, manager): @@ -331,13 +326,6 @@ def test_contains(self, manager): manager.bt_table.add_test_data({key_in}) manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) - manager.is_complete = True - assert manager.contains(key_in) is True - manager._fill_if_empty.assert_called_with({key_in}) - assert manager.contains(key_not_in) is False - manager._fill_if_empty.assert_called_with({key_not_in}) - - manager.is_complete = False assert manager.contains(key_in) is True manager._fill_if_empty.assert_called_with({key_in}) assert manager.contains(key_not_in) is False @@ -362,13 +350,6 @@ def test_contains_any(self, manager): manager.bt_table.add_test_data({key_in}) manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) - manager.is_complete = True - assert manager.contains_any({key_in, key_not_in}) is True - manager._fill_if_empty.assert_called_with({key_in, key_not_in}) - assert manager.contains_any({key_not_in}) is False - manager._fill_if_empty.assert_called_with({key_not_in}) - - manager.is_complete = False assert manager.contains_any({key_in, key_not_in}) is True manager._fill_if_empty.assert_called_with({key_in, key_not_in}) assert manager.contains_any({key_not_in}) is False From 6ce113ff019df41de3c7e87dcf46a6d6c8c4f709 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Dec 2022 11:53:47 +0100 Subject: [PATCH 306/616] added custom preload functionality --- faust/stores/bigtable.py | 72 +++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bd3941984..97371c3fe 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -15,6 +15,7 @@ Tuple, Union, ) +from google.cloud.bigtable.row_filters import RowFilterUnion, RowKeyRegexFilter try: # pragma: no cover from google.cloud.bigtable import column_family @@ -109,45 +110,65 @@ class BigTableCacheManager: _mutations: Dict[bytes, Tuple[BT.DirectRow, Optional[bytes]]] def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: - self.is_complete = False self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table - self.custom_partitioning = options.get( - BigTableStore.CUSTOM_CACHE_PARTITIONING_KEY, None + self.preload_prefix = options.get( + BigTableStore.CACHE_PRELOAD_PREFIX_LEN_KEY, + 1, # Default partition only + ) + self.preload_suffix = options.get( + BigTableStore.CACHE_PRELOAD_SUFFIX_LEN_KEY, 0 # Default skip ) self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self._init_mutation_buffer(options) - self._filled_partitions: Set[bytes] = set() + self._finished_preloads: Set[bytes] = set() def _fill_if_empty(self, bt_keys): # This is a hack, that enables iterating over all results # without saving them in memory deque(self._fill_if_empty_and_yield(bt_keys), maxlen=0) - def _partition_from_key(self, bt_key): - if self.custom_partitioning is not None: - return self.custom_partitioning(bt_key) - return bt_key[0].to_bytes(1, "little") + def _preload_id_from_key(self, bt_key): + prefix = bt_key[: self.preload_prefix] + if self.preload_suffix == 0: + suffix = b"" + else: + suffix = bt_key[-self.preload_suffix :] + return b"_***_".join([prefix, suffix]) + + def _get_preload_rowset_and_filter(self, preload_ids): + row_set = BT.RowSet() + + filters = [] + for preload_id in preload_ids: + prefix, suffix = preload_id.split(b"_***_") + row_set.add_row_range_from_keys( + start_key=prefix, end_key=prefix, end_inclusive=True + ) + filters.append(RowKeyRegexFilter(b"".join([b".*", suffix]))) + if self.preload_suffix > 0: + row_filter = RowFilterUnion(filters=filters) + else: + row_filter = CellsColumnLimitFilter(1) + return row_set, row_filter def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): - partitions = set() + preload_ids = set() for k in bt_keys: - partitions.add(self._partition_from_key(bt_key=k)) - partitions_to_fill = partitions.difference(self._filled_partitions) - if len(partitions_to_fill) == 0: + preload_ids.add(self._preload_id_from_key(bt_key=k)) + preload_ids_todo = preload_ids.difference(self._finished_preloads) + if len(preload_ids_todo) == 0: return start = time.time() - row_set = BT.RowSet() - for partition in partitions_to_fill: - row_set.add_row_range_from_keys( - start_key=partition, end_key=partition, end_inclusive=True - ) if self._value_cache is not None: + row_set, row_filter = self._get_preload_rowset_and_filter( + preload_ids_todo + ) for row in self.bt_table.read_rows( - row_set=row_set, filter_=BT.CellsColumnLimitFilter(1) + row_set=row_set, filter_=row_filter ): if row.row_key in self._mutations.keys(): mutation_val = self._mutations[row.row_key][1] @@ -160,9 +181,9 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): end = time.time() self.log.info( "BigTableStore: Finished fill for table" - f"{self.bt_table.name}:{partitions_to_fill} in {end-start}s" + f"{self.bt_table.name}:{preload_ids_todo} in {end-start}s" ) - self._filled_partitions.update(partitions_to_fill) + self._finished_preloads.update(preload_ids_todo) def get(self, bt_key: bytes) -> Optional[bytes]: self._fill_if_empty({bt_key}) @@ -253,13 +274,12 @@ def _init_value_cache( ) -> Optional[Union[LRUCache, BigTableValueCache]]: enable = options.get(BigTableStore.VALUE_CACHE_ENABLE_KEY, False) if enable: + # TODO Maybe we need to remove invalidation time and size ttl = options.get( BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1 ) size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) self._value_cache = BigTableValueCache(ttl=ttl, size=size) - if ttl == -1 and size is None: - self.is_complete = True else: self._value_cache = None @@ -296,6 +316,8 @@ class BigTableStore(base.SerializedStore): VALUE_CACHE_SIZE_KEY = "value_cache_size_key" VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" CUSTOM_CACHE_PARTITIONING_KEY = "custom_cache_partitioning_key" + CACHE_PRELOAD_SUFFIX_LEN_KEY = "cache_preload_suffix_len_key" + CACHE_PRELOAD_PREFIX_LEN_KEY = "cache_preload_prefix_len_key" def __init__( self, @@ -602,10 +624,8 @@ def _iterkeys(self) -> Iterator[bytes]: data = self.bigtable_exrtact_row_data(row) # We don't want to set mutations here self._cache._value_cache[row.row_key] = data - cache_partition = self._cache._partition_from_key( - row.row_key - ) - self._cache._filled_partitions.add(cache_partition) + preload_id = self._cache._preload_id_from_key(row.row_key) + self._cache._finished_preloads.add(preload_id) yield self._remove_partition_prefix(row.row_key) end = time.time() From 76a2edc8391cd586c294a2bc54f9e0733913c4ea Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Dec 2022 12:26:15 +0100 Subject: [PATCH 307/616] fixed error with row filters --- faust/stores/bigtable.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 97371c3fe..7571b7d48 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -15,7 +15,7 @@ Tuple, Union, ) -from google.cloud.bigtable.row_filters import RowFilterUnion, RowKeyRegexFilter +from google.cloud.bigtable.row_filters import RowFilterChain, RowFilterUnion, RowKeyRegexFilter try: # pragma: no cover from google.cloud.bigtable import column_family @@ -141,14 +141,18 @@ def _get_preload_rowset_and_filter(self, preload_ids): row_set = BT.RowSet() filters = [] + suffix_in = set() for preload_id in preload_ids: prefix, suffix = preload_id.split(b"_***_") row_set.add_row_range_from_keys( start_key=prefix, end_key=prefix, end_inclusive=True ) - filters.append(RowKeyRegexFilter(b"".join([b".*", suffix]))) - if self.preload_suffix > 0: + if suffix not in suffix_in: + filters.append(RowKeyRegexFilter(b"".join([b".*", suffix]))) + if len(filters) > 1: row_filter = RowFilterUnion(filters=filters) + elif len(filters) == 1: + row_filter = RowFilterChain([CellsColumnLimitFilter(1), filters[0]]) else: row_filter = CellsColumnLimitFilter(1) return row_set, row_filter From d81794b94ac037f7b5b870c8e13af27a9666b76f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Dec 2022 13:43:11 +0100 Subject: [PATCH 308/616] faster return on contains because of preloaded organisations --- faust/stores/bigtable.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7571b7d48..be686f7c1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -215,24 +215,27 @@ def set_partition(self, user_key: bytes, partition: int): self._partition_cache[user_key] = partition def contains(self, bt_key: bytes) -> Optional[bool]: - self._fill_if_empty({bt_key}) """ If we return None here, this means, that no assumption about the current key can be made. """ - if self._value_cache is None: - if bt_key not in self._mutations.keys(): - return False + if bt_key in self._mutations.keys(): return self._mutations[bt_key][1] is not None - return bt_key in self._value_cache.keys() + if self._value_cache is not None: + self._fill_if_empty({bt_key}) + return bt_key in self._value_cache.keys() + return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: - self._fill_if_empty(key_set) if self._value_cache is not None: + self._fill_if_empty(key_set) return not self._value_cache.keys().isdisjoint(key_set) else: mutations = key_set.intersection(self._mutations.keys()) - return any(mut[1] is not None for mut in mutations) + found = any(mut[1] is not None for mut in mutations) + if found: + return True + return None def flush_if_timer_over(self, tp: TP) -> bool: now = time.time() @@ -399,7 +402,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_contains(self, key: bytes) -> bool: cache_contains = self._cache.contains(key) - if cache_contains is True: + if cache_contains is not None: return cache_contains row = self.bt_table.read_row(key, filter_=self.row_filter) @@ -411,7 +414,7 @@ def _bigtable_contains(self, key: bytes) -> bool: def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: cache_contains = self._cache.contains_any(keys) - if cache_contains is True: + if cache_contains is not None: return cache_contains rows = BT.RowSet() From 2e3efedb99b00e86e9809baff8a707fbb873d3cd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Dec 2022 14:56:04 +0100 Subject: [PATCH 309/616] faster get and no unecessery filling of caches --- faust/stores/bigtable.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index be686f7c1..b9ef74c6d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -15,7 +15,11 @@ Tuple, Union, ) -from google.cloud.bigtable.row_filters import RowFilterChain, RowFilterUnion, RowKeyRegexFilter +from google.cloud.bigtable.row_filters import ( + RowFilterChain, + RowFilterUnion, + RowKeyRegexFilter, +) try: # pragma: no cover from google.cloud.bigtable import column_family @@ -152,7 +156,9 @@ def _get_preload_rowset_and_filter(self, preload_ids): if len(filters) > 1: row_filter = RowFilterUnion(filters=filters) elif len(filters) == 1: - row_filter = RowFilterChain([CellsColumnLimitFilter(1), filters[0]]) + row_filter = RowFilterChain( + [CellsColumnLimitFilter(1), filters[0]] + ) else: row_filter = CellsColumnLimitFilter(1) return row_set, row_filter @@ -190,12 +196,12 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): self._finished_preloads.update(preload_ids_todo) def get(self, bt_key: bytes) -> Optional[bytes]: - self._fill_if_empty({bt_key}) + if bt_key in self._mutations.keys(): + return self._mutations[bt_key][1] if self._value_cache is not None: + self._fill_if_empty({bt_key}) if bt_key in self._value_cache.keys(): return self._value_cache[bt_key] - else: - return self._mutations.get(bt_key, (None, None))[1] return None def set(self, bt_key: bytes, value: Optional[bytes]) -> None: @@ -227,14 +233,13 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: + mutations = key_set.intersection(self._mutations.keys()) + found = any(mut[1] is not None for mut in mutations) + if found: + return True if self._value_cache is not None: self._fill_if_empty(key_set) return not self._value_cache.keys().isdisjoint(key_set) - else: - mutations = key_set.intersection(self._mutations.keys()) - found = any(mut[1] is not None for mut in mutations) - if found: - return True return None def flush_if_timer_over(self, tp: TP) -> bool: From 67ead3da77e9abe4adf7569e384ecbdf570453b9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 20 Dec 2022 16:28:40 +0100 Subject: [PATCH 310/616] only return true if cache really contains value --- faust/stores/bigtable.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b9ef74c6d..67335c5ba 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -298,7 +298,6 @@ def _init_value_cache( def _init_mutation_buffer(self, options): self._mut_freq = options.get(BigTableStore.BT_MUTATION_FREQ_KEY, 0) # To prevent that all tables write at the same time - random_start_offset = random.randint(0, self._mut_freq) self._last_flush = ( {} ) # time.time() + self._mut_freq - random_start_offset @@ -407,7 +406,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_contains(self, key: bytes) -> bool: cache_contains = self._cache.contains(key) - if cache_contains is not None: + if cache_contains is True: return cache_contains row = self.bt_table.read_row(key, filter_=self.row_filter) @@ -419,7 +418,7 @@ def _bigtable_contains(self, key: bytes) -> bool: def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: cache_contains = self._cache.contains_any(keys) - if cache_contains is not None: + if cache_contains is True: return cache_contains rows = BT.RowSet() @@ -638,7 +637,10 @@ def _iterkeys(self) -> Iterator[bytes]: self._cache._value_cache[row.row_key] = data preload_id = self._cache._preload_id_from_key(row.row_key) self._cache._finished_preloads.add(preload_id) - yield self._remove_partition_prefix(row.row_key) + partition = row.row_key[0] + key = self._remove_partition_prefix(row.row_key) + self._cache.set_partition(key, partition) + yield key end = time.time() self.log.info( From c7ede392852eb097be2afd8012c5837b7033fb73 Mon Sep 17 00:00:00 2001 From: Alexander Oberegger Date: Thu, 22 Dec 2022 18:09:56 +0100 Subject: [PATCH 311/616] first version of global global tables --- faust/assignor/partition_assignor.py | 5 ++++- faust/tables/base.py | 2 ++ tests/unit/stores/test_rocksdb.py | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/faust/assignor/partition_assignor.py b/faust/assignor/partition_assignor.py index be19074f4..a52966ab8 100644 --- a/faust/assignor/partition_assignor.py +++ b/faust/assignor/partition_assignor.py @@ -327,7 +327,10 @@ def _global_table_standby_assignments( assignment.actives.get(changelog_topic_name, []) ) # Only add those partitions as standby which aren't active - standby_partitions = all_partitions - active_partitions + if not table.is_global_global: + standby_partitions = all_partitions - active_partitions + else: + standby_partitions = all_partitions assignment.standbys[changelog_topic_name] = list(standby_partitions) # We add all_partitions as active so they are recovered # in the beginning. diff --git a/faust/tables/base.py b/faust/tables/base.py index b2e11cdd9..f5955a582 100644 --- a/faust/tables/base.py +++ b/faust/tables/base.py @@ -122,6 +122,7 @@ def __init__( use_partitioner: bool = False, on_window_close: Optional[WindowCloseCallback] = None, is_global: bool = False, + is_global_global: bool = False, **kwargs: Any, ) -> None: Service.__init__(self, **kwargs) @@ -144,6 +145,7 @@ def __init__( self._on_window_close = on_window_close self.last_closed_window = 0.0 self.is_global = is_global + self.is_global_global = is_global_global assert self.recovery_buffer_size > 0 and self.standby_buffer_size > 0 self.options = options diff --git a/tests/unit/stores/test_rocksdb.py b/tests/unit/stores/test_rocksdb.py index bde79685b..f51ed3085 100644 --- a/tests/unit/stores/test_rocksdb.py +++ b/tests/unit/stores/test_rocksdb.py @@ -280,6 +280,7 @@ def test__get__has_event(self, *, store, current_event): db.get.return_value = b"value" store.table = Mock(name="table") store.table.is_global = False + store.table.is_global_global = False store.table.use_partitioner = False assert store._get(b"key") == b"value" @@ -312,6 +313,7 @@ def test__get__has_event_value_diff_partition(self, *, store, current_event): store.table = Mock(name="table") store.table.is_global = False + store.table.is_global_global = False store.table.use_partitioner = False # A _get call from a stream, to a non-global, non-partitioner, table @@ -320,6 +322,14 @@ def test__get__has_event_value_diff_partition(self, *, store, current_event): assert store._get(b"key") is None store.table.is_global = True + store.table.is_global_global = True + store.table.use_partitioner = False + + # A global table ignores the event partition and pulls from the proper db + assert store._get(b"key") == b"value" + + store.table.is_global = False + store.table.is_global_global = True store.table.use_partitioner = False # A global table ignores the event partition and pulls from the proper db @@ -602,6 +612,11 @@ def test__dbs_for_actives(self, *, store, table): table.is_global = True assert list(store._dbs_for_actives()) == [dbs[1], dbs[2], dbs[3]] + # Global Global Table + table.is_global = True + table.is_global_global = True + assert list(store._dbs_for_actives()) == [dbs[1], dbs[2], dbs[3]] + def test__size(self, *, store): dbs = self._setup_keys( db1=[ From 60e00ab525fbc4745a1b0c428a0e2456e8973a4b Mon Sep 17 00:00:00 2001 From: Alexander Oberegger Date: Thu, 22 Dec 2022 18:13:15 +0100 Subject: [PATCH 312/616] add GlobalGlobalTable Object --- faust/app/base.py | 50 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/faust/app/base.py b/faust/app/base.py index d564469b7..5111b7f69 100644 --- a/faust/app/base.py +++ b/faust/app/base.py @@ -1227,6 +1227,56 @@ def GlobalTable( ) return cast(GlobalTableT, gtable.using_window(window) if window else gtable) + def GlobalGlobalTable( + self, + name: str, + *, + default: Callable[[], Any] = None, + window: Optional[WindowT] = None, + partitions: Optional[int] = None, + help: Optional[str] = None, + **kwargs: Any, + ) -> GlobalTableT: + """Define new global table. + + Arguments: + name: Name used for global table, note that two global tables + living in the same application cannot have the same name. + + default: A callable, or type that will return a default valu + for keys missing in this global table. + window: A windowing strategy to wrap this window in. + + Examples: + >>> gtable = app.GlobalTable('user_to_amount', default=int) + >>> gtable['George'] + 0 + >>> gtable['Elaine'] += 1 + >>> gtable['Elaine'] += 1 + >>> gtable['Elaine'] + 2 + """ + gtable = self.tables.add( + cast( + GlobalTableT, + self.conf.GlobalTable( # type: ignore + self, + name=name, + default=default, + beacon=self.tables.beacon, + partitions=partitions, + # we want to apply standby changes + # as they come min (using 1 buffer size). + standby_buffer_size=1, + is_global=True, + is_global_global=True, + help=help, + **kwargs, + ), + ) + ) + return cast(GlobalTableT, gtable.using_window(window) if window else gtable) + def SetTable( self, name: str, From 6be5619fe1009353d54509fc23b2450d607639ff Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Dec 2022 09:22:36 +0100 Subject: [PATCH 313/616] removed contains from get --- faust/stores/bigtable.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 67335c5ba..e99ed3072 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,6 +1,5 @@ """BigTable storage.""" import logging -import random import time import traceback from collections import deque @@ -393,7 +392,7 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes) -> Optional[bytes]: - if self._cache.contains(key): + if self._cache.contains(key) is True: return self._cache.get(key) else: res = self.bt_table.read_row(key, filter_=self.row_filter) @@ -412,8 +411,6 @@ def _bigtable_contains(self, key: bytes) -> bool: row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: return True - # Just to be sure - self._cache.delete(key) return False def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: From 32c9ff5790a98a4016feb25572bf2480c0d3f8a1 Mon Sep 17 00:00:00 2001 From: Alexander Oberegger Date: Fri, 23 Dec 2022 11:57:45 +0100 Subject: [PATCH 314/616] have global global as attribute of global table --- faust/app/base.py | 51 +---------------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/faust/app/base.py b/faust/app/base.py index 5111b7f69..a4ed0f187 100644 --- a/faust/app/base.py +++ b/faust/app/base.py @@ -1220,56 +1220,7 @@ def GlobalTable( # as they come min (using 1 buffer size). standby_buffer_size=1, is_global=True, - help=help, - **kwargs, - ), - ) - ) - return cast(GlobalTableT, gtable.using_window(window) if window else gtable) - - def GlobalGlobalTable( - self, - name: str, - *, - default: Callable[[], Any] = None, - window: Optional[WindowT] = None, - partitions: Optional[int] = None, - help: Optional[str] = None, - **kwargs: Any, - ) -> GlobalTableT: - """Define new global table. - - Arguments: - name: Name used for global table, note that two global tables - living in the same application cannot have the same name. - - default: A callable, or type that will return a default valu - for keys missing in this global table. - window: A windowing strategy to wrap this window in. - - Examples: - >>> gtable = app.GlobalTable('user_to_amount', default=int) - >>> gtable['George'] - 0 - >>> gtable['Elaine'] += 1 - >>> gtable['Elaine'] += 1 - >>> gtable['Elaine'] - 2 - """ - gtable = self.tables.add( - cast( - GlobalTableT, - self.conf.GlobalTable( # type: ignore - self, - name=name, - default=default, - beacon=self.tables.beacon, - partitions=partitions, - # we want to apply standby changes - # as they come min (using 1 buffer size). - standby_buffer_size=1, - is_global=True, - is_global_global=True, + is_global_global=False, help=help, **kwargs, ), From 310169fe556cb04062d844c9ea75f6dc13362a4e Mon Sep 17 00:00:00 2001 From: Alexander Oberegger Date: Fri, 23 Dec 2022 12:44:20 +0100 Subject: [PATCH 315/616] use use_partitioner also for having all partitions as standby --- faust/assignor/partition_assignor.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/faust/assignor/partition_assignor.py b/faust/assignor/partition_assignor.py index a52966ab8..b01fdce54 100644 --- a/faust/assignor/partition_assignor.py +++ b/faust/assignor/partition_assignor.py @@ -326,11 +326,19 @@ def _global_table_standby_assignments( active_partitions = set( assignment.actives.get(changelog_topic_name, []) ) - # Only add those partitions as standby which aren't active - if not table.is_global_global: - standby_partitions = all_partitions - active_partitions - else: + + # if we use_partitioner it could happen that we write in Worker A + # to a partitions which is not active in Worker A but active in Worker B. + # To let Worker B consume the update we have to have all_partitions as + # standbys as well. + # A similar situation is happening if Global tables are shared over + # multiple consumer groups. Consumer group A could write to the table + # and consumer group B, C, D only consuming. With the global_global flag it's + # possible to have shared state over multiple consumer groups. + if table.is_global_global or self.table.use_partitioner: standby_partitions = all_partitions + else: # Only add those partitions as standby which aren't active + standby_partitions = all_partitions - active_partitions assignment.standbys[changelog_topic_name] = list(standby_partitions) # We add all_partitions as active so they are recovered # in the beginning. From 242f31cf3307b01105bc553137110362492b5c44 Mon Sep 17 00:00:00 2001 From: Alexander Oberegger Date: Wed, 4 Jan 2023 12:39:36 +0100 Subject: [PATCH 316/616] adjust to upstream changes --- faust/app/base.py | 2 +- faust/assignor/partition_assignor.py | 22 +++++++++++++--------- faust/tables/base.py | 6 ++++-- tests/unit/stores/test_rocksdb.py | 8 ++++---- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/faust/app/base.py b/faust/app/base.py index a4ed0f187..ff6c32b8f 100644 --- a/faust/app/base.py +++ b/faust/app/base.py @@ -1220,7 +1220,7 @@ def GlobalTable( # as they come min (using 1 buffer size). standby_buffer_size=1, is_global=True, - is_global_global=False, + synchronize_all_active_partitions=False, help=help, **kwargs, ), diff --git a/faust/assignor/partition_assignor.py b/faust/assignor/partition_assignor.py index b01fdce54..1655f8bec 100644 --- a/faust/assignor/partition_assignor.py +++ b/faust/assignor/partition_assignor.py @@ -328,16 +328,20 @@ def _global_table_standby_assignments( ) # if we use_partitioner it could happen that we write in Worker A - # to a partitions which is not active in Worker A but active in Worker B. - # To let Worker B consume the update we have to have all_partitions as - # standbys as well. - # A similar situation is happening if Global tables are shared over - # multiple consumer groups. Consumer group A could write to the table - # and consumer group B, C, D only consuming. With the global_global flag it's - # possible to have shared state over multiple consumer groups. - if table.is_global_global or self.table.use_partitioner: + # to a partitions which is not active in Worker A but active in + # Worker B. To let Worker B consume the update we have to have + # all_partitions as standbys as well. + # A similar situation is happening if Global tables are shared + # over multiple consumer groups. Consumer group A could write to + # the table and consumer group B, C, D only consuming. With the + # synchronize_all_active_partitions flag it's possible to have + # shared state over multiple consumer groups. + if ( + table.synchronize_all_active_partitions + or self.table.use_partitioner + ): standby_partitions = all_partitions - else: # Only add those partitions as standby which aren't active + else: # Only add those partitions as standby which aren't active standby_partitions = all_partitions - active_partitions assignment.standbys[changelog_topic_name] = list(standby_partitions) # We add all_partitions as active so they are recovered diff --git a/faust/tables/base.py b/faust/tables/base.py index f5955a582..45f7f63f2 100644 --- a/faust/tables/base.py +++ b/faust/tables/base.py @@ -122,7 +122,7 @@ def __init__( use_partitioner: bool = False, on_window_close: Optional[WindowCloseCallback] = None, is_global: bool = False, - is_global_global: bool = False, + synchronize_all_active_partitions: bool = False, **kwargs: Any, ) -> None: Service.__init__(self, **kwargs) @@ -145,7 +145,9 @@ def __init__( self._on_window_close = on_window_close self.last_closed_window = 0.0 self.is_global = is_global - self.is_global_global = is_global_global + self.synchronize_all_active_partitions = synchronize_all_active_partitions + if self.synchronize_all_active_partitions: + assert self.is_global assert self.recovery_buffer_size > 0 and self.standby_buffer_size > 0 self.options = options diff --git a/tests/unit/stores/test_rocksdb.py b/tests/unit/stores/test_rocksdb.py index f51ed3085..a10cd0816 100644 --- a/tests/unit/stores/test_rocksdb.py +++ b/tests/unit/stores/test_rocksdb.py @@ -280,7 +280,7 @@ def test__get__has_event(self, *, store, current_event): db.get.return_value = b"value" store.table = Mock(name="table") store.table.is_global = False - store.table.is_global_global = False + store.table.synchronize_all_active_partitions = False store.table.use_partitioner = False assert store._get(b"key") == b"value" @@ -313,7 +313,7 @@ def test__get__has_event_value_diff_partition(self, *, store, current_event): store.table = Mock(name="table") store.table.is_global = False - store.table.is_global_global = False + store.table.synchronize_all_active_partitions = False store.table.use_partitioner = False # A _get call from a stream, to a non-global, non-partitioner, table @@ -322,7 +322,7 @@ def test__get__has_event_value_diff_partition(self, *, store, current_event): assert store._get(b"key") is None store.table.is_global = True - store.table.is_global_global = True + store.table.synchronize_all_active_partitions = True store.table.use_partitioner = False # A global table ignores the event partition and pulls from the proper db @@ -614,7 +614,7 @@ def test__dbs_for_actives(self, *, store, table): # Global Global Table table.is_global = True - table.is_global_global = True + table.synchronize_all_active_partitions = True assert list(store._dbs_for_actives()) == [dbs[1], dbs[2], dbs[3]] def test__size(self, *, store): From fab14ecba2d0ef1f0e1b34a63d2ca4dc289be98d Mon Sep 17 00:00:00 2001 From: Marco Julian Moser Date: Tue, 10 Jan 2023 13:35:11 +0100 Subject: [PATCH 317/616] fixed synchornized_all_active_partitions attribute handling in GlobalTable Constructor --- faust/app/base.py | 3 ++- faust/assignor/partition_assignor.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/app/base.py b/faust/app/base.py index ff6c32b8f..0a5bb310c 100644 --- a/faust/app/base.py +++ b/faust/app/base.py @@ -1186,6 +1186,7 @@ def GlobalTable( window: Optional[WindowT] = None, partitions: Optional[int] = None, help: Optional[str] = None, + synchronize_all_active_partitions: Optional[bool] = False, **kwargs: Any, ) -> GlobalTableT: """Define new global table. @@ -1220,7 +1221,7 @@ def GlobalTable( # as they come min (using 1 buffer size). standby_buffer_size=1, is_global=True, - synchronize_all_active_partitions=False, + synchronize_all_active_partitions=synchronize_all_active_partitions, help=help, **kwargs, ), diff --git a/faust/assignor/partition_assignor.py b/faust/assignor/partition_assignor.py index 1655f8bec..8cdab9bef 100644 --- a/faust/assignor/partition_assignor.py +++ b/faust/assignor/partition_assignor.py @@ -326,7 +326,7 @@ def _global_table_standby_assignments( active_partitions = set( assignment.actives.get(changelog_topic_name, []) ) - + # if we use_partitioner it could happen that we write in Worker A # to a partitions which is not active in Worker A but active in # Worker B. To let Worker B consume the update we have to have From 92d7a233c0d5b265abce5a63fc35263c2b720320 Mon Sep 17 00:00:00 2001 From: Moser Marco Julian Date: Wed, 25 Jan 2023 11:32:15 +0100 Subject: [PATCH 318/616] changed self.table to table --- faust/assignor/partition_assignor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/assignor/partition_assignor.py b/faust/assignor/partition_assignor.py index 8cdab9bef..fb8ac4fb6 100644 --- a/faust/assignor/partition_assignor.py +++ b/faust/assignor/partition_assignor.py @@ -338,7 +338,7 @@ def _global_table_standby_assignments( # shared state over multiple consumer groups. if ( table.synchronize_all_active_partitions - or self.table.use_partitioner + or table.use_partitioner ): standby_partitions = all_partitions else: # Only add those partitions as standby which aren't active From 3785ef529c97a01276b05e732410b4e00304daa9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 6 Feb 2023 08:02:07 +0100 Subject: [PATCH 319/616] added additional log --- faust/stores/bigtable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e99ed3072..2fbfd6425 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -171,6 +171,10 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): return start = time.time() + self.log.info( + "BigTableStore: Start fill for table" + f"{self.bt_table.name}:{preload_ids_todo}" + ) if self._value_cache is not None: row_set, row_filter = self._get_preload_rowset_and_filter( From 2415bff6d7bc5afd9d6cb0ed952625e88a1dfef9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 7 Feb 2023 10:08:23 +0100 Subject: [PATCH 320/616] removed crash from app --- faust/stores/bigtable.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2fbfd6425..fdeaaf545 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -748,10 +748,9 @@ def set_persisted_offset( except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" - " -> will crash faust app! " + " -> will cause additional changelogs if restart happens" f"TRACEBACK: {traceback.format_exc()}" ) - self.app._crash(e) def _persist_changelog_batch(self, row_mutations, tp_offsets): response = self.bt_table.mutate_rows(row_mutations) From b90a6fc03fe03aa1d4197d33ac6c3dc0cd56a804 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 7 Feb 2023 10:21:45 +0100 Subject: [PATCH 321/616] added log for iterkeys --- faust/stores/bigtable.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fdeaaf545..1c3c71691 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -615,6 +615,9 @@ def _iterkeys(self) -> Iterator[bytes]: start = time.time() partitions = self._active_partitions() + self.log.info( + f"Start iterkeys for {self.table_name}" + ) row_set = BT.RowSet() for partition in partitions: prefix_start = self._get_partition_prefix(partition) From 9ce63f21b3499da1bc1c6a5e39646563a5c9e041 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 3 Mar 2023 10:12:26 +0100 Subject: [PATCH 322/616] added log to check if rebalancing happens --- faust/stores/rocksdb.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/rocksdb.py b/faust/stores/rocksdb.py index e7150d120..2fadc9648 100644 --- a/faust/stores/rocksdb.py +++ b/faust/stores/rocksdb.py @@ -350,6 +350,7 @@ async def on_rebalance( """ self.rebalance_ack = False async with self.db_lock: + self.logger.error("AAAAH, we do some rebalancing") self.revoke_partitions(self.table, revoked) await self.assign_partitions(self.table, newly_assigned, generation_id) From bac0e9281870e0c70cb57fce8307f45c5ed0d34d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 7 Mar 2023 14:57:29 +0100 Subject: [PATCH 323/616] added on rebalance to faust bigtable-store --- faust/stores/bigtable.py | 144 ++++++++++++++++++++++----------------- 1 file changed, 82 insertions(+), 62 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1c3c71691..548699a57 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,19 +1,12 @@ """BigTable storage.""" +import asyncio +import gc import logging import time import traceback from collections import deque -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - Optional, - Set, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union + from google.cloud.bigtable.row_filters import ( RowFilterChain, RowFilterUnion, @@ -98,9 +91,7 @@ def _maybe_ttl_clear(self): now = int(time.time()) if now > self.init_ts + self.ttl: self.data = {} - self.log.info( - "BigTableStore: Cleard startupcache because TTL is over" - ) + self.log.info("BigTableStore: Cleard startupcache because TTL is over") self.ttl_over = True def keys(self): @@ -155,9 +146,7 @@ def _get_preload_rowset_and_filter(self, preload_ids): if len(filters) > 1: row_filter = RowFilterUnion(filters=filters) elif len(filters) == 1: - row_filter = RowFilterChain( - [CellsColumnLimitFilter(1), filters[0]] - ) + row_filter = RowFilterChain([CellsColumnLimitFilter(1), filters[0]]) else: row_filter = CellsColumnLimitFilter(1) return row_set, row_filter @@ -177,12 +166,8 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): ) if self._value_cache is not None: - row_set, row_filter = self._get_preload_rowset_and_filter( - preload_ids_todo - ) - for row in self.bt_table.read_rows( - row_set=row_set, filter_=row_filter - ): + row_set, row_filter = self._get_preload_rowset_and_filter(preload_ids_todo) + for row in self.bt_table.read_rows(row_set=row_set, filter_=row_filter): if row.row_key in self._mutations.keys(): mutation_val = self._mutations[row.row_key][1] if mutation_val is not None: @@ -290,9 +275,7 @@ def _init_value_cache( enable = options.get(BigTableStore.VALUE_CACHE_ENABLE_KEY, False) if enable: # TODO Maybe we need to remove invalidation time and size - ttl = options.get( - BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1 - ) + ttl = options.get(BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1) size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) self._value_cache = BigTableValueCache(ttl=ttl, size=size) else: @@ -301,12 +284,8 @@ def _init_value_cache( def _init_mutation_buffer(self, options): self._mut_freq = options.get(BigTableStore.BT_MUTATION_FREQ_KEY, 0) # To prevent that all tables write at the same time - self._last_flush = ( - {} - ) # time.time() + self._mut_freq - random_start_offset - self._max_mutations = options.get( - BigTableStore.BT_MAX_MUTATIONS, 10000 - ) + self._last_flush = {} # time.time() + self._mut_freq - random_start_offset + self._max_mutations = options.get(BigTableStore.BT_MAX_MUTATIONS, 10000) self._mutations = {} @@ -318,6 +297,7 @@ class BigTableStore(base.SerializedStore): bt_table: BT.Table _cache: BigTableCacheManager partition_prefix = b"__" + _db_lock: asyncio.Lock BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_INSTANCE_KEY = "bt_instance_key" @@ -349,14 +329,14 @@ def __init__( logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) + self.db_lock = asyncio.Lock() + self.rebalance_ack = False def _set_options(self, options) -> None: self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) - self.column_name = options.get( - BigTableStore.BT_COLUMN_NAME_KEY, "DATA" - ) + self.column_name = options.get(BigTableStore.BT_COLUMN_NAME_KEY, "DATA") self.row_filter = BT.CellsColumnLimitFilter(1) self.offset_key_prefix = options.get( BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" @@ -380,9 +360,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): ) self.bt_table.create( column_families={ - self.column_family_id: BT.column_family.MaxVersionsGCRule( - 1 - ) + self.column_family_id: BT.column_family.MaxVersionsGCRule(1) } ) else: @@ -456,9 +434,7 @@ def _bigtable_get_range( # Not found return None, None - def _bigtable_set( - self, key: bytes, value: Optional[bytes], persist_offset=False - ): + def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False): if not persist_offset: # All mutatations set here will be flushed to BT later self._cache.set(key, value) @@ -493,9 +469,7 @@ def _get_partition_prefix(self, partition: int) -> bytes: return b"".join([partition_bytes, self.partition_prefix]) def _remove_partition_prefix(self, key: bytes) -> bytes: - slice_from = key.find(self.partition_prefix) + len( - self.partition_prefix - ) + slice_from = key.find(self.partition_prefix) + len(self.partition_prefix) return key[slice_from:] def _get_key_with_partition(self, key: bytes, partition: int) -> bytes: @@ -536,9 +510,7 @@ def _get(self, key: bytes) -> Optional[bytes]: return value return None except KeyError as ke: - self.log.error( - f"KeyError in get for table {self.table_name} for {key=}" - ) + self.log.error(f"KeyError in get for table {self.table_name} for {key=}") raise ke except Exception as ex: self.log.error( @@ -549,9 +521,7 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: partition = get_current_partition() - key_with_partition = self._get_key_with_partition( - key, partition=partition - ) + key_with_partition = self._get_key_with_partition(key, partition=partition) self._bigtable_set(key_with_partition, value) self._cache.set_partition(key, partition) except Exception as ex: @@ -615,9 +585,7 @@ def _iterkeys(self) -> Iterator[bytes]: start = time.time() partitions = self._active_partitions() - self.log.info( - f"Start iterkeys for {self.table_name}" - ) + self.log.info(f"Start iterkeys for {self.table_name}") row_set = BT.RowSet() for partition in partitions: prefix_start = self._get_partition_prefix(partition) @@ -647,9 +615,7 @@ def _iterkeys(self) -> Iterator[bytes]: yield key end = time.time() - self.log.info( - f"Finished iterkeys for {self.table_name} in {end - start}s" - ) + self.log.info(f"Finished iterkeys for {self.table_name} in {end - start}s") except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -731,9 +697,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: return offset return None - def set_persisted_offset( - self, tp: TP, offset: int, recovery=False - ) -> None: + def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: """Set the last persisted offset for this table. This will remember the last offset that we wrote to BigTableStore, @@ -787,9 +751,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - offset_key = self._get_key_with_partition( - msg.key, partition=tp.partition - ) + offset_key = self._get_key_with_partition(msg.key, partition=tp.partition) row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() @@ -828,3 +790,61 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") + + + def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: + """De-assign partitions used on this worker instance. + + Arguments: + table: The table that we store data for. + tps: Set of topic partitions that we should no longer + be serving data for. + """ + for tp in tps: + if tp.topic in table.changelog_topic.topics: + db = self._dbs.pop(tp.partition, None) + if db is not None: + self.logger.info(f"closing db {tp.topic} partition {tp.partition}") + # db.close() + gc.collect() + + async def assign_partitions( + self, table: CollectionT, tps: Set[TP], generation_id: int = 0 + ) -> None: + """Assign partitions to this worker instance. + + Arguments: + table: The table that we store data for. + tps: Set of topic partitions we have been assigned. + """ + self.rebalance_ack = True + standby_tps = self.app.assignor.assigned_standbys() + my_topics = table.changelog_topic.topics + for tp in tps: + if tp.topic in my_topics and tp not in standby_tps and self.rebalance_ack: + await self._try_open_db_for_partition( + tp.partition, generation_id=generation_id + ) + await asyncio.sleep(0) + + async def on_rebalance( + self, + assigned: Set[TP], + revoked: Set[TP], + newly_assigned: Set[TP], + generation_id: int = 0, + ) -> None: + """Rebalance occurred. + + Arguments: + assigned: Set of all assigned topic partitions. + revoked: Set of newly revoked topic partitions. + newly_assigned: Set of newly assigned topic partitions, + for which we were not assigned the last time. + generation_id: the metadata generation identifier for the re-balance + """ + self.rebalance_ack = False + async with self._db_lock: + self.logger.info(f"Rebalancing {revoked=}, {newly_assigned=}") + self.revoke_partitions(self.table, revoked) + await self.assign_partitions(self.table, newly_assigned, generation_id) From 076c6262406aa81d6a7dd07d9a8c435cddd78087 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 7 Mar 2023 15:03:36 +0100 Subject: [PATCH 324/616] removed logging from rocksb --- faust/stores/rocksdb.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/stores/rocksdb.py b/faust/stores/rocksdb.py index 2fadc9648..e7150d120 100644 --- a/faust/stores/rocksdb.py +++ b/faust/stores/rocksdb.py @@ -350,7 +350,6 @@ async def on_rebalance( """ self.rebalance_ack = False async with self.db_lock: - self.logger.error("AAAAH, we do some rebalancing") self.revoke_partitions(self.table, revoked) await self.assign_partitions(self.table, newly_assigned, generation_id) From 5e70564e2ad16e3f25bd697fb20c0ce6f518b946 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 7 Mar 2023 15:29:39 +0100 Subject: [PATCH 325/616] fixed typo with db_lock in bitable --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 548699a57..6188f8210 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -329,7 +329,7 @@ def __init__( logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) - self.db_lock = asyncio.Lock() + self._db_lock = asyncio.Lock() self.rebalance_ack = False def _set_options(self, options) -> None: From d3062def1b531b3a5d8acee7c3f593d0e0111f42 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 7 Mar 2023 17:05:10 +0100 Subject: [PATCH 326/616] different aproach on rebalance --- faust/stores/bigtable.py | 54 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6188f8210..82fccb1c3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -269,6 +269,15 @@ def _set_mutation(self, bt_key: bytes, value: Optional[bytes]): ) self._mutations[bt_key] = row, value + def delete_partition(self, partition: int): + if self._value_cache is not None: + keys = set(self._value_cache.keys()) + for k in keys: + if k[0] == partition: + del self._value_cache[k] + self._mutations.pop(k, None) + self._partition_cache.pop(k[1:], None) + def _init_value_cache( self, options ) -> Optional[Union[LRUCache, BigTableValueCache]]: @@ -333,6 +342,7 @@ def __init__( self.rebalance_ack = False def _set_options(self, options) -> None: + self._all_options = options self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) @@ -791,8 +801,7 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") - - def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: + def revoke_partitions(self, tps: Set[TP]) -> None: """De-assign partitions used on this worker instance. Arguments: @@ -801,31 +810,9 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: be serving data for. """ for tp in tps: - if tp.topic in table.changelog_topic.topics: - db = self._dbs.pop(tp.partition, None) - if db is not None: - self.logger.info(f"closing db {tp.topic} partition {tp.partition}") - # db.close() + self._cache.delete_partition(tp.partition) gc.collect() - async def assign_partitions( - self, table: CollectionT, tps: Set[TP], generation_id: int = 0 - ) -> None: - """Assign partitions to this worker instance. - - Arguments: - table: The table that we store data for. - tps: Set of topic partitions we have been assigned. - """ - self.rebalance_ack = True - standby_tps = self.app.assignor.assigned_standbys() - my_topics = table.changelog_topic.topics - for tp in tps: - if tp.topic in my_topics and tp not in standby_tps and self.rebalance_ack: - await self._try_open_db_for_partition( - tp.partition, generation_id=generation_id - ) - await asyncio.sleep(0) async def on_rebalance( self, @@ -845,6 +832,19 @@ async def on_rebalance( """ self.rebalance_ack = False async with self._db_lock: - self.logger.info(f"Rebalancing {revoked=}, {newly_assigned=}") - self.revoke_partitions(self.table, revoked) + self.logger.info( + f"BigTableStore: Rebalancing {revoked=}, {newly_assigned=}" + ) + self.revoke_partitions(revoked) await self.assign_partitions(self.table, newly_assigned, generation_id) + + async def assign_partitions( + self, table: CollectionT, tps: Set[TP], generation_id: int = 0 + ) -> None: + """Assign partitions to this worker instance. + + Arguments: + table: The table that we store data for. + tps: Set of topic partitions we have been assigned. + """ + self.rebalance_ack = True From 500e1dd180dcb3329c03a75c5f0f396a09896b0c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 8 Mar 2023 09:51:14 +0100 Subject: [PATCH 327/616] added tests for on rebalance and deleting partitions from cache --- tests/unit/stores/test_bigtable.py | 234 +++++++++++------------------ 1 file changed, 90 insertions(+), 144 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index f58dcf30e..64f8cf249 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1,8 +1,8 @@ import time from unittest.mock import MagicMock, call, patch -from mode.utils.collections import LRUCache import pytest +from mode.utils.collections import LRUCache import faust from faust.stores.bigtable import ( @@ -25,9 +25,7 @@ class RowSetMock: def __init__(self) -> None: self.keys = set() self.add_row_key = MagicMock(wraps=self._add_row_key) - self.add_row_range_from_keys = MagicMock( - wraps=self._add_row_range_from_keys - ) + self.add_row_range_from_keys = MagicMock(wraps=self._add_row_range_from_keys) def _add_row_key(self, key): self.keys.add(key) @@ -176,9 +174,7 @@ def test_iscomplete__init__(self): BigTableStore.VALUE_CACHE_ENABLE_KEY: True, } - test_manager = BigTableCacheManager( - MagicMock(), options, bigtable_mock - ) + test_manager = BigTableCacheManager(MagicMock(), options, bigtable_mock) assert test_manager.bt_table == bigtable_mock assert isinstance(test_manager._value_cache, BigTableValueCache) assert test_manager._mut_freq == 0 @@ -190,18 +186,14 @@ def test_iscomplete__init__(self): def bt_imports(self): with patch("faust.stores.bigtable.BT") as bt: bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - bt.column_family.MaxVersionsGCRule = MagicMock( - return_value="a_rule" - ) + bt.column_family.MaxVersionsGCRule = MagicMock(return_value="a_rule") bt.RowSet = MagicMock(return_value=RowSetMock()) yield bt @pytest.fixture() def manager(self, bt_imports): with patch("faust.stores.bigtable.BT", bt_imports): - with patch( - "faust.stores.bigtable.time.time", MagicMock(return_value=0) - ): + with patch("faust.stores.bigtable.time.time", MagicMock(return_value=0)): bigtable_mock = BigTableMock() app_mock = MagicMock() app_mock.conf = MagicMock() @@ -211,9 +203,7 @@ def manager(self, bt_imports): BigTableStore.VALUE_CACHE_ENABLE_KEY: True, BigTableStore.BT_MUTATION_FREQ_KEY: 600, } - manager = BigTableCacheManager( - MagicMock(), options, bigtable_mock - ) + manager = BigTableCacheManager(MagicMock(), options, bigtable_mock) manager._partition_cache = {} return manager @@ -277,9 +267,10 @@ def test_get(self, manager): manager._fill_if_empty.assert_called_with({key_not_in}) assert res is None + manager._fill_if_empty.reset_mock() manager._value_cache = None res = manager.get(key_in) - manager._fill_if_empty.assert_called_with({key_in}) + manager._fill_if_empty.assert_not_called() assert res is None def test_set(self, manager): @@ -332,16 +323,17 @@ def test_contains(self, manager): manager._fill_if_empty.assert_called_with({key_not_in}) manager._value_cache = None - assert manager.contains(key_in) is False - manager._fill_if_empty.assert_called_with({key_in}) - assert manager.contains(key_not_in) is False - manager._fill_if_empty.assert_called_with({key_not_in}) + manager._fill_if_empty.reset_mock() + assert manager.contains(key_in) is None + manager._fill_if_empty.assert_not_called() + assert manager.contains(key_not_in) is None + manager._fill_if_empty.assert_not_called() manager._mutations = {key_in: (key_in, key_in)} assert manager.contains(key_in) is True - manager._fill_if_empty.assert_called_with({key_in}) - assert manager.contains(key_not_in) is False - manager._fill_if_empty.assert_called_with({key_not_in}) + manager._fill_if_empty.assert_not_called() + assert manager.contains(key_not_in) is None + manager._fill_if_empty.assert_not_called() def test_contains_any(self, manager): # Adding the key here is sufficient, because the cache gets filled @@ -356,34 +348,29 @@ def test_contains_any(self, manager): manager._fill_if_empty.assert_called_with({key_not_in}) manager._value_cache = None - assert manager.contains_any({key_in, key_not_in}) is False - manager._fill_if_empty.assert_called_with({key_in, key_not_in}) - assert manager.contains_any({key_not_in}) is False - manager._fill_if_empty.assert_called_with({key_not_in}) + manager._fill_if_empty.reset_mock() + assert manager.contains_any({key_in, key_not_in}) is None + manager._fill_if_empty.assert_not_called() + assert manager.contains_any({key_not_in}) is None + manager._fill_if_empty.assert_not_called() manager._mutations = {key_in: (key_in, key_in)} assert manager.contains_any({key_in, key_not_in}) is True - manager._fill_if_empty.assert_called_with({key_in, key_not_in}) - assert manager.contains_any({key_not_in}) is False - manager._fill_if_empty.assert_called_with({key_not_in}) + manager._fill_if_empty.assert_not_called() + assert manager.contains_any({key_not_in}) is None + manager._fill_if_empty.assert_not_called() def test_flush_if_timer_over(self, manager): tp = TP("a_topic", partition=19) tp2 = TP("a_topic", partition=0) time.time = MagicMock(return_value=0) - manager.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(404)] - ) + manager.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(404)]) row_mock = MagicMock() row_mock.row_key = b"\x13AAA" - manager._mutations = { - row_mock.row_key: (row_mock, "some_row_mutation") - } + manager._mutations = {row_mock.row_key: (row_mock, "some_row_mutation")} - with patch( - "faust.stores.bigtable.time.time", MagicMock(return_value=0) - ): + with patch("faust.stores.bigtable.time.time", MagicMock(return_value=0)): assert manager.flush_if_timer_over(tp) is True assert manager._last_flush == {tp.partition: 0} assert manager.flush_if_timer_over(tp) is False @@ -407,9 +394,7 @@ def test_flush_if_timer_over(self, manager): assert manager.flush_if_timer_over(tp) is False manager._last_flush = {} - manager.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(0)] - ) + manager.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(0)]) with patch( "faust.stores.bigtable.time.time", @@ -423,17 +408,11 @@ def test_flush_if_timer_over_on_max_count(self, manager): tp = TP("a_topic", partition=19) row_mock = MagicMock() row_mock.row_key = b"\x13AAA" - manager._mutations = { - row_mock.row_key: (row_mock, "some_row_mutation") - } + manager._mutations = {row_mock.row_key: (row_mock, "some_row_mutation")} manager._max_mutations = 1 manager._last_flush = {tp.partition: 999999999999} - manager.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(0)] - ) - with patch( - "faust.stores.bigtable.time.time", MagicMock(return_value=0) - ): + manager.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(0)]) + with patch("faust.stores.bigtable.time.time", MagicMock(return_value=0)): assert manager.flush_if_timer_over(tp) is True def test_set_mutation(self, manager): @@ -458,25 +437,6 @@ def test_set_mutation(self, manager): assert manager._mutations[row_mock.row_key][1] is None assert len(manager._mutations) == 1 - # def test_iterkeys(self, manager): - # key_in = b"\x13AAA" - # manager.bt_table.add_test_data({key_in}) - # manager._fill_if_empty_and_yield = MagicMock( - # wraps=manager._fill_if_empty_and_yield - # ) - - # res = list(manager.iterkeys()) - # manager._fill_if_empty_and_yield.assert_not_called() - # assert res == [] # cache should not be filled yet - - # res = list(manager.iterkeys({key_in})) - # manager._fill_if_empty_and_yield.assert_called_once_with({key_in}) - # assert key_in in res - - # res = list(manager.iterkeys()) - # assert manager._fill_if_empty_and_yield.call_count == 1 - # assert key_in in res - def test_fill_if_empty_and_yield(self, manager): manager.bt_table.add_test_data({b"\x13AAA"}) @@ -490,6 +450,27 @@ def test_fill_if_empty_and_yield(self, manager): assert res == [] manager.bt_table.read_rows.assert_not_called() + def test_delete_partition(self, manager): + partition = 19 + row_mock = MagicMock() + row_mock.delete = MagicMock() + row_mock.set_cell = MagicMock() + row_mock.row_key = b"\x13AAA" + manager.bt_table.direct_row = MagicMock(return_value=row_mock) + manager.bt_table.add_test_data({b"\x13AAA"}) + manager.set(row_mock.row_key, row_mock) + manager.set_partition(row_mock.row_key[1:], partition) + manager.delete_partition(3) + assert len(manager._value_cache) == 1 + assert len(manager._mutations) == 1 + assert len(manager._partition_cache) == 1 + manager.delete_partition(partition) + assert len(manager._value_cache) == 0 + assert len(manager._mutations) == 0 + assert len(manager._partition_cache) == 0 + # Delete something that does not exist yet should not do anything + manager.delete_partition(999999) + class TestBigTableStore: TEST_KEY1 = b"TEST_KEY1" @@ -500,9 +481,7 @@ class TestBigTableStore: def bt_imports(self): with patch("faust.stores.bigtable.BT") as bt: bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - bt.column_family.MaxVersionsGCRule = MagicMock( - return_value="a_rule" - ) + bt.column_family.MaxVersionsGCRule = MagicMock(return_value="a_rule") bt.RowSet = MagicMock(return_value=RowSetMock()) yield bt @@ -544,9 +523,7 @@ def table_name_gen(table): return table.name[::-1] self_mock.table_name_generator = table_name_gen - self_mock.bt_table_name = self_mock.table_name_generator( - faust_table_mock - ) + self_mock.bt_table_name = self_mock.table_name_generator(faust_table_mock) client_mock = MagicMock() instance_mock = MagicMock() @@ -580,9 +557,7 @@ def table_name_gen(table): # Test with no existing table self_mock.reset_mock() self_mock.table_name_generator = table_name_gen - self_mock.bt_table_name = self_mock.table_name_generator( - faust_table_mock - ) + self_mock.bt_table_name = self_mock.table_name_generator(faust_table_mock) table_mock.exists = MagicMock(return_value=False) return_value = BigTableStore._bigtable_setup( self_mock, faust_table_mock, options @@ -613,9 +588,7 @@ def test_bigtable_bigtable_get_on_empty(self, store): return_value = store._bigtable_get(self.TEST_KEY1) store._cache.contains.assert_called_with(self.TEST_KEY1) store._cache.get.assert_not_called() - store.bt_table.read_row.assert_called_with( - self.TEST_KEY1, filter_="a_filter" - ) + store.bt_table.read_row.assert_called_with(self.TEST_KEY1, filter_="a_filter") assert return_value is None def test_bigtable_bigtable_get_cache_miss(self, store): @@ -662,9 +635,7 @@ def test_bigtable_get_range_cache_miss(self, store): def test_bigtable_get_range_cache_hit(self, store): store._cache.get = MagicMock(return_value="cache_res") - result_value = store._bigtable_get_range( - [self.TEST_KEY1, self.TEST_KEY3] - ) + result_value = store._bigtable_get_range([self.TEST_KEY1, self.TEST_KEY3]) store.bt_table.read_rows.assert_not_called assert result_value == (self.TEST_KEY1, "cache_res") @@ -674,17 +645,13 @@ def test_bigtable_contains(self, store): store.bt_table.add_test_data([self.TEST_KEY1]) return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_called_with( - self.TEST_KEY1, filter_="a_filter" - ) + store.bt_table.read_row.assert_called_with(self.TEST_KEY1, filter_="a_filter") store._cache.delete.assert_not_called() assert return_value is True return_value = store._bigtable_contains(self.TEST_KEY2) - store.bt_table.read_row.assert_called_with( - self.TEST_KEY2, filter_="a_filter" - ) - store._cache.delete.assert_called_with(self.TEST_KEY2) + store.bt_table.read_row.assert_called_with(self.TEST_KEY2, filter_="a_filter") + store._cache.delete.assert_not_called() store._cache.delete.reset_mock() store.bt_table.read_row.reset_mock() @@ -747,14 +714,10 @@ def test_bigtable_set(self, store): store._cache.set = MagicMock(return_value=None) store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) - store._bigtable_set( - self.TEST_KEY1, self.TEST_KEY1, persist_offset=True - ) + store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1, persist_offset=True) store.bt_table.direct_row.assert_called_once_with(self.TEST_KEY1) - store._cache.set.assert_called_once_with( - self.TEST_KEY1, self.TEST_KEY1 - ) + store._cache.set.assert_called_once_with(self.TEST_KEY1, self.TEST_KEY1) row_mock.set_cell.assert_called_once_with( store.column_family_id, store.column_name, @@ -822,29 +785,21 @@ def test_partitions_for_key(self, store): def test_get_with_known_partition(self, store): partition = 19 - store._maybe_get_partition_from_message = MagicMock( - return_value=partition - ) + store._maybe_get_partition_from_message = MagicMock(return_value=partition) store._cache.set_partition = MagicMock() # Scenario: Found store._bigtable_get = MagicMock(return_value=b"a_value") res = store._get(self.TEST_KEY1) - key_with_partition = store._get_key_with_partition( - self.TEST_KEY1, partition - ) + key_with_partition = store._get_key_with_partition(self.TEST_KEY1, partition) store._bigtable_get.assert_called_once_with(key_with_partition) - store._cache.set_partition.assert_called_once_with( - self.TEST_KEY1, partition - ) + store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, partition) assert res == b"a_value" store._cache.set_partition.reset_mock() # Scenario: Not Found store._bigtable_get = MagicMock(return_value=None) res = store._get(self.TEST_KEY1) - key_with_partition = store._get_key_with_partition( - self.TEST_KEY1, partition - ) + key_with_partition = store._get_key_with_partition(self.TEST_KEY1, partition) store._bigtable_get.assert_called_once_with(key_with_partition) store._cache.set_partition.assert_not_called() assert res is None @@ -860,9 +815,7 @@ def test_get_with_unknown_partition(self, store): # Scenario: Found key_of_value = store._get_key_with_partition(self.TEST_KEY1, 19) - store._bigtable_get_range = MagicMock( - return_value=(key_of_value, b"a_value") - ) + store._bigtable_get_range = MagicMock(return_value=(key_of_value, b"a_value")) res = store._get(self.TEST_KEY1) store._partitions_for_key.assert_called_once_with(self.TEST_KEY1) store._bigtable_get_range.assert_called_once_with(keys_searched) @@ -879,21 +832,13 @@ def test_get_with_unknown_partition(self, store): def test_set(self, store): partition = 19 - faust.stores.bigtable.get_current_partition = MagicMock( - return_value=partition - ) + faust.stores.bigtable.get_current_partition = MagicMock(return_value=partition) store._bigtable_set = MagicMock() store._cache.set_partition = MagicMock() store._set(self.TEST_KEY1, b"a_value") - key_with_partition = store._get_key_with_partition( - self.TEST_KEY1, partition - ) - store._bigtable_set.assert_called_once_with( - key_with_partition, b"a_value" - ) - store._cache.set_partition.assert_called_once_with( - self.TEST_KEY1, partition - ) + key_with_partition = store._get_key_with_partition(self.TEST_KEY1, partition) + store._bigtable_set.assert_called_once_with(key_with_partition, b"a_value") + store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, partition) def test_del(self, store): store._cache._partition_cache = {self.TEST_KEY1: 19} @@ -913,9 +858,7 @@ def test_active_partitions(self, store): TP("a_changelogtopic", 19), TP("a_different_chaneglogtopic", 19), ] - store.app.assignor.assigned_actives = MagicMock( - return_value=active_topics - ) + store.app.assignor.assigned_actives = MagicMock(return_value=active_topics) store.app.conf.topic_partitions = 20 store.table.changelog_topic_name = "a_changelogtopic" store.table.is_global = False @@ -947,6 +890,7 @@ def test_iteritems(self, store): def test_iterkeys(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) + store._cache._partition_cache.limit = 3 keys_in_store = [] keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) @@ -1073,9 +1017,7 @@ def test_set_persisted_offset(self, store): def test_persist_changelog_batch(self, store): # Scenario 1: no failure - store.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(0)] * 10 - ) + store.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(0)] * 10) store.log = MagicMock() store.log.error = MagicMock() store.set_persisted_offset = MagicMock() @@ -1087,12 +1029,8 @@ def test_persist_changelog_batch(self, store): tp2: 222, tp3: 333, } - store._persist_changelog_batch( - ["row1", "row2", "etc..."], offset_batch - ) - store.bt_table.mutate_rows.assert_called_with( - ["row1", "row2", "etc..."] - ) + store._persist_changelog_batch(["row1", "row2", "etc..."], offset_batch) + store.bt_table.mutate_rows.assert_called_with(["row1", "row2", "etc..."]) assert store.set_persisted_offset.call_count == len(offset_batch) store.set_persisted_offset.assert_called_with(tp3, 333, recovery=True) @@ -1101,12 +1039,8 @@ def test_persist_changelog_batch(self, store): # Scenario 2: all failure store.set_persisted_offset.reset_mock() store.bt_table.mutate_rows.reset_mock() - store.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(404)] - ) - store._persist_changelog_batch( - ["row1", "row2", "etc..."], offset_batch - ) + store.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(404)]) + store._persist_changelog_batch(["row1", "row2", "etc..."], offset_batch) # FIXME: I'm not sure if we want that behaviour. # Question: What should happen on a failed mutated row in recovery. store.set_persisted_offset.assert_called() @@ -1147,3 +1081,15 @@ def __init__(self, message): store._persist_changelog_batch.assert_called_once() tp_offsets = store._persist_changelog_batch.call_args_list[0].args[1] assert tp_offsets == {tp: 3, tp2: 4} + + + def test_revoke_partitions(self, store): + store._cache.delete_partition = MagicMock() + TP1 = MagicMock() + TP1.partition = 1 + TP2 = MagicMock() + TP2.partition = 2 + + store.revoke_partitions({TP1, TP2}) + store._cache.delete_partition.assert_any_call(1) + store._cache.delete_partition.assert_any_call(2) From 6a34b925e07056797f49e974adb764963c4e3a17 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 8 Mar 2023 10:13:49 +0100 Subject: [PATCH 328/616] removed new line --- tests/unit/stores/test_bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 64f8cf249..3309ba5e8 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1082,7 +1082,6 @@ def __init__(self, message): tp_offsets = store._persist_changelog_batch.call_args_list[0].args[1] assert tp_offsets == {tp: 3, tp2: 4} - def test_revoke_partitions(self, store): store._cache.delete_partition = MagicMock() TP1 = MagicMock() From 94f86dbecb58c77ade6c5414147245c0d3c7f2a7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 8 Mar 2023 10:24:53 +0100 Subject: [PATCH 329/616] removed assign partition, as this is not needed for bigtable --- faust/stores/bigtable.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 82fccb1c3..3923497eb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -830,21 +830,8 @@ async def on_rebalance( for which we were not assigned the last time. generation_id: the metadata generation identifier for the re-balance """ - self.rebalance_ack = False async with self._db_lock: self.logger.info( f"BigTableStore: Rebalancing {revoked=}, {newly_assigned=}" ) self.revoke_partitions(revoked) - await self.assign_partitions(self.table, newly_assigned, generation_id) - - async def assign_partitions( - self, table: CollectionT, tps: Set[TP], generation_id: int = 0 - ) -> None: - """Assign partitions to this worker instance. - - Arguments: - table: The table that we store data for. - tps: Set of topic partitions we have been assigned. - """ - self.rebalance_ack = True From 33cf7e7728cff370ec12b06da6d54b808c4c11ef Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 13 Mar 2023 18:05:44 +0100 Subject: [PATCH 330/616] added possibility for custom key translators --- faust/stores/bigtable.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3923497eb..1c26698b6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -91,7 +91,6 @@ def _maybe_ttl_clear(self): now = int(time.time()) if now > self.init_ts + self.ttl: self.data = {} - self.log.info("BigTableStore: Cleard startupcache because TTL is over") self.ttl_over = True def keys(self): @@ -160,11 +159,6 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): return start = time.time() - self.log.info( - "BigTableStore: Start fill for table" - f"{self.bt_table.name}:{preload_ids_todo}" - ) - if self._value_cache is not None: row_set, row_filter = self._get_preload_rowset_and_filter(preload_ids_todo) for row in self.bt_table.read_rows(row_set=row_set, filter_=row_filter): @@ -315,10 +309,10 @@ class BigTableStore(base.SerializedStore): BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" BT_MUTATION_FREQ_KEY = "bt_mutation_freq_key" BT_MAX_MUTATIONS = "bt_max_mutations" + BT_CUSTOM_KEY_TRANSLATOR_KEY = "bt_custom_key_translator" VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" - CUSTOM_CACHE_PARTITIONING_KEY = "custom_cache_partitioning_key" CACHE_PRELOAD_SUFFIX_LEN_KEY = "cache_preload_suffix_len_key" CACHE_PRELOAD_PREFIX_LEN_KEY = "cache_preload_prefix_len_key" @@ -341,7 +335,16 @@ def __init__( self._db_lock = asyncio.Lock() self.rebalance_ack = False + def default_translator(self, user_key): + return user_key + def _set_options(self, options) -> None: + self._transform_key_to_bt = options.get( + BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY[0], self.default_translator + ) + self._transform_key_from_bt = options.get( + BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY[1], self.default_translator + ) self._all_options = options self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name @@ -480,9 +483,11 @@ def _get_partition_prefix(self, partition: int) -> bytes: def _remove_partition_prefix(self, key: bytes) -> bytes: slice_from = key.find(self.partition_prefix) + len(self.partition_prefix) - return key[slice_from:] + key = key[slice_from:] + return self._transform_key_from_bt(key) def _get_key_with_partition(self, key: bytes, partition: int) -> bytes: + key = self._transform_key_to_bt(key) prefix = self._get_partition_prefix(partition) key = b"".join([prefix, key]) return key From c866363dc75465b97d8fab68b399e9570fea51e7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 09:51:42 +0100 Subject: [PATCH 331/616] removed option for partition prefix. now partition is just prepended --- faust/stores/bigtable.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1c26698b6..0ede0b365 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -299,7 +299,6 @@ class BigTableStore(base.SerializedStore): instance: BT.Instance bt_table: BT.Table _cache: BigTableCacheManager - partition_prefix = b"__" _db_lock: asyncio.Lock BT_COLUMN_NAME_KEY = "bt_column_name_key" @@ -335,7 +334,8 @@ def __init__( self._db_lock = asyncio.Lock() self.rebalance_ack = False - def default_translator(self, user_key): + @staticmethod + def default_translator(user_key): return user_key def _set_options(self, options) -> None: @@ -479,11 +479,10 @@ def _maybe_get_partition_from_message(self) -> Optional[int]: def _get_partition_prefix(self, partition: int) -> bytes: partition_bytes = partition.to_bytes(1, "little") - return b"".join([partition_bytes, self.partition_prefix]) + return b"".join([partition_bytes]) def _remove_partition_prefix(self, key: bytes) -> bytes: - slice_from = key.find(self.partition_prefix) + len(self.partition_prefix) - key = key[slice_from:] + key = key[1:] return self._transform_key_from_bt(key) def _get_key_with_partition(self, key: bytes, partition: int) -> bytes: From aacf820823ece92edd3831540fba218505444cca Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 13:20:22 +0100 Subject: [PATCH 332/616] fixed tests --- tests/unit/stores/test_bigtable.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 3309ba5e8..b2a44e60a 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -213,35 +213,34 @@ def test_fill_if_empty(self, manager): # Scenario 1: Everything empty manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13_***_"} + assert manager._finished_preloads == {b"\x13"} manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13_***_"} + assert manager._finished_preloads == {b"\x13"} manager._fill_if_empty({b"\x10XXX"}) assert manager.bt_table.read_rows.call_count == 2 - assert manager._finished_preloads == {b"\x13_***_", b"\x10_***_"} + assert manager._finished_preloads == {b"\x13", b"\x10"} assert manager.contains(key) def test_fill_if_empty_with_pre_and_suffix(self, manager): - manager.preload_prefix = 3 - manager.preload_suffix = 1 + manager.get_preload_prefix_len = lambda _: 3 key = b"\x13PPAAAAAAAA" manager.bt_table.add_test_data({key}) # Scenario 1: Everything empty manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13PP_***_A"} + assert manager._finished_preloads == {b"\x13PP"} manager._fill_if_empty({key}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13PP_***_A"} + assert manager._finished_preloads == {b"\x13PP"} manager._fill_if_empty({b"\x10XXX"}) assert manager.bt_table.read_rows.call_count == 2 - assert manager._finished_preloads == {b"\x13PP_***_A", b"\x10XX_***_X"} + assert manager._finished_preloads == {b"\x13PP", b"\x10XX"} assert manager.contains(key) def test_fill_if_empty_with_mutation(self, manager): @@ -752,16 +751,13 @@ def test_get_partition_prefix(self, store): partition = 0 res = store._get_partition_prefix(partition) assert res[0] == partition - assert res[1:] == store.partition_prefix partition = 19 res = store._get_partition_prefix(partition) assert res[0] == partition - assert res[1:] == store.partition_prefix def test_remove_partition_prefix(self, store): - store.partition_prefix = b"abc" - key_with_partition = b"abcTHEACTUALKEY" + key_with_partition = b"\x13THEACTUALKEY" res = store._remove_partition_prefix(key_with_partition) assert res == b"THEACTUALKEY" From 06ba57e2601454d6ef5e73c8cf02341aab823da2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 13:21:08 +0100 Subject: [PATCH 333/616] removed complicated prefix loading --- faust/stores/bigtable.py | 41 ++++++++-------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0ede0b365..d9bea483a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -7,12 +7,6 @@ from collections import deque from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union -from google.cloud.bigtable.row_filters import ( - RowFilterChain, - RowFilterUnion, - RowKeyRegexFilter, -) - try: # pragma: no cover from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client @@ -105,12 +99,9 @@ class BigTableCacheManager: def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table - self.preload_prefix = options.get( - BigTableStore.CACHE_PRELOAD_PREFIX_LEN_KEY, - 1, # Default partition only - ) - self.preload_suffix = options.get( - BigTableStore.CACHE_PRELOAD_SUFFIX_LEN_KEY, 0 # Default skip + self.get_preload_prefix_len = options.get( + BigTableStore.CACHE_PRELOAD_PREFIX_LEN_FUN_KEY, + lambda _: 1, # Default partition only ) self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) @@ -123,31 +114,16 @@ def _fill_if_empty(self, bt_keys): deque(self._fill_if_empty_and_yield(bt_keys), maxlen=0) def _preload_id_from_key(self, bt_key): - prefix = bt_key[: self.preload_prefix] - if self.preload_suffix == 0: - suffix = b"" - else: - suffix = bt_key[-self.preload_suffix :] - return b"_***_".join([prefix, suffix]) + prefix = bt_key[: self.get_preload_prefix_len(bt_key)] + return prefix def _get_preload_rowset_and_filter(self, preload_ids): row_set = BT.RowSet() - - filters = [] - suffix_in = set() + row_filter = CellsColumnLimitFilter(1) for preload_id in preload_ids: - prefix, suffix = preload_id.split(b"_***_") row_set.add_row_range_from_keys( - start_key=prefix, end_key=prefix, end_inclusive=True + start_key=preload_id, end_key=preload_id, end_inclusive=True ) - if suffix not in suffix_in: - filters.append(RowKeyRegexFilter(b"".join([b".*", suffix]))) - if len(filters) > 1: - row_filter = RowFilterUnion(filters=filters) - elif len(filters) == 1: - row_filter = RowFilterChain([CellsColumnLimitFilter(1), filters[0]]) - else: - row_filter = CellsColumnLimitFilter(1) return row_set, row_filter def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): @@ -312,8 +288,7 @@ class BigTableStore(base.SerializedStore): VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" - CACHE_PRELOAD_SUFFIX_LEN_KEY = "cache_preload_suffix_len_key" - CACHE_PRELOAD_PREFIX_LEN_KEY = "cache_preload_prefix_len_key" + CACHE_PRELOAD_PREFIX_LEN_FUN_KEY = "cache_preload_prefix_len_fun_key" def __init__( self, From 94b0771d73d9ea7ec0ab0a5e7234d3587b211ea7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 17:27:32 +0100 Subject: [PATCH 334/616] added log for failed fill --- faust/stores/bigtable.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d9bea483a..61bb05919 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -137,15 +137,19 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): start = time.time() if self._value_cache is not None: row_set, row_filter = self._get_preload_rowset_and_filter(preload_ids_todo) - for row in self.bt_table.read_rows(row_set=row_set, filter_=row_filter): - if row.row_key in self._mutations.keys(): - mutation_val = self._mutations[row.row_key][1] - if mutation_val is not None: - self._value_cache[row.row_key] = mutation_val - else: - value = BigTableStore.bigtable_exrtact_row_data(row) - self._value_cache[row.row_key] = value - yield row.row_key + try: + for row in self.bt_table.read_rows(row_set=row_set, filter_=row_filter): + if row.row_key in self._mutations.keys(): + mutation_val = self._mutations[row.row_key][1] + if mutation_val is not None: + self._value_cache[row.row_key] = mutation_val + else: + value = BigTableStore.bigtable_exrtact_row_data(row) + self._value_cache[row.row_key] = value + yield row.row_key + except Exception as e: + self.log.info(f"BigTableStore fill failed for {preload_ids_todo=}") + raise e end = time.time() self.log.info( "BigTableStore: Finished fill for table" From d73b73bdb1e2b7040a00253c0af465ce450380c9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 17:29:15 +0100 Subject: [PATCH 335/616] added logging to see preload ids that fail --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 61bb05919..55f392519 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -136,8 +136,9 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): start = time.time() if self._value_cache is not None: - row_set, row_filter = self._get_preload_rowset_and_filter(preload_ids_todo) + import pdb; pdb.set_trace() try: + row_set, row_filter = self._get_preload_rowset_and_filter(preload_ids_todo) for row in self.bt_table.read_rows(row_set=row_set, filter_=row_filter): if row.row_key in self._mutations.keys(): mutation_val = self._mutations[row.row_key][1] From fa7a4d064c149b5349bf0d9f2cdafc2791a3a7ae Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 17:29:58 +0100 Subject: [PATCH 336/616] added tests for preload ids --- tests/unit/stores/test_bigtable.py | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index b2a44e60a..366fc5254 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -224,6 +224,23 @@ def test_fill_if_empty(self, manager): assert manager._finished_preloads == {b"\x13", b"\x10"} assert manager.contains(key) + def test_fill_if_empty(self, manager): + key = b"\x13AAA" + manager.bt_table.add_test_data({key}) + # Scenario 1: Everything empty + manager._fill_if_empty({key}) + assert manager.bt_table.read_rows.call_count == 1 + assert manager._finished_preloads == {b"\x13"} + + manager._fill_if_empty({key}) + assert manager.bt_table.read_rows.call_count == 1 + assert manager._finished_preloads == {b"\x13"} + + manager._fill_if_empty({b"\x10XXX"}) + assert manager.bt_table.read_rows.call_count == 2 + assert manager._finished_preloads == {b"\x13", b"\x10"} + assert manager.contains(key) + def test_fill_if_empty_with_pre_and_suffix(self, manager): manager.get_preload_prefix_len = lambda _: 3 @@ -1088,3 +1105,33 @@ def test_revoke_partitions(self, store): store.revoke_partitions({TP1, TP2}) store._cache.delete_partition.assert_any_call(1) store._cache.delete_partition.assert_any_call(2) + + def test_fill_with_custom_key_prefix(self, store): + def to_bt_key(key): + prefix_end = 5 + p1_end = prefix_end + 1 + key[prefix_end] // 2 + key_prefix = key[p1_end+2:] + return key_prefix + key + + def from_bt_key(key): + return key[key.find(b'\x00\x00\x00'):] + + def get_preload_prefix_len(key) -> int: + return len(key[:key.find(b'\x00\x00\x00')]) + + k = ( + b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' + b'\x000624ea584630eccac35c92d57' + ) + store._cache.get_preload_prefix_len = get_preload_prefix_len + store._transform_key_from_bt = from_bt_key + store._transform_key_to_bt = to_bt_key + + partition = 19 + res = store._get_key_with_partition(k, partition) + import pdb; pdb.set_trace() + preload_id = b'\x13624ea584630eccac35c92d57' + assert store._cache._preload_id_from_key(res) == preload_id + assert res == preload_id + k + + From 70b85727337201cd13bee81b2b4bfcb62de35132 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 17:31:18 +0100 Subject: [PATCH 337/616] also log bt_keys --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 55f392519..a97efd792 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -149,7 +149,7 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): self._value_cache[row.row_key] = value yield row.row_key except Exception as e: - self.log.info(f"BigTableStore fill failed for {preload_ids_todo=}") + self.log.info(f"BigTableStore fill failed for {preload_ids_todo=}, {bt_keys=}") raise e end = time.time() self.log.info( From 21808c26adfc746386f645dc9fbe8b9a1a44b4d6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 17:31:57 +0100 Subject: [PATCH 338/616] removed pdb --- faust/stores/bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a97efd792..34c973197 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -136,7 +136,6 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): start = time.time() if self._value_cache is not None: - import pdb; pdb.set_trace() try: row_set, row_filter = self._get_preload_rowset_and_filter(preload_ids_todo) for row in self.bt_table.read_rows(row_set=row_set, filter_=row_filter): From 20a565fd8c890bd319d931b3d1598a1cb0c6f5e5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 18:04:29 +0100 Subject: [PATCH 339/616] fixed default setting of bt_table_key_translator --- faust/stores/bigtable.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 34c973197..8bb12577b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -311,18 +311,15 @@ def __init__( raise ex super().__init__(url, app, table, **kwargs) self._db_lock = asyncio.Lock() - self.rebalance_ack = False @staticmethod def default_translator(user_key): return user_key def _set_options(self, options) -> None: - self._transform_key_to_bt = options.get( - BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY[0], self.default_translator - ) - self._transform_key_from_bt = options.get( - BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY[1], self.default_translator + self._transform_key_to_bt, self._transform_key_from_bt = options.get( + BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY, + (self.default_translator, self.default_translator) ) self._all_options = options self.table_name_generator = options.get( From f09e4927a1904b223cff9ca73ec36364fb113789 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Mar 2023 19:22:18 +0100 Subject: [PATCH 340/616] fixed testcase so it tests for a critical bug --- tests/unit/stores/test_bigtable.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 366fc5254..67f18b94d 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1108,9 +1108,12 @@ def test_revoke_partitions(self, store): def test_fill_with_custom_key_prefix(self, store): def to_bt_key(key): - prefix_end = 5 - p1_end = prefix_end + 1 + key[prefix_end] // 2 - key_prefix = key[p1_end+2:] + len_total = len(key) + len_prefix = 4 + len_num_bytes_len = key[len_prefix] // 2 + len_first_id = key[len_prefix + len_num_bytes_len] // 2 + len_second_id = key[len_prefix + 1 + len_num_bytes_len + len_first_id + 1] // 2 + key_prefix = key[len_total - len_second_id:] return key_prefix + key def from_bt_key(key): @@ -1120,18 +1123,20 @@ def get_preload_prefix_len(key) -> int: return len(key[:key.find(b'\x00\x00\x00')]) k = ( - b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' - b'\x000624ea584630eccac35c92d57' + b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' + b'\x000624ea584630eccac35c92d57' ) store._cache.get_preload_prefix_len = get_preload_prefix_len store._transform_key_from_bt = from_bt_key store._transform_key_to_bt = to_bt_key - partition = 19 + partition = 0 res = store._get_key_with_partition(k, partition) - import pdb; pdb.set_trace() - preload_id = b'\x13624ea584630eccac35c92d57' + preload_id = b'\x00624ea584630eccac35c92d57' assert store._cache._preload_id_from_key(res) == preload_id assert res == preload_id + k + assert k == store._transform_key_from_bt( + store._transform_key_to_bt(k) + ) From 7bf288a888836cb7a358525ed4cfc76d6850a88c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Mar 2023 13:21:03 +0100 Subject: [PATCH 341/616] changed some function names and added more tests --- faust/stores/bigtable.py | 27 +++--- tests/unit/stores/test_bigtable.py | 141 +++++++++++++++++++---------- 2 files changed, 107 insertions(+), 61 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8bb12577b..59f66a1c1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -457,15 +457,14 @@ def _get_partition_prefix(self, partition: int) -> bytes: partition_bytes = partition.to_bytes(1, "little") return b"".join([partition_bytes]) - def _remove_partition_prefix(self, key: bytes) -> bytes: + def _get_faust_key(self, key: bytes) -> bytes: key = key[1:] return self._transform_key_from_bt(key) - def _get_key_with_partition(self, key: bytes, partition: int) -> bytes: + def _get_bigtable_key(self, key: bytes, partition: int) -> bytes: key = self._transform_key_to_bt(key) prefix = self._get_partition_prefix(partition) - key = b"".join([prefix, key]) - return key + return prefix + key def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: @@ -477,7 +476,7 @@ def _get(self, key: bytes) -> Optional[bytes]: try: partition = self._maybe_get_partition_from_message() if partition is not None: - key_with_partition = self._get_key_with_partition( + key_with_partition = self._get_bigtable_key( key, partition=partition ) @@ -488,7 +487,7 @@ def _get(self, key: bytes) -> Optional[bytes]: else: keys = set() for partition in self._partitions_for_key(key): - key_with_partition = self._get_key_with_partition( + key_with_partition = self._get_bigtable_key( key, partition=partition ) keys.add(key_with_partition) @@ -511,7 +510,7 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: partition = get_current_partition() - key_with_partition = self._get_key_with_partition(key, partition=partition) + key_with_partition = self._get_bigtable_key(key, partition=partition) self._bigtable_set(key_with_partition, value) self._cache.set_partition(key, partition) except Exception as ex: @@ -525,7 +524,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: for partition in self._partitions_for_key(key): - key_with_partition = self._get_key_with_partition( + key_with_partition = self._get_bigtable_key( key, partition=partition ) self._bigtable_del(key_with_partition) @@ -559,7 +558,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: row_set=row_set, filter_=self.row_filter ): yield ( - self._remove_partition_prefix(row.row_key), + self._get_faust_key(row.row_key), self.bigtable_exrtact_row_data(row), ) except Exception as ex: @@ -585,7 +584,7 @@ def _iterkeys(self) -> Iterator[bytes]: found_mutations = set() for k, mut in self._cache._mutations.items(): if mut[1] is not None: - yield self._remove_partition_prefix(k) + yield self._get_faust_key(k) found_mutations.add(k) for row in self.bt_table.read_rows( @@ -600,7 +599,7 @@ def _iterkeys(self) -> Iterator[bytes]: preload_id = self._cache._preload_id_from_key(row.row_key) self._cache._finished_preloads.add(preload_id) partition = row.row_key[0] - key = self._remove_partition_prefix(row.row_key) + key = self._get_faust_key(row.row_key) self._cache.set_partition(key, partition) yield key @@ -635,14 +634,14 @@ def _contains(self, key: bytes) -> bool: return True partition = self._maybe_get_partition_from_message() if partition is not None: - key_with_partition = self._get_key_with_partition( + key_with_partition = self._get_bigtable_key( key, partition=partition ) return self._bigtable_contains(key_with_partition) else: keys_to_search = set() for partition in self._partitions_for_key(key): - key_with_partition = self._get_key_with_partition( + key_with_partition = self._get_bigtable_key( key, partition=partition ) keys_to_search.add(key_with_partition) @@ -741,7 +740,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - offset_key = self._get_key_with_partition(msg.key, partition=tp.partition) + offset_key = self._get_bigtable_key(msg.key, partition=tp.partition) row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 67f18b94d..6cd6dd1d4 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -13,6 +13,26 @@ from faust.types.tuples import TP +def to_bt_key(key): + len_total = len(key) + len_prefix = 4 + len_num_bytes_len = key[len_prefix] // 2 + len_first_id = key[len_prefix + len_num_bytes_len] // 2 + len_second_id = key[ + len_prefix + 1 + len_num_bytes_len + len_first_id + 1 + ] // 2 + key_prefix = key[len_total - len_second_id:] + return key_prefix + key + + +def from_bt_key(key): + return key[key.find(bytes(4)):] + + +def get_preload_prefix_len(key) -> int: + return len(key[:key.find(bytes(4))]) + + class MyTestResponse: def __init__(self, code) -> None: self.code = code @@ -172,6 +192,7 @@ def test_iscomplete__init__(self): time.time = MagicMock(return_value=0) options = { BigTableStore.VALUE_CACHE_ENABLE_KEY: True, + BigTableStore.CACHE_PRELOAD_PREFIX_LEN_FUN_KEY: get_preload_prefix_len } test_manager = BigTableCacheManager(MagicMock(), options, bigtable_mock) @@ -181,6 +202,7 @@ def test_iscomplete__init__(self): assert test_manager._last_flush == {} assert test_manager._mutations == {} assert test_manager._finished_preloads == set() + assert test_manager.get_preload_prefix_len == get_preload_prefix_len @pytest.fixture() def bt_imports(self): @@ -517,16 +539,34 @@ async def test_bigtable_set_options(self, bt_imports): bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") bt_imports.column_family = MagicMock(return_value=MagicMock()) name_lambda = lambda x: print(x) # noqa + + def to_bt_key(key): + len_total = len(key) + len_prefix = 4 + len_num_bytes_len = key[len_prefix] // 2 + len_first_id = key[len_prefix + len_num_bytes_len] // 2 + len_second_id = key[ + len_prefix + 1 + len_num_bytes_len + len_first_id + 1 + ] // 2 + key_prefix = key[len_total - len_second_id:] + return key_prefix + key + + def from_bt_key(key): + return key[key.find(b'\x00\x00\x00'):] + options = { BigTableStore.BT_TABLE_NAME_GENERATOR_KEY: name_lambda, BigTableStore.BT_OFFSET_KEY_PREFIX: "offset_test", BigTableStore.BT_COLUMN_NAME_KEY: "name_test", + BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY: (to_bt_key, from_bt_key), } BigTableStore._set_options(self_mock, options) assert self_mock.column_name == "name_test" assert self_mock.offset_key_prefix == "offset_test" assert self_mock.row_filter == "a_filter" assert self_mock.table_name_generator == name_lambda + assert self_mock._transform_key_to_bt == to_bt_key + assert self_mock._transform_key_from_bt == from_bt_key @pytest.mark.asyncio async def test_bigtable_setup(self, bt_imports): @@ -773,16 +813,16 @@ def test_get_partition_prefix(self, store): res = store._get_partition_prefix(partition) assert res[0] == partition - def test_remove_partition_prefix(self, store): + def test_get_faust_key(self, store): key_with_partition = b"\x13THEACTUALKEY" - res = store._remove_partition_prefix(key_with_partition) + res = store._get_faust_key(key_with_partition) assert res == b"THEACTUALKEY" def test_get_key_with_partition(self, store): partition = 19 - res = store._get_key_with_partition(self.TEST_KEY1, partition) + res = store._get_bigtable_key(self.TEST_KEY1, partition) assert res[0] == partition - assert store._remove_partition_prefix(res) == self.TEST_KEY1 + assert store._get_faust_key(res) == self.TEST_KEY1 def test_partitions_for_key(self, store): store._cache.get_partition = MagicMock(return_value=19) @@ -803,7 +843,7 @@ def test_get_with_known_partition(self, store): # Scenario: Found store._bigtable_get = MagicMock(return_value=b"a_value") res = store._get(self.TEST_KEY1) - key_with_partition = store._get_key_with_partition(self.TEST_KEY1, partition) + key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) store._bigtable_get.assert_called_once_with(key_with_partition) store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, partition) assert res == b"a_value" @@ -812,7 +852,7 @@ def test_get_with_known_partition(self, store): # Scenario: Not Found store._bigtable_get = MagicMock(return_value=None) res = store._get(self.TEST_KEY1) - key_with_partition = store._get_key_with_partition(self.TEST_KEY1, partition) + key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) store._bigtable_get.assert_called_once_with(key_with_partition) store._cache.set_partition.assert_not_called() assert res is None @@ -822,12 +862,12 @@ def test_get_with_unknown_partition(self, store): store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) store._cache.set_partition = MagicMock() keys_searched = set() - keys_searched.add(store._get_key_with_partition(self.TEST_KEY1, 1)) - keys_searched.add(store._get_key_with_partition(self.TEST_KEY1, 3)) - keys_searched.add(store._get_key_with_partition(self.TEST_KEY1, 19)) + keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 1)) + keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 3)) + keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 19)) # Scenario: Found - key_of_value = store._get_key_with_partition(self.TEST_KEY1, 19) + key_of_value = store._get_bigtable_key(self.TEST_KEY1, 19) store._bigtable_get_range = MagicMock(return_value=(key_of_value, b"a_value")) res = store._get(self.TEST_KEY1) store._partitions_for_key.assert_called_once_with(self.TEST_KEY1) @@ -849,7 +889,7 @@ def test_set(self, store): store._bigtable_set = MagicMock() store._cache.set_partition = MagicMock() store._set(self.TEST_KEY1, b"a_value") - key_with_partition = store._get_key_with_partition(self.TEST_KEY1, partition) + key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) store._bigtable_set.assert_called_once_with(key_with_partition, b"a_value") store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, partition) @@ -859,9 +899,9 @@ def test_del(self, store): store._bigtable_del = MagicMock() store._del(self.TEST_KEY1) calls = [ - call(store._get_key_with_partition(self.TEST_KEY1, 1)), - call(store._get_key_with_partition(self.TEST_KEY1, 3)), - call(store._get_key_with_partition(self.TEST_KEY1, 19)), + call(store._get_bigtable_key(self.TEST_KEY1, 1)), + call(store._get_bigtable_key(self.TEST_KEY1, 3)), + call(store._get_bigtable_key(self.TEST_KEY1, 19)), ] store._bigtable_del.assert_has_calls(calls) assert store._cache._partition_cache == {} @@ -889,9 +929,9 @@ def test_active_partitions(self, store): def test_iteritems(self, store): keys_in_store = [] - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY1, 1)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY2, 2)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) store.bt_table.add_test_data(keys_in_store) store._active_partitions = MagicMock(return_value=[1, 3]) @@ -905,9 +945,9 @@ def test_iterkeys(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) store._cache._partition_cache.limit = 3 keys_in_store = [] - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY1, 1)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY2, 2)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) store.bt_table.add_test_data(keys_in_store) all_res = sorted(store._iterkeys()) @@ -919,9 +959,9 @@ def test_iterkeys(self, store): def test_iterkeys_with_mutattions(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) keys_in_store = [] - k1 = store._get_key_with_partition(self.TEST_KEY1, 1) - k2 = store._get_key_with_partition(self.TEST_KEY2, 2) - k3 = store._get_key_with_partition(self.TEST_KEY3, 3) + k1 = store._get_bigtable_key(self.TEST_KEY1, 1) + k2 = store._get_bigtable_key(self.TEST_KEY2, 2) + k3 = store._get_bigtable_key(self.TEST_KEY3, 3) keys_in_store.append(k1) keys_in_store.append(k2) keys_in_store.append(k3) @@ -936,9 +976,9 @@ def test_iterkeys_with_mutattions(self, store): def test_itervalues(self, store): keys_in_store = [] - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY1, 1)) - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY2, 2)) - keys_in_store.append(store._get_key_with_partition(self.TEST_KEY3, 3)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY1, 1)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY2, 2)) + keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) store.bt_table.add_test_data(keys_in_store) store._active_partitions = MagicMock(return_value=[1, 3]) @@ -966,7 +1006,7 @@ def test_contains_with_known_partition(self, store): # Scenario1: Found store._bigtable_contains = MagicMock(return_value="TRUE_OR_FALSE") - key_w_partition = store._get_key_with_partition(self.TEST_KEY1, 19) + key_w_partition = store._get_bigtable_key(self.TEST_KEY1, 19) res = store._contains(self.TEST_KEY1) store._bigtable_contains.assert_called_once_with(key_w_partition) assert res == "TRUE_OR_FALSE" @@ -979,9 +1019,9 @@ def test_contains_with_unknown_partition(self, store): store._bigtable_contains_any = MagicMock(return_value="TRUE_OR_FALSE") keys_to_search = set() - keys_to_search.add(store._get_key_with_partition(self.TEST_KEY1, 1)) - keys_to_search.add(store._get_key_with_partition(self.TEST_KEY1, 3)) - keys_to_search.add(store._get_key_with_partition(self.TEST_KEY1, 19)) + keys_to_search.add(store._get_bigtable_key(self.TEST_KEY1, 1)) + keys_to_search.add(store._get_bigtable_key(self.TEST_KEY1, 3)) + keys_to_search.add(store._get_bigtable_key(self.TEST_KEY1, 19)) res = store._contains(self.TEST_KEY1) @@ -1106,22 +1146,7 @@ def test_revoke_partitions(self, store): store._cache.delete_partition.assert_any_call(1) store._cache.delete_partition.assert_any_call(2) - def test_fill_with_custom_key_prefix(self, store): - def to_bt_key(key): - len_total = len(key) - len_prefix = 4 - len_num_bytes_len = key[len_prefix] // 2 - len_first_id = key[len_prefix + len_num_bytes_len] // 2 - len_second_id = key[len_prefix + 1 + len_num_bytes_len + len_first_id + 1] // 2 - key_prefix = key[len_total - len_second_id:] - return key_prefix + key - - def from_bt_key(key): - return key[key.find(b'\x00\x00\x00'):] - - def get_preload_prefix_len(key) -> int: - return len(key[:key.find(b'\x00\x00\x00')]) - + def test__fill_with_custom_key_prefix(self, store): k = ( b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' b'\x000624ea584630eccac35c92d57' @@ -1131,7 +1156,7 @@ def get_preload_prefix_len(key) -> int: store._transform_key_to_bt = to_bt_key partition = 0 - res = store._get_key_with_partition(k, partition) + res = store._get_bigtable_key(k, partition) preload_id = b'\x00624ea584630eccac35c92d57' assert store._cache._preload_id_from_key(res) == preload_id assert res == preload_id + k @@ -1139,4 +1164,26 @@ def get_preload_prefix_len(key) -> int: store._transform_key_to_bt(k) ) + def test_contains_with_unknown_partition_and_key_transform(self, store): + k = ( + b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' + b'\x000624ea584630eccac35c92d57' + ) + store._cache.get_preload_prefix_len = get_preload_prefix_len + store._transform_key_from_bt = from_bt_key + store._transform_key_to_bt = to_bt_key + store.app.conf.store_check_exists = True + store._maybe_get_partition_from_message = MagicMock(return_value=None) + store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) + store._cache.contains_any = MagicMock(wraps=store._cache.contains_any) + store._bigtable_contains_any = MagicMock(wraps=store._bigtable_contains_any) + keys_to_search = set() + keys_to_search.add(store._get_bigtable_key(k, 1)) + keys_to_search.add(store._get_bigtable_key(k, 3)) + keys_to_search.add(store._get_bigtable_key(k, 19)) + + res = store._contains(k) + res_contains = store._bigtable_contains_any.assert_called_once_with(keys_to_search) + assert res_contains is None + assert res is False From 73c82f9ad30d0aa72a9a2c495fcbd82f5f3cc6be Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Mar 2023 13:28:54 +0100 Subject: [PATCH 342/616] added logs for transforming keys --- faust/stores/bigtable.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 59f66a1c1..57cdca402 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -303,6 +303,7 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) + self._log_counter = 0 try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) @@ -458,13 +459,21 @@ def _get_partition_prefix(self, partition: int) -> bytes: return b"".join([partition_bytes]) def _get_faust_key(self, key: bytes) -> bytes: - key = key[1:] - return self._transform_key_from_bt(key) + key_with_no_partition = key[1:] + new_key = self._transform_key_from_bt(key_with_no_partition) + if (self._log_counter % 100) == 0: + self.log.info(f"Transformed {key=} to {new_key} with {self._transform_key_from_bt.__name__}") + self._log_counter += 1 + return new_key def _get_bigtable_key(self, key: bytes, partition: int) -> bytes: key = self._transform_key_to_bt(key) prefix = self._get_partition_prefix(partition) - return prefix + key + new_key = prefix + key + if (self._log_counter % 100) == 0: + self.log.info(f"Transformed {key=} to {new_key} with {self._transform_key_to_bt.__name__}") + self._log_counter += 1 + return new_key def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: From 51c5bc37b87b21ba67bd950b13d873fef6b7581d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Mar 2023 14:02:41 +0100 Subject: [PATCH 343/616] fixed wrong setting name --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 57cdca402..80555917c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -288,7 +288,7 @@ class BigTableStore(base.SerializedStore): BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" BT_MUTATION_FREQ_KEY = "bt_mutation_freq_key" BT_MAX_MUTATIONS = "bt_max_mutations" - BT_CUSTOM_KEY_TRANSLATOR_KEY = "bt_custom_key_translator" + BT_CUSTOM_KEY_TRANSLATOR_KEY = "bt_custom_key_translator_key" VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" From 6106758b7644d43854bd0b487e6d3e6ecb050f9b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Mar 2023 16:02:13 +0100 Subject: [PATCH 344/616] added logging to find bug --- faust/stores/bigtable.py | 9 ++++- tests/unit/stores/test_bigtable.py | 56 +++++++++++++++++++++++++----- 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 80555917c..bc4f94b7c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -749,7 +749,14 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - offset_key = self._get_bigtable_key(msg.key, partition=tp.partition) + try: + offset_key = self._get_bigtable_key(msg.key, partition=tp.partition) + except Exception as e: + logging.getLogger(__name__).warning( + f"BigTableStore: failed to get offset_key for {msg.key=}" + ) + raise e + row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 6cd6dd1d4..b01d369c3 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -14,6 +14,7 @@ def to_bt_key(key): + key = from_bt_key(key) # Just a safety meassure len_total = len(key) len_prefix = 4 len_num_bytes_len = key[len_prefix] // 2 @@ -514,6 +515,10 @@ class TestBigTableStore: TEST_KEY1 = b"TEST_KEY1" TEST_KEY2 = b"TEST_KEY2" TEST_KEY3 = b"TEST_KEY3" + TEST_KEY4 = ( + b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' + b'\x000624ea584630eccac35c92d57' + ) @pytest.fixture() def bt_imports(self): @@ -1165,10 +1170,6 @@ def test__fill_with_custom_key_prefix(self, store): ) def test_contains_with_unknown_partition_and_key_transform(self, store): - k = ( - b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' - b'\x000624ea584630eccac35c92d57' - ) store._cache.get_preload_prefix_len = get_preload_prefix_len store._transform_key_from_bt = from_bt_key store._transform_key_to_bt = to_bt_key @@ -1179,11 +1180,50 @@ def test_contains_with_unknown_partition_and_key_transform(self, store): store._cache.contains_any = MagicMock(wraps=store._cache.contains_any) store._bigtable_contains_any = MagicMock(wraps=store._bigtable_contains_any) keys_to_search = set() - keys_to_search.add(store._get_bigtable_key(k, 1)) - keys_to_search.add(store._get_bigtable_key(k, 3)) - keys_to_search.add(store._get_bigtable_key(k, 19)) + keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 1)) + keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 3)) + keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 19)) - res = store._contains(k) + res = store._contains(self.TEST_KEY4) res_contains = store._bigtable_contains_any.assert_called_once_with(keys_to_search) assert res_contains is None assert res is False + + def test_apply_changelog_batch_with_key_transform(self, store): + store._transform_key_from_bt = from_bt_key + store._transform_key_to_bt = to_bt_key + + row_mock = MagicMock() + row_mock.delete = MagicMock() + row_mock.set_cell = MagicMock() + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store.bt_table.mutate_rows = MagicMock() + store._persist_changelog_batch = MagicMock() + + class TestMessage: + def __init__(self, value, key, tp, offset): + self.value = value + self.key = key + self.tp = tp + self.offset = offset + + class TestEvent: + def __init__(self, message): + self.message = message + + tp = TP("a", 19) + tp2 = TP("b", 19) + messages = [ + TestEvent(TestMessage("a", self.TEST_KEY4, tp, 0)), + TestEvent(TestMessage(None, self.TEST_KEY4, tp, 1)), # Delete + TestEvent(TestMessage("a", self.TEST_KEY4, tp, 3)), # Out of order + TestEvent(TestMessage("b", self.TEST_KEY4, tp2, 4)), + TestEvent(TestMessage("a", self.TEST_KEY4, tp, 2)), + ] + store.apply_changelog_batch(messages, lambda x: x, lambda x: x) + assert store.bt_table.direct_row.call_count == 5 + row_mock.delete.assert_called_once() + assert row_mock.set_cell.call_count == 4 + store._persist_changelog_batch.assert_called_once() + tp_offsets = store._persist_changelog_batch.call_args_list[0].args[1] + assert tp_offsets == {tp: 3, tp2: 4} From 4f5f1768cb9027432b377e9063638746c9d35863 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Mar 2023 16:14:48 +0100 Subject: [PATCH 345/616] logging --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bc4f94b7c..ce15f4ab3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -754,6 +754,7 @@ def apply_changelog_batch( except Exception as e: logging.getLogger(__name__).warning( f"BigTableStore: failed to get offset_key for {msg.key=}" + f" for {self.table_name}" ) raise e From 44f3e36af29bf4f06299e9e8a76771f80976526e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Mar 2023 16:47:00 +0100 Subject: [PATCH 346/616] removed logs --- faust/stores/bigtable.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ce15f4ab3..02a7aa798 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -461,18 +461,12 @@ def _get_partition_prefix(self, partition: int) -> bytes: def _get_faust_key(self, key: bytes) -> bytes: key_with_no_partition = key[1:] new_key = self._transform_key_from_bt(key_with_no_partition) - if (self._log_counter % 100) == 0: - self.log.info(f"Transformed {key=} to {new_key} with {self._transform_key_from_bt.__name__}") - self._log_counter += 1 return new_key def _get_bigtable_key(self, key: bytes, partition: int) -> bytes: key = self._transform_key_to_bt(key) prefix = self._get_partition_prefix(partition) new_key = prefix + key - if (self._log_counter % 100) == 0: - self.log.info(f"Transformed {key=} to {new_key} with {self._transform_key_to_bt.__name__}") - self._log_counter += 1 return new_key def _partitions_for_key(self, key: bytes) -> Iterable[int]: @@ -749,15 +743,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - try: - offset_key = self._get_bigtable_key(msg.key, partition=tp.partition) - except Exception as e: - logging.getLogger(__name__).warning( - f"BigTableStore: failed to get offset_key for {msg.key=}" - f" for {self.table_name}" - ) - raise e - + offset_key = self._get_bigtable_key(msg.key, partition=tp.partition) row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() From da45321183b0fcb5be04e1a7d13ff476e2481387 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Mar 2023 18:27:35 +0100 Subject: [PATCH 347/616] added testcases for preload ids --- tests/unit/stores/test_bigtable.py | 61 +++++++++++++++++++----------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index b01d369c3..0265bb165 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -14,24 +14,27 @@ def to_bt_key(key): - key = from_bt_key(key) # Just a safety meassure len_total = len(key) - len_prefix = 4 - len_num_bytes_len = key[len_prefix] // 2 - len_first_id = key[len_prefix + len_num_bytes_len] // 2 - len_second_id = key[ - len_prefix + 1 + len_num_bytes_len + len_first_id + 1 - ] // 2 + len_prefix = 5 + len_first_id = key[len_prefix] // 2 + if len_prefix + 1 + len_first_id + 1 >= len_total: + # This happens if there is e.g. no organisation id + return key + len_second_id = key[len_prefix + 1 + + len_first_id + 1] // 2 key_prefix = key[len_total - len_second_id:] return key_prefix + key - def from_bt_key(key): - return key[key.find(bytes(4)):] - + magic_byte_pos = key.find(bytes(4)) + if magic_byte_pos == 0: + return key + return key[magic_byte_pos:] def get_preload_prefix_len(key) -> int: - return len(key[:key.find(bytes(4))]) + preload_len = key.find(bytes(4)) + if preload_len == 0: + return len(key) + return preload_len class MyTestResponse: @@ -516,8 +519,13 @@ class TestBigTableStore: TEST_KEY2 = b"TEST_KEY2" TEST_KEY3 = b"TEST_KEY3" TEST_KEY4 = ( - b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' - b'\x000624ea584630eccac35c92d57' + b'\x00\x00\x00\x00\x01\x0eNoGroup\x00063d76e3ebd7e634de234c67d' + ) + TEST_KEY5 = ( + b'\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x00062a99788df917508d1891ed2' + ) + TEST_KEY6 = ( + b'\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x02' ) @pytest.fixture() @@ -1152,21 +1160,28 @@ def test_revoke_partitions(self, store): store._cache.delete_partition.assert_any_call(2) def test__fill_with_custom_key_prefix(self, store): - k = ( - b'\x00\x00\x00\x00\x020624ea584630eccac35c92d57' - b'\x000624ea584630eccac35c92d57' - ) store._cache.get_preload_prefix_len = get_preload_prefix_len store._transform_key_from_bt = from_bt_key store._transform_key_to_bt = to_bt_key partition = 0 - res = store._get_bigtable_key(k, partition) - preload_id = b'\x00624ea584630eccac35c92d57' - assert store._cache._preload_id_from_key(res) == preload_id - assert res == preload_id + k - assert k == store._transform_key_from_bt( - store._transform_key_to_bt(k) + for k in [self.TEST_KEY4, self.TEST_KEY5]: + res = store._get_bigtable_key(k, partition) + expected_preload_id = b'\x00' + k[-24:] + preload_id = store._cache._preload_id_from_key(res) + assert preload_id == expected_preload_id + assert res == expected_preload_id + k + assert k == store._transform_key_from_bt( + store._transform_key_to_bt(k) + ) + + res = store._get_bigtable_key(self.TEST_KEY6, partition) + expected_preload_id = b'\x00' + self.TEST_KEY6 + preload_id = store._cache._preload_id_from_key(res) + assert preload_id == expected_preload_id + assert res == expected_preload_id + assert self.TEST_KEY6 == store._transform_key_from_bt( + store._transform_key_to_bt(self.TEST_KEY6) ) def test_contains_with_unknown_partition_and_key_transform(self, store): From c77e56d2057645d247af64e38651edbe46b75df9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Mar 2023 18:27:59 +0100 Subject: [PATCH 348/616] format --- tests/unit/stores/test_bigtable.py | 185 ++++++++++++++++++++--------- 1 file changed, 128 insertions(+), 57 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 0265bb165..05858bf9c 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -20,16 +20,18 @@ def to_bt_key(key): if len_prefix + 1 + len_first_id + 1 >= len_total: # This happens if there is e.g. no organisation id return key - len_second_id = key[len_prefix + 1 + + len_first_id + 1] // 2 - key_prefix = key[len_total - len_second_id:] + len_second_id = key[len_prefix + 1 + +len_first_id + 1] // 2 + key_prefix = key[len_total - len_second_id :] return key_prefix + key + def from_bt_key(key): magic_byte_pos = key.find(bytes(4)) - if magic_byte_pos == 0: + if magic_byte_pos == 0: return key return key[magic_byte_pos:] + def get_preload_prefix_len(key) -> int: preload_len = key.find(bytes(4)) if preload_len == 0: @@ -49,7 +51,9 @@ class RowSetMock: def __init__(self) -> None: self.keys = set() self.add_row_key = MagicMock(wraps=self._add_row_key) - self.add_row_range_from_keys = MagicMock(wraps=self._add_row_range_from_keys) + self.add_row_range_from_keys = MagicMock( + wraps=self._add_row_range_from_keys + ) def _add_row_key(self, key): self.keys.add(key) @@ -196,10 +200,12 @@ def test_iscomplete__init__(self): time.time = MagicMock(return_value=0) options = { BigTableStore.VALUE_CACHE_ENABLE_KEY: True, - BigTableStore.CACHE_PRELOAD_PREFIX_LEN_FUN_KEY: get_preload_prefix_len + BigTableStore.CACHE_PRELOAD_PREFIX_LEN_FUN_KEY: get_preload_prefix_len, } - test_manager = BigTableCacheManager(MagicMock(), options, bigtable_mock) + test_manager = BigTableCacheManager( + MagicMock(), options, bigtable_mock + ) assert test_manager.bt_table == bigtable_mock assert isinstance(test_manager._value_cache, BigTableValueCache) assert test_manager._mut_freq == 0 @@ -212,14 +218,18 @@ def test_iscomplete__init__(self): def bt_imports(self): with patch("faust.stores.bigtable.BT") as bt: bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - bt.column_family.MaxVersionsGCRule = MagicMock(return_value="a_rule") + bt.column_family.MaxVersionsGCRule = MagicMock( + return_value="a_rule" + ) bt.RowSet = MagicMock(return_value=RowSetMock()) yield bt @pytest.fixture() def manager(self, bt_imports): with patch("faust.stores.bigtable.BT", bt_imports): - with patch("faust.stores.bigtable.time.time", MagicMock(return_value=0)): + with patch( + "faust.stores.bigtable.time.time", MagicMock(return_value=0) + ): bigtable_mock = BigTableMock() app_mock = MagicMock() app_mock.conf = MagicMock() @@ -229,7 +239,9 @@ def manager(self, bt_imports): BigTableStore.VALUE_CACHE_ENABLE_KEY: True, BigTableStore.BT_MUTATION_FREQ_KEY: 600, } - manager = BigTableCacheManager(MagicMock(), options, bigtable_mock) + manager = BigTableCacheManager( + MagicMock(), options, bigtable_mock + ) manager._partition_cache = {} return manager @@ -406,13 +418,19 @@ def test_flush_if_timer_over(self, manager): tp = TP("a_topic", partition=19) tp2 = TP("a_topic", partition=0) time.time = MagicMock(return_value=0) - manager.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(404)]) + manager.bt_table.mutate_rows = MagicMock( + return_value=[MyTestResponse(404)] + ) row_mock = MagicMock() row_mock.row_key = b"\x13AAA" - manager._mutations = {row_mock.row_key: (row_mock, "some_row_mutation")} + manager._mutations = { + row_mock.row_key: (row_mock, "some_row_mutation") + } - with patch("faust.stores.bigtable.time.time", MagicMock(return_value=0)): + with patch( + "faust.stores.bigtable.time.time", MagicMock(return_value=0) + ): assert manager.flush_if_timer_over(tp) is True assert manager._last_flush == {tp.partition: 0} assert manager.flush_if_timer_over(tp) is False @@ -436,7 +454,9 @@ def test_flush_if_timer_over(self, manager): assert manager.flush_if_timer_over(tp) is False manager._last_flush = {} - manager.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(0)]) + manager.bt_table.mutate_rows = MagicMock( + return_value=[MyTestResponse(0)] + ) with patch( "faust.stores.bigtable.time.time", @@ -450,11 +470,17 @@ def test_flush_if_timer_over_on_max_count(self, manager): tp = TP("a_topic", partition=19) row_mock = MagicMock() row_mock.row_key = b"\x13AAA" - manager._mutations = {row_mock.row_key: (row_mock, "some_row_mutation")} + manager._mutations = { + row_mock.row_key: (row_mock, "some_row_mutation") + } manager._max_mutations = 1 manager._last_flush = {tp.partition: 999999999999} - manager.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(0)]) - with patch("faust.stores.bigtable.time.time", MagicMock(return_value=0)): + manager.bt_table.mutate_rows = MagicMock( + return_value=[MyTestResponse(0)] + ) + with patch( + "faust.stores.bigtable.time.time", MagicMock(return_value=0) + ): assert manager.flush_if_timer_over(tp) is True def test_set_mutation(self, manager): @@ -518,21 +544,17 @@ class TestBigTableStore: TEST_KEY1 = b"TEST_KEY1" TEST_KEY2 = b"TEST_KEY2" TEST_KEY3 = b"TEST_KEY3" - TEST_KEY4 = ( - b'\x00\x00\x00\x00\x01\x0eNoGroup\x00063d76e3ebd7e634de234c67d' - ) - TEST_KEY5 = ( - b'\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x00062a99788df917508d1891ed2' - ) - TEST_KEY6 = ( - b'\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x02' - ) + TEST_KEY4 = b"\x00\x00\x00\x00\x01\x0eNoGroup\x00063d76e3ebd7e634de234c67d" + TEST_KEY5 = b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x00062a99788df917508d1891ed2" + TEST_KEY6 = b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x02" @pytest.fixture() def bt_imports(self): with patch("faust.stores.bigtable.BT") as bt: bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - bt.column_family.MaxVersionsGCRule = MagicMock(return_value="a_rule") + bt.column_family.MaxVersionsGCRule = MagicMock( + return_value="a_rule" + ) bt.RowSet = MagicMock(return_value=RowSetMock()) yield bt @@ -558,20 +580,23 @@ def to_bt_key(key): len_prefix = 4 len_num_bytes_len = key[len_prefix] // 2 len_first_id = key[len_prefix + len_num_bytes_len] // 2 - len_second_id = key[ - len_prefix + 1 + len_num_bytes_len + len_first_id + 1 - ] // 2 - key_prefix = key[len_total - len_second_id:] + len_second_id = ( + key[len_prefix + 1 + len_num_bytes_len + len_first_id + 1] // 2 + ) + key_prefix = key[len_total - len_second_id :] return key_prefix + key def from_bt_key(key): - return key[key.find(b'\x00\x00\x00'):] + return key[key.find(b"\x00\x00\x00") :] options = { BigTableStore.BT_TABLE_NAME_GENERATOR_KEY: name_lambda, BigTableStore.BT_OFFSET_KEY_PREFIX: "offset_test", BigTableStore.BT_COLUMN_NAME_KEY: "name_test", - BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY: (to_bt_key, from_bt_key), + BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY: ( + to_bt_key, + from_bt_key, + ), } BigTableStore._set_options(self_mock, options) assert self_mock.column_name == "name_test" @@ -592,7 +617,9 @@ def table_name_gen(table): return table.name[::-1] self_mock.table_name_generator = table_name_gen - self_mock.bt_table_name = self_mock.table_name_generator(faust_table_mock) + self_mock.bt_table_name = self_mock.table_name_generator( + faust_table_mock + ) client_mock = MagicMock() instance_mock = MagicMock() @@ -626,7 +653,9 @@ def table_name_gen(table): # Test with no existing table self_mock.reset_mock() self_mock.table_name_generator = table_name_gen - self_mock.bt_table_name = self_mock.table_name_generator(faust_table_mock) + self_mock.bt_table_name = self_mock.table_name_generator( + faust_table_mock + ) table_mock.exists = MagicMock(return_value=False) return_value = BigTableStore._bigtable_setup( self_mock, faust_table_mock, options @@ -657,7 +686,9 @@ def test_bigtable_bigtable_get_on_empty(self, store): return_value = store._bigtable_get(self.TEST_KEY1) store._cache.contains.assert_called_with(self.TEST_KEY1) store._cache.get.assert_not_called() - store.bt_table.read_row.assert_called_with(self.TEST_KEY1, filter_="a_filter") + store.bt_table.read_row.assert_called_with( + self.TEST_KEY1, filter_="a_filter" + ) assert return_value is None def test_bigtable_bigtable_get_cache_miss(self, store): @@ -704,7 +735,9 @@ def test_bigtable_get_range_cache_miss(self, store): def test_bigtable_get_range_cache_hit(self, store): store._cache.get = MagicMock(return_value="cache_res") - result_value = store._bigtable_get_range([self.TEST_KEY1, self.TEST_KEY3]) + result_value = store._bigtable_get_range( + [self.TEST_KEY1, self.TEST_KEY3] + ) store.bt_table.read_rows.assert_not_called assert result_value == (self.TEST_KEY1, "cache_res") @@ -714,12 +747,16 @@ def test_bigtable_contains(self, store): store.bt_table.add_test_data([self.TEST_KEY1]) return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_called_with(self.TEST_KEY1, filter_="a_filter") + store.bt_table.read_row.assert_called_with( + self.TEST_KEY1, filter_="a_filter" + ) store._cache.delete.assert_not_called() assert return_value is True return_value = store._bigtable_contains(self.TEST_KEY2) - store.bt_table.read_row.assert_called_with(self.TEST_KEY2, filter_="a_filter") + store.bt_table.read_row.assert_called_with( + self.TEST_KEY2, filter_="a_filter" + ) store._cache.delete.assert_not_called() store._cache.delete.reset_mock() @@ -733,7 +770,9 @@ def test_bigtable_contains(self, store): store._cache.contains = MagicMock(return_value=False) return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_called_with(self.TEST_KEY1, filter_="a_filter") + store.bt_table.read_row.assert_called_with( + self.TEST_KEY1, filter_="a_filter" + ) store._cache.delete.assert_not_called() assert return_value is True @@ -783,10 +822,14 @@ def test_bigtable_set(self, store): store._cache.set = MagicMock(return_value=None) store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) - store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1, persist_offset=True) + store._bigtable_set( + self.TEST_KEY1, self.TEST_KEY1, persist_offset=True + ) store.bt_table.direct_row.assert_called_once_with(self.TEST_KEY1) - store._cache.set.assert_called_once_with(self.TEST_KEY1, self.TEST_KEY1) + store._cache.set.assert_called_once_with( + self.TEST_KEY1, self.TEST_KEY1 + ) row_mock.set_cell.assert_called_once_with( store.column_family_id, store.column_name, @@ -851,14 +894,18 @@ def test_partitions_for_key(self, store): def test_get_with_known_partition(self, store): partition = 19 - store._maybe_get_partition_from_message = MagicMock(return_value=partition) + store._maybe_get_partition_from_message = MagicMock( + return_value=partition + ) store._cache.set_partition = MagicMock() # Scenario: Found store._bigtable_get = MagicMock(return_value=b"a_value") res = store._get(self.TEST_KEY1) key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) store._bigtable_get.assert_called_once_with(key_with_partition) - store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, partition) + store._cache.set_partition.assert_called_once_with( + self.TEST_KEY1, partition + ) assert res == b"a_value" store._cache.set_partition.reset_mock() @@ -881,7 +928,9 @@ def test_get_with_unknown_partition(self, store): # Scenario: Found key_of_value = store._get_bigtable_key(self.TEST_KEY1, 19) - store._bigtable_get_range = MagicMock(return_value=(key_of_value, b"a_value")) + store._bigtable_get_range = MagicMock( + return_value=(key_of_value, b"a_value") + ) res = store._get(self.TEST_KEY1) store._partitions_for_key.assert_called_once_with(self.TEST_KEY1) store._bigtable_get_range.assert_called_once_with(keys_searched) @@ -898,13 +947,19 @@ def test_get_with_unknown_partition(self, store): def test_set(self, store): partition = 19 - faust.stores.bigtable.get_current_partition = MagicMock(return_value=partition) + faust.stores.bigtable.get_current_partition = MagicMock( + return_value=partition + ) store._bigtable_set = MagicMock() store._cache.set_partition = MagicMock() store._set(self.TEST_KEY1, b"a_value") key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) - store._bigtable_set.assert_called_once_with(key_with_partition, b"a_value") - store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, partition) + store._bigtable_set.assert_called_once_with( + key_with_partition, b"a_value" + ) + store._cache.set_partition.assert_called_once_with( + self.TEST_KEY1, partition + ) def test_del(self, store): store._cache._partition_cache = {self.TEST_KEY1: 19} @@ -924,7 +979,9 @@ def test_active_partitions(self, store): TP("a_changelogtopic", 19), TP("a_different_chaneglogtopic", 19), ] - store.app.assignor.assigned_actives = MagicMock(return_value=active_topics) + store.app.assignor.assigned_actives = MagicMock( + return_value=active_topics + ) store.app.conf.topic_partitions = 20 store.table.changelog_topic_name = "a_changelogtopic" store.table.is_global = False @@ -1083,7 +1140,9 @@ def test_set_persisted_offset(self, store): def test_persist_changelog_batch(self, store): # Scenario 1: no failure - store.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(0)] * 10) + store.bt_table.mutate_rows = MagicMock( + return_value=[MyTestResponse(0)] * 10 + ) store.log = MagicMock() store.log.error = MagicMock() store.set_persisted_offset = MagicMock() @@ -1095,8 +1154,12 @@ def test_persist_changelog_batch(self, store): tp2: 222, tp3: 333, } - store._persist_changelog_batch(["row1", "row2", "etc..."], offset_batch) - store.bt_table.mutate_rows.assert_called_with(["row1", "row2", "etc..."]) + store._persist_changelog_batch( + ["row1", "row2", "etc..."], offset_batch + ) + store.bt_table.mutate_rows.assert_called_with( + ["row1", "row2", "etc..."] + ) assert store.set_persisted_offset.call_count == len(offset_batch) store.set_persisted_offset.assert_called_with(tp3, 333, recovery=True) @@ -1105,8 +1168,12 @@ def test_persist_changelog_batch(self, store): # Scenario 2: all failure store.set_persisted_offset.reset_mock() store.bt_table.mutate_rows.reset_mock() - store.bt_table.mutate_rows = MagicMock(return_value=[MyTestResponse(404)]) - store._persist_changelog_batch(["row1", "row2", "etc..."], offset_batch) + store.bt_table.mutate_rows = MagicMock( + return_value=[MyTestResponse(404)] + ) + store._persist_changelog_batch( + ["row1", "row2", "etc..."], offset_batch + ) # FIXME: I'm not sure if we want that behaviour. # Question: What should happen on a failed mutated row in recovery. store.set_persisted_offset.assert_called() @@ -1167,7 +1234,7 @@ def test__fill_with_custom_key_prefix(self, store): partition = 0 for k in [self.TEST_KEY4, self.TEST_KEY5]: res = store._get_bigtable_key(k, partition) - expected_preload_id = b'\x00' + k[-24:] + expected_preload_id = b"\x00" + k[-24:] preload_id = store._cache._preload_id_from_key(res) assert preload_id == expected_preload_id assert res == expected_preload_id + k @@ -1176,7 +1243,7 @@ def test__fill_with_custom_key_prefix(self, store): ) res = store._get_bigtable_key(self.TEST_KEY6, partition) - expected_preload_id = b'\x00' + self.TEST_KEY6 + expected_preload_id = b"\x00" + self.TEST_KEY6 preload_id = store._cache._preload_id_from_key(res) assert preload_id == expected_preload_id assert res == expected_preload_id @@ -1193,14 +1260,18 @@ def test_contains_with_unknown_partition_and_key_transform(self, store): store._maybe_get_partition_from_message = MagicMock(return_value=None) store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) store._cache.contains_any = MagicMock(wraps=store._cache.contains_any) - store._bigtable_contains_any = MagicMock(wraps=store._bigtable_contains_any) + store._bigtable_contains_any = MagicMock( + wraps=store._bigtable_contains_any + ) keys_to_search = set() keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 1)) keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 3)) keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 19)) res = store._contains(self.TEST_KEY4) - res_contains = store._bigtable_contains_any.assert_called_once_with(keys_to_search) + res_contains = store._bigtable_contains_any.assert_called_once_with( + keys_to_search + ) assert res_contains is None assert res is False From 0ce50a40361d3aa100d075505b9bb08b84f0218c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 23 Mar 2023 18:04:47 +0100 Subject: [PATCH 349/616] fixed tests for bigtable --- tests/unit/stores/test_bigtable.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 05858bf9c..6b92ddf48 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -124,6 +124,7 @@ def test_init(self): assert cache.data == {} assert cache.ttl == -1 assert cache.ttl_over is False + assert cache.is_complete is True # Test with custom size cache = BigTableValueCache(size=123) @@ -682,10 +683,8 @@ def store(self, bt_imports): def test_bigtable_bigtable_get_on_empty(self, store): store._cache.get = MagicMock(return_value=None) - store._cache.contains = MagicMock(return_value=False) return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.contains.assert_called_with(self.TEST_KEY1) - store._cache.get.assert_not_called() + store._cache.get.assert_called_with(self.TEST_KEY1) store.bt_table.read_row.assert_called_with( self.TEST_KEY1, filter_="a_filter" ) @@ -696,7 +695,7 @@ def test_bigtable_bigtable_get_cache_miss(self, store): store._cache.contains = MagicMock(return_value=False) store.bt_table.add_test_data([self.TEST_KEY1]) return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_not_called() + store._cache.get.assert_called_with(self.TEST_KEY1) store.bt_table.read_row.assert_called_once_with( self.TEST_KEY1, filter_="a_filter" ) @@ -743,37 +742,36 @@ def test_bigtable_get_range_cache_hit(self, store): def test_bigtable_contains(self, store): store._cache.contains = MagicMock(return_value=None) - store._cache.delete = MagicMock(return_value=None) store.bt_table.add_test_data([self.TEST_KEY1]) return_value = store._bigtable_contains(self.TEST_KEY1) store.bt_table.read_row.assert_called_with( self.TEST_KEY1, filter_="a_filter" ) - store._cache.delete.assert_not_called() assert return_value is True return_value = store._bigtable_contains(self.TEST_KEY2) store.bt_table.read_row.assert_called_with( self.TEST_KEY2, filter_="a_filter" ) - store._cache.delete.assert_not_called() - store._cache.delete.reset_mock() store.bt_table.read_row.reset_mock() store._cache.contains = MagicMock(return_value=True) return_value = store._bigtable_contains(self.TEST_KEY1) store.bt_table.read_row.assert_not_called() - store._cache.delete.assert_not_called() assert return_value is True store._cache.contains = MagicMock(return_value=False) return_value = store._bigtable_contains(self.TEST_KEY1) + store.bt_table.read_row.assert_not_called() + assert return_value is False + + store._cache.contains = MagicMock(return_value=None) + return_value = store._bigtable_contains(self.TEST_KEY1) store.bt_table.read_row.assert_called_with( self.TEST_KEY1, filter_="a_filter" ) - store._cache.delete.assert_not_called() assert return_value is True def test_bigtable_contains_any(self, store): From cff28e7a9ef1516048ebe4c9feb7b94fcb03f0c2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 23 Mar 2023 18:06:08 +0100 Subject: [PATCH 350/616] also test custom ttl --- tests/unit/stores/test_bigtable.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 6b92ddf48..f527130fc 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -130,6 +130,14 @@ def test_init(self): cache = BigTableValueCache(size=123) assert isinstance(cache.data, LRUCache) assert cache.data.limit == 123 + assert cache.is_complete is False + + # Test with custom ttl + cache = BigTableValueCache(ttl=123) + assert cache.data == {} + assert cache.ttl == 123 + assert cache.ttl_over is False + assert cache.is_complete is False def test__set_del_len_and_getitem(self): cache = BigTableValueCache() From 9782960de838ef2d21f0033f5a2a67b4177a7d07 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 23 Mar 2023 18:06:54 +0100 Subject: [PATCH 351/616] changed contains method to faster check the cache --- faust/stores/bigtable.py | 81 +++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 02a7aa798..920676a31 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -61,6 +61,7 @@ def __init__(self, ttl=-1, size: Optional[int] = None) -> None: self.ttl = ttl self.ttl_over = False self.init_ts = int(time.time()) + self.is_complete = (ttl == -1) and (size is None) def __len__(self): return len(self.data) @@ -137,8 +138,12 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): start = time.time() if self._value_cache is not None: try: - row_set, row_filter = self._get_preload_rowset_and_filter(preload_ids_todo) - for row in self.bt_table.read_rows(row_set=row_set, filter_=row_filter): + row_set, row_filter = self._get_preload_rowset_and_filter( + preload_ids_todo + ) + for row in self.bt_table.read_rows( + row_set=row_set, filter_=row_filter + ): if row.row_key in self._mutations.keys(): mutation_val = self._mutations[row.row_key][1] if mutation_val is not None: @@ -148,7 +153,9 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): self._value_cache[row.row_key] = value yield row.row_key except Exception as e: - self.log.info(f"BigTableStore fill failed for {preload_ids_todo=}, {bt_keys=}") + self.log.info( + f"BigTableStore fill failed for {preload_ids_todo=}, {bt_keys=}" + ) raise e end = time.time() self.log.info( @@ -191,7 +198,10 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return self._mutations[bt_key][1] is not None if self._value_cache is not None: self._fill_if_empty({bt_key}) - return bt_key in self._value_cache.keys() + found = bt_key in self._value_cache.keys() + if self.is_complete: + return found + return True if found else None return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: @@ -201,7 +211,10 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: return True if self._value_cache is not None: self._fill_if_empty(key_set) - return not self._value_cache.keys().isdisjoint(key_set) + found = not self._value_cache.keys().isdisjoint(key_set) + if self.is_complete: + return found + return True if found else None return None def flush_if_timer_over(self, tp: TP) -> bool: @@ -258,17 +271,25 @@ def _init_value_cache( enable = options.get(BigTableStore.VALUE_CACHE_ENABLE_KEY, False) if enable: # TODO Maybe we need to remove invalidation time and size - ttl = options.get(BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1) + ttl = options.get( + BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1 + ) size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) self._value_cache = BigTableValueCache(ttl=ttl, size=size) + self.is_complete = self._value_cache.is_complete else: self._value_cache = None + self.is_complete = False def _init_mutation_buffer(self, options): self._mut_freq = options.get(BigTableStore.BT_MUTATION_FREQ_KEY, 0) # To prevent that all tables write at the same time - self._last_flush = {} # time.time() + self._mut_freq - random_start_offset - self._max_mutations = options.get(BigTableStore.BT_MAX_MUTATIONS, 10000) + self._last_flush = ( + {} + ) # time.time() + self._mut_freq - random_start_offset + self._max_mutations = options.get( + BigTableStore.BT_MAX_MUTATIONS, 10000 + ) self._mutations = {} @@ -320,13 +341,15 @@ def default_translator(user_key): def _set_options(self, options) -> None: self._transform_key_to_bt, self._transform_key_from_bt = options.get( BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY, - (self.default_translator, self.default_translator) + (self.default_translator, self.default_translator), ) self._all_options = options self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) - self.column_name = options.get(BigTableStore.BT_COLUMN_NAME_KEY, "DATA") + self.column_name = options.get( + BigTableStore.BT_COLUMN_NAME_KEY, "DATA" + ) self.row_filter = BT.CellsColumnLimitFilter(1) self.offset_key_prefix = options.get( BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" @@ -350,7 +373,9 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): ) self.bt_table.create( column_families={ - self.column_family_id: BT.column_family.MaxVersionsGCRule(1) + self.column_family_id: BT.column_family.MaxVersionsGCRule( + 1 + ) } ) else: @@ -364,8 +389,9 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes) -> Optional[bytes]: - if self._cache.contains(key) is True: - return self._cache.get(key) + cached_value = self._cache.get(key) + if cached_value is not None: + return cached_value else: res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: @@ -377,7 +403,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_contains(self, key: bytes) -> bool: cache_contains = self._cache.contains(key) - if cache_contains is True: + if cache_contains is not None: return cache_contains row = self.bt_table.read_row(key, filter_=self.row_filter) @@ -387,7 +413,7 @@ def _bigtable_contains(self, key: bytes) -> bool: def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: cache_contains = self._cache.contains_any(keys) - if cache_contains is True: + if cache_contains is not None: return cache_contains rows = BT.RowSet() @@ -424,7 +450,9 @@ def _bigtable_get_range( # Not found return None, None - def _bigtable_set(self, key: bytes, value: Optional[bytes], persist_offset=False): + def _bigtable_set( + self, key: bytes, value: Optional[bytes], persist_offset=False + ): if not persist_offset: # All mutatations set here will be flushed to BT later self._cache.set(key, value) @@ -502,7 +530,9 @@ def _get(self, key: bytes) -> Optional[bytes]: return value return None except KeyError as ke: - self.log.error(f"KeyError in get for table {self.table_name} for {key=}") + self.log.error( + f"KeyError in get for table {self.table_name} for {key=}" + ) raise ke except Exception as ex: self.log.error( @@ -513,7 +543,9 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: partition = get_current_partition() - key_with_partition = self._get_bigtable_key(key, partition=partition) + key_with_partition = self._get_bigtable_key( + key, partition=partition + ) self._bigtable_set(key_with_partition, value) self._cache.set_partition(key, partition) except Exception as ex: @@ -607,7 +639,9 @@ def _iterkeys(self) -> Iterator[bytes]: yield key end = time.time() - self.log.info(f"Finished iterkeys for {self.table_name} in {end - start}s") + self.log.info( + f"Finished iterkeys for {self.table_name} in {end - start}s" + ) except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -689,7 +723,9 @@ def persisted_offset(self, tp: TP) -> Optional[int]: return offset return None - def set_persisted_offset(self, tp: TP, offset: int, recovery=False) -> None: + def set_persisted_offset( + self, tp: TP, offset: int, recovery=False + ) -> None: """Set the last persisted offset for this table. This will remember the last offset that we wrote to BigTableStore, @@ -743,7 +779,9 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - offset_key = self._get_bigtable_key(msg.key, partition=tp.partition) + offset_key = self._get_bigtable_key( + msg.key, partition=tp.partition + ) row = self.bt_table.direct_row(offset_key) if msg.value is None: row.delete() @@ -795,7 +833,6 @@ def revoke_partitions(self, tps: Set[TP]) -> None: self._cache.delete_partition(tp.partition) gc.collect() - async def on_rebalance( self, assigned: Set[TP], From 4bc6c88583f7f51694707f1b60b84e2a8ffceabd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 14:02:30 +0200 Subject: [PATCH 352/616] table_contains really just checks the table now --- faust/stores/bigtable.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 920676a31..9fa5b7d50 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -402,9 +402,9 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: return value def _bigtable_contains(self, key: bytes) -> bool: - cache_contains = self._cache.contains(key) - if cache_contains is not None: - return cache_contains + # cache_contains = self._cache.contains(key) + # if cache_contains is not None: + # return cache_contains row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: @@ -412,9 +412,9 @@ def _bigtable_contains(self, key: bytes) -> bool: return False def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: - cache_contains = self._cache.contains_any(keys) - if cache_contains is not None: - return cache_contains + # cache_contains = self._cache.contains_any(keys) + # if cache_contains is not None: + # return cache_contains rows = BT.RowSet() for key in keys: @@ -529,11 +529,6 @@ def _get(self, key: bytes) -> Optional[bytes]: self._cache.set_partition(key, partition) return value return None - except KeyError as ke: - self.log.error( - f"KeyError in get for table {self.table_name} for {key=}" - ) - raise ke except Exception as ex: self.log.error( f"Error in get for table {self.table_name} exception {ex} key {key}" From 4a7a2720f141fe03f8f1b2ecc78952428df1fa67 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 16:12:23 +0200 Subject: [PATCH 353/616] try to fix contains --- faust/stores/bigtable.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9fa5b7d50..202102136 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -194,27 +194,29 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - if bt_key in self._mutations.keys(): - return self._mutations[bt_key][1] is not None if self._value_cache is not None: self._fill_if_empty({bt_key}) found = bt_key in self._value_cache.keys() if self.is_complete: return found return True if found else None + + if bt_key in self._mutations.keys(): + return self._mutations[bt_key][1] is not None return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: - mutations = key_set.intersection(self._mutations.keys()) - found = any(mut[1] is not None for mut in mutations) - if found: - return True if self._value_cache is not None: self._fill_if_empty(key_set) found = not self._value_cache.keys().isdisjoint(key_set) if self.is_complete: return found return True if found else None + + mutations = key_set.intersection(self._mutations.keys()) + found = any(mut[1] is not None for mut in mutations) + if found: + return True return None def flush_if_timer_over(self, tp: TP) -> bool: @@ -402,9 +404,9 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: return value def _bigtable_contains(self, key: bytes) -> bool: - # cache_contains = self._cache.contains(key) - # if cache_contains is not None: - # return cache_contains + cache_contains = self._cache.contains(key) + if cache_contains is not None: + return cache_contains row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: @@ -412,9 +414,9 @@ def _bigtable_contains(self, key: bytes) -> bool: return False def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: - # cache_contains = self._cache.contains_any(keys) - # if cache_contains is not None: - # return cache_contains + cache_contains = self._cache.contains_any(keys) + if cache_contains is not None: + return cache_contains rows = BT.RowSet() for key in keys: From fe74fc52b50ba587bfee8e6e0b66fd0afc864a46 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 16:53:47 +0200 Subject: [PATCH 354/616] just return true on contains if table exists --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 202102136..d9eb4d29b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -405,7 +405,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_contains(self, key: bytes) -> bool: cache_contains = self._cache.contains(key) - if cache_contains is not None: + if cache_contains is True: return cache_contains row = self.bt_table.read_row(key, filter_=self.row_filter) @@ -415,7 +415,7 @@ def _bigtable_contains(self, key: bytes) -> bool: def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: cache_contains = self._cache.contains_any(keys) - if cache_contains is not None: + if cache_contains is True: return cache_contains rows = BT.RowSet() From 3e0ea0c796f618fbda3bbcc07cf8ef38a686f629 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 17:16:44 +0200 Subject: [PATCH 355/616] return value from cache --- faust/stores/bigtable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d9eb4d29b..4c713a741 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -391,9 +391,8 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes) -> Optional[bytes]: - cached_value = self._cache.get(key) - if cached_value is not None: - return cached_value + if self._cache.contains(key): + return self._cache.get(key) else: res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: From 790044a6dda3afc7a612881ebf80addc76261e8b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 17:31:33 +0200 Subject: [PATCH 356/616] change get to also return None values if cached --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4c713a741..4e5d5fd5c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -391,7 +391,7 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes) -> Optional[bytes]: - if self._cache.contains(key): + if self._cache.contains(key) is not None: return self._cache.get(key) else: res = self.bt_table.read_row(key, filter_=self.row_filter) From e4e72f4a7a3a266029e5ab687f9f9a09b8d9cd44 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 17:52:55 +0200 Subject: [PATCH 357/616] fixed contains again --- faust/stores/bigtable.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4e5d5fd5c..1b401f79f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -194,6 +194,11 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ + # A mutation could be present in the buffer but not in + # in the table. + if bt_key in self._mutations.keys(): + return self._mutations[bt_key][1] is not None + if self._value_cache is not None: self._fill_if_empty({bt_key}) found = bt_key in self._value_cache.keys() @@ -201,11 +206,16 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return found return True if found else None - if bt_key in self._mutations.keys(): - return self._mutations[bt_key][1] is not None return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: + # A mutation could be present in the buffer but not in + # in the table. + mutations = key_set.intersection(self._mutations.keys()) + found = any(mut[1] is not None for mut in mutations) + if found: + return True + if self._value_cache is not None: self._fill_if_empty(key_set) found = not self._value_cache.keys().isdisjoint(key_set) @@ -213,10 +223,6 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: return found return True if found else None - mutations = key_set.intersection(self._mutations.keys()) - found = any(mut[1] is not None for mut in mutations) - if found: - return True return None def flush_if_timer_over(self, tp: TP) -> bool: @@ -272,7 +278,6 @@ def _init_value_cache( ) -> Optional[Union[LRUCache, BigTableValueCache]]: enable = options.get(BigTableStore.VALUE_CACHE_ENABLE_KEY, False) if enable: - # TODO Maybe we need to remove invalidation time and size ttl = options.get( BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1 ) @@ -404,8 +409,8 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_contains(self, key: bytes) -> bool: cache_contains = self._cache.contains(key) - if cache_contains is True: - return cache_contains + if cache_contains is not None: + return True row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: @@ -414,7 +419,7 @@ def _bigtable_contains(self, key: bytes) -> bool: def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: cache_contains = self._cache.contains_any(keys) - if cache_contains is True: + if cache_contains is not None: return cache_contains rows = BT.RowSet() From 079c26bed2f2aa31d6d8d906634b293685fc43c0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 17:54:09 +0200 Subject: [PATCH 358/616] changed contains --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1b401f79f..97886d5f6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -397,6 +397,8 @@ def bigtable_exrtact_row_data(row_data): def _bigtable_get(self, key: bytes) -> Optional[bytes]: if self._cache.contains(key) is not None: + # This means that we are sure that the value + # in the cache is either None or exists return self._cache.get(key) else: res = self.bt_table.read_row(key, filter_=self.row_filter) From e5082b9d8d6546e29a0a735a2c2cc2cd7c82ffac Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 17:59:46 +0200 Subject: [PATCH 359/616] fixed get range --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 97886d5f6..22ed4d0a7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -440,8 +440,8 @@ def _bigtable_get_range( ) -> Tuple[Optional[bytes], Optional[bytes]]: # first search cache: for key in keys: - value = self._cache.get(key) - if value is not None: + if self._cache.contains(key) is not None: + value = self._cache.get(key) return key, value rows = BT.RowSet() From 5fd237909492407aebab91940f7d6139118080fe Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 18:25:06 +0200 Subject: [PATCH 360/616] return correct value in cache contains --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 22ed4d0a7..8503e73d9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -412,7 +412,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_contains(self, key: bytes) -> bool: cache_contains = self._cache.contains(key) if cache_contains is not None: - return True + return cache_contains row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: From 0f8977536911c0d122a2f22b4e8ad050de8a43f4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 28 Mar 2023 18:27:03 +0200 Subject: [PATCH 361/616] fixed tests --- tests/unit/stores/test_bigtable.py | 33 ++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index f527130fc..1fcbaeec1 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -690,20 +690,25 @@ def store(self, bt_imports): return store def test_bigtable_bigtable_get_on_empty(self, store): - store._cache.get = MagicMock(return_value=None) + store._cache.contains = MagicMock(return_value=False) + return_value = store._bigtable_get(self.TEST_KEY1) + store._cache.contains.assert_called_with(self.TEST_KEY1) + store.bt_table.read_row.assert_not_called() + assert return_value is None + + store._cache.contains = MagicMock(return_value=None) return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_called_with(self.TEST_KEY1) + store._cache.contains.assert_called_with(self.TEST_KEY1) store.bt_table.read_row.assert_called_with( self.TEST_KEY1, filter_="a_filter" ) assert return_value is None + def test_bigtable_bigtable_get_cache_miss(self, store): - store._cache.get = MagicMock(return_value=None) - store._cache.contains = MagicMock(return_value=False) + store._cache.contains = MagicMock(return_value=None) store.bt_table.add_test_data([self.TEST_KEY1]) return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_called_with(self.TEST_KEY1) store.bt_table.read_row.assert_called_once_with( self.TEST_KEY1, filter_="a_filter" ) @@ -718,8 +723,15 @@ def test_bigtable_bigtable_get_cache_hit(self, store): store.bt_table.read_row.assert_not_called() assert return_value == b"cache_res" + store._cache.contains = MagicMock(return_value=False) + store._cache.get = MagicMock(return_value=b"cache_res") + return_value = store._bigtable_get(self.TEST_KEY1) + store._cache.get.assert_called_once_with(self.TEST_KEY1) + store.bt_table.read_row.assert_not_called() + assert return_value == b"cache_res" + def test_bigtable_get_range_cache_miss(self, store): - store._cache.get = MagicMock(return_value=None) + store._cache.contains = MagicMock(return_value=None) test_keys_in = [self.TEST_KEY1, self.TEST_KEY3] # order is important test_keys_not_in = { @@ -898,6 +910,15 @@ def test_partitions_for_key(self, store): store._cache.get_partition.assert_called_once_with(self.TEST_KEY2) assert res == [1, 2, 3] + def test_get_keyerror(self, store): + partition = 19 + store._maybe_get_partition_from_message = MagicMock( + return_value=partition + ) + store._bigtable_get = MagicMock(return_value=None) + with pytest.raises(KeyError): + store[self.TEST_KEY1.decode()] + def test_get_with_known_partition(self, store): partition = 19 store._maybe_get_partition_from_message = MagicMock( From 93dba94dd593b63a358a57838080aab61bae29a7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 29 Mar 2023 14:20:02 +0200 Subject: [PATCH 362/616] fixed test for delete mutations on missing keys --- faust/stores/bigtable.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8503e73d9..7a63cc5cd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -196,7 +196,7 @@ def contains(self, bt_key: bytes) -> Optional[bool]: """ # A mutation could be present in the buffer but not in # in the table. - if bt_key in self._mutations.keys(): + if bt_key in self._mutations: return self._mutations[bt_key][1] is not None if self._value_cache is not None: @@ -211,8 +211,8 @@ def contains(self, bt_key: bytes) -> Optional[bool]: def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: # A mutation could be present in the buffer but not in # in the table. - mutations = key_set.intersection(self._mutations.keys()) - found = any(mut[1] is not None for mut in mutations) + mutations = key_set.intersection(self._mutations) + found = any(self._mutations[mut][1] is not None for mut in mutations) if found: return True @@ -249,7 +249,7 @@ def flush_if_timer_over(self, tp: TP) -> bool: return flushed def _set_mutation(self, bt_key: bytes, value: Optional[bytes]): - if bt_key in self._mutations.keys(): + if bt_key in self._mutations: row = self._mutations[bt_key][0] else: row = self.bt_table.direct_row(bt_key) @@ -290,10 +290,9 @@ def _init_value_cache( def _init_mutation_buffer(self, options): self._mut_freq = options.get(BigTableStore.BT_MUTATION_FREQ_KEY, 0) - # To prevent that all tables write at the same time self._last_flush = ( {} - ) # time.time() + self._mut_freq - random_start_offset + ) self._max_mutations = options.get( BigTableStore.BT_MAX_MUTATIONS, 10000 ) @@ -403,7 +402,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: else: res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: - self.log.info(f"{key=} not found in {self.table_name}") + self.log.error(f"{key=} not found in {self.table_name}") value = None else: value = self.bigtable_exrtact_row_data(res) @@ -630,6 +629,7 @@ def _iterkeys(self) -> Iterator[bytes]: ): if row.row_key in found_mutations: continue + if self._cache._value_cache is not None: data = self.bigtable_exrtact_row_data(row) # We don't want to set mutations here From 8989a01f19a9e4bf8013af2958e0221039ef9da6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 29 Mar 2023 14:53:51 +0200 Subject: [PATCH 363/616] added return value that is true or false for mutations that were not set --- faust/stores/bigtable.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7a63cc5cd..f6410018d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -212,9 +212,9 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: # A mutation could be present in the buffer but not in # in the table. mutations = key_set.intersection(self._mutations) - found = any(self._mutations[mut][1] is not None for mut in mutations) - if found: - return True + if len(mutations) > 0: + found = any(self._mutations[mut][1] is not None for mut in mutations) + return found if self._value_cache is not None: self._fill_if_empty(key_set) From cbe6195090329842ca2cd3be30e4a51580122fa5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 31 Mar 2023 11:33:20 +0200 Subject: [PATCH 364/616] fixed tests and delete --- faust/stores/bigtable.py | 20 +++++++++++++------- tests/unit/stores/test_bigtable.py | 12 ++++++++---- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f6410018d..e23370924 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -209,12 +209,10 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: - # A mutation could be present in the buffer but not in - # in the table. mutations = key_set.intersection(self._mutations) if len(mutations) > 0: found = any(self._mutations[mut][1] is not None for mut in mutations) - return found + found = False if self._value_cache is not None: self._fill_if_empty(key_set) @@ -438,10 +436,17 @@ def _bigtable_get_range( self, keys: Set[bytes] ) -> Tuple[Optional[bytes], Optional[bytes]]: # first search cache: + found_delete = False for key in keys: - if self._cache.contains(key) is not None: + is_cached = self._cache.contains(key) + if is_cached is True: value = self._cache.get(key) return key, value + elif is_cached is False: + found_delete = True + + if found_delete: + return None, None rows = BT.RowSet() for key in keys: @@ -464,6 +469,7 @@ def _bigtable_set( # All mutatations set here will be flushed to BT later self._cache.set(key, value) else: + # Except if we want to persist the current offset row = self.bt_table.direct_row(key) row.set_cell( self.column_family_id, @@ -473,10 +479,10 @@ def _bigtable_set( row.commit() def _bigtable_del(self, key: bytes): - row = self.bt_table.direct_row(key) + # Just operate on the cache, + # the mutation will be commited if the + # mutations buffer is flushed self._cache.delete(key) - row.delete() - row.commit() def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 1fcbaeec1..cdc0dfad0 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -754,6 +754,14 @@ def test_bigtable_get_range_cache_miss(self, store): def test_bigtable_get_range_cache_hit(self, store): store._cache.get = MagicMock(return_value="cache_res") + store._cache.contains = MagicMock(return_value=False) + result_value = store._bigtable_get_range( + [self.TEST_KEY1, self.TEST_KEY3] + ) + store.bt_table.read_rows.assert_not_called + assert result_value == (None, None) + + store._cache.contains = MagicMock(return_value=True) result_value = store._bigtable_get_range( [self.TEST_KEY1, self.TEST_KEY3] ) @@ -826,11 +834,7 @@ def test_bigtable_delete(self, store): store._cache.delete = MagicMock(return_value=None) store._bigtable_del(self.TEST_KEY1) - - store.bt_table.direct_row.assert_called_once_with(self.TEST_KEY1) store._cache.delete.assert_called_once_with(self.TEST_KEY1) - row_mock.delete.assert_called_once() - row_mock.commit.assert_called_once() def test_bigtable_set(self, store): row_mock = MagicMock() From f9244a72eb9b39376b3005d16a410fb8cd2587a6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Apr 2023 15:31:51 +0200 Subject: [PATCH 365/616] added unit tests and fixed contains any --- faust/stores/bigtable.py | 8 +-- tests/unit/stores/test_bigtable.py | 82 +++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e23370924..5297e0f0b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -212,7 +212,7 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: mutations = key_set.intersection(self._mutations) if len(mutations) > 0: found = any(self._mutations[mut][1] is not None for mut in mutations) - found = False + return found if self._value_cache is not None: self._fill_if_empty(key_set) @@ -469,7 +469,7 @@ def _bigtable_set( # All mutatations set here will be flushed to BT later self._cache.set(key, value) else: - # Except if we want to persist the current offset + # Except if we want to persist the current offset row = self.bt_table.direct_row(key) row.set_cell( self.column_family_id, @@ -479,8 +479,8 @@ def _bigtable_set( row.commit() def _bigtable_del(self, key: bytes): - # Just operate on the cache, - # the mutation will be commited if the + # Just operate on the cache, + # the mutation will be commited if the # mutations buffer is flushed self._cache.delete(key) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index cdc0dfad0..be5030947 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -760,7 +760,7 @@ def test_bigtable_get_range_cache_hit(self, store): ) store.bt_table.read_rows.assert_not_called assert result_value == (None, None) - + store._cache.contains = MagicMock(return_value=True) result_value = store._bigtable_get_range( [self.TEST_KEY1, self.TEST_KEY3] @@ -1344,3 +1344,83 @@ def __init__(self, message): store._persist_changelog_batch.assert_called_once() tp_offsets = store._persist_changelog_batch.call_args_list[0].args[1] assert tp_offsets == {tp: 3, tp2: 4} + + + def test_modification_with_mutation_buffer(self, store): + + # Mocks + TEST_TP = TP("a", 0) + TEST_OFFSET = 0 + OFFSET_KEY = store.get_offset_key(TEST_TP).encode() + + def real_set_scenario(key, value, offset): + store._set(key, value) + store._bigtable_set.reset_mock() + store.set_persisted_offset(TEST_TP, offset) + return offset + 1 + + def real_del_scenario(key, offset): + store._del(key) + store._bigtable_set.reset_mock() + store.set_persisted_offset(TEST_TP, offset) + return offset + 1 + + def assert_offset_persisted(offset): + store._bigtable_set.assert_called_with( + OFFSET_KEY, str(offset).encode(), persist_offset=True + ) + + + row_mock = MagicMock() + row_mock.row_key = b"\x00TEST_KEY1" + + store._cache.bt_table.direct_row = MagicMock(return_value=row_mock) + store._cache.bt_table.mutate_rows = MagicMock( + return_value=[MyTestResponse(0)] * 1 + ) + store._cache._set_mutation = MagicMock(wraps=store._cache._set_mutation) + + time.time = MagicMock(return_value=0) + store._bigtable_set = MagicMock(wraps=store._bigtable_set) + partition = 0 + faust.stores.bigtable.get_current_partition = MagicMock( + return_value=partition + ) + store._cache.set_partition = MagicMock() + # Flush every 10 seconds + store._cache._mut_freq = 10 + store._cache._last_flush = {TEST_TP.partition: 0} + res = store._contains(self.TEST_KEY1) + store._bigtable_set.assert_not_called() + assert res is False + + TEST_OFFSET = real_set_scenario(self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET) + res = store._contains(self.TEST_KEY1) + assert res is True + assert store._cache._set_mutation.call_count == TEST_OFFSET + assert len(store._cache._mutations) == 1 + store._bigtable_set.assert_not_called() + + TEST_OFFSET = real_set_scenario(self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET) + res = store._contains(self.TEST_KEY1) + assert res is True + assert store._cache._set_mutation.call_count == TEST_OFFSET + assert len(store._cache._mutations) == 1 + store._bigtable_set.assert_not_called() + + time.time = MagicMock(return_value=20) # Now we should flush + + TEST_OFFSET = real_del_scenario(self.TEST_KEY1, TEST_OFFSET) + res = store._contains(self.TEST_KEY1) + assert res is False + + assert_offset_persisted(TEST_OFFSET - 1) + assert len(store._cache._mutations) == 0 + assert store._cache._set_mutation.call_count == TEST_OFFSET + + TEST_OFFSET = real_set_scenario(self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET) + res = store._contains(self.TEST_KEY1) + store._bigtable_set.assert_not_called() + assert store._cache._set_mutation.call_count == TEST_OFFSET + assert len(store._cache._mutations) == 1 + assert res is True From e42d4eb62a8839b2eedd2b6c73beec91656dc5fc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Apr 2023 18:04:35 +0200 Subject: [PATCH 366/616] removed mutation-buffer and tests --- faust/stores/bigtable.py | 134 +++-------------- tests/unit/stores/test_bigtable.py | 222 +++-------------------------- 2 files changed, 42 insertions(+), 314 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5297e0f0b..1490737ee 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -95,7 +95,6 @@ def keys(self): class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[BigTableValueCache] - _mutations: Dict[bytes, Tuple[BT.DirectRow, Optional[bytes]]] def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) @@ -106,7 +105,6 @@ def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: ) self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) - self._init_mutation_buffer(options) self._finished_preloads: Set[bytes] = set() def _fill_if_empty(self, bt_keys): @@ -144,13 +142,8 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): for row in self.bt_table.read_rows( row_set=row_set, filter_=row_filter ): - if row.row_key in self._mutations.keys(): - mutation_val = self._mutations[row.row_key][1] - if mutation_val is not None: - self._value_cache[row.row_key] = mutation_val - else: - value = BigTableStore.bigtable_exrtact_row_data(row) - self._value_cache[row.row_key] = value + value = BigTableStore.bigtable_exrtact_row_data(row) + self._value_cache[row.row_key] = value yield row.row_key except Exception as e: self.log.info( @@ -165,8 +158,6 @@ def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): self._finished_preloads.update(preload_ids_todo) def get(self, bt_key: bytes) -> Optional[bytes]: - if bt_key in self._mutations.keys(): - return self._mutations[bt_key][1] if self._value_cache is not None: self._fill_if_empty({bt_key}) if bt_key in self._value_cache.keys(): @@ -176,12 +167,10 @@ def get(self, bt_key: bytes) -> Optional[bytes]: def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: self._value_cache[bt_key] = value - self._set_mutation(bt_key, value) def delete(self, bt_key: bytes) -> None: if self._value_cache is not None: del self._value_cache[bt_key] - self._set_mutation(bt_key, None) def get_partition(self, user_key: bytes) -> int: return self._partition_cache[user_key] @@ -194,11 +183,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - # A mutation could be present in the buffer but not in - # in the table. - if bt_key in self._mutations: - return self._mutations[bt_key][1] is not None - if self._value_cache is not None: self._fill_if_empty({bt_key}) found = bt_key in self._value_cache.keys() @@ -209,11 +193,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: - mutations = key_set.intersection(self._mutations) - if len(mutations) > 0: - found = any(self._mutations[mut][1] is not None for mut in mutations) - return found - if self._value_cache is not None: self._fill_if_empty(key_set) found = not self._value_cache.keys().isdisjoint(key_set) @@ -223,52 +202,12 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: return None - def flush_if_timer_over(self, tp: TP) -> bool: - now = time.time() - flushed = False - last_flush = self._last_flush.get(tp.partition, now - self._mut_freq) - max_reached = len(self._mutations) >= self._max_mutations - if now >= last_flush + self._mut_freq or max_reached: - mutatations_copy = self._mutations.copy() - mutatations = [ - m[0] - for m in mutatations_copy.values() - if tp.partition == m[0].row_key[0] - ] - if len(mutatations) > 0: - response = self.bt_table.mutate_rows(mutatations) - for i, status in enumerate(response): - if status.code != 0: - self.log.error(f"Row number {i} failed to write") - else: - self._mutations.pop(mutatations[i].row_key) - flushed = True - self._last_flush[tp.partition] = now - return flushed - - def _set_mutation(self, bt_key: bytes, value: Optional[bytes]): - if bt_key in self._mutations: - row = self._mutations[bt_key][0] - else: - row = self.bt_table.direct_row(bt_key) - - if value is None: - row.delete() - else: - row.set_cell( - "FaustColumnFamily", # TODO: Define this globally - "DATA", - value, - ) - self._mutations[bt_key] = row, value - def delete_partition(self, partition: int): if self._value_cache is not None: keys = set(self._value_cache.keys()) for k in keys: if k[0] == partition: del self._value_cache[k] - self._mutations.pop(k, None) self._partition_cache.pop(k[1:], None) def _init_value_cache( @@ -286,16 +225,6 @@ def _init_value_cache( self._value_cache = None self.is_complete = False - def _init_mutation_buffer(self, options): - self._mut_freq = options.get(BigTableStore.BT_MUTATION_FREQ_KEY, 0) - self._last_flush = ( - {} - ) - self._max_mutations = options.get( - BigTableStore.BT_MAX_MUTATIONS, 10000 - ) - self._mutations = {} - class BigTableStore(base.SerializedStore): """Bigtable table storage.""" @@ -311,8 +240,6 @@ class BigTableStore(base.SerializedStore): BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_PROJECT_KEY = "bt_project_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" - BT_MUTATION_FREQ_KEY = "bt_mutation_freq_key" - BT_MAX_MUTATIONS = "bt_max_mutations" BT_CUSTOM_KEY_TRANSLATOR_KEY = "bt_custom_key_translator_key" VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" @@ -462,27 +389,22 @@ def _bigtable_get_range( # Not found return None, None - def _bigtable_set( - self, key: bytes, value: Optional[bytes], persist_offset=False - ): - if not persist_offset: - # All mutatations set here will be flushed to BT later - self._cache.set(key, value) - else: - # Except if we want to persist the current offset - row = self.bt_table.direct_row(key) - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) - row.commit() + def _bigtable_set(self, bt_key: bytes, value: Optional[bytes]): + self._cache.set(bt_key, value) + row = self.bt_table.direct_row(bt_key) + row.set_cell( + self.column_family_id, + self.column_name, + value, + ) + row.commit() + + def _bigtable_del(self, bt_key: bytes): + self._cache.delete(bt_key) + row = self.bt_table.direct_row(bt_key) + row.delete() + row.commit() - def _bigtable_del(self, key: bytes): - # Just operate on the cache, - # the mutation will be commited if the - # mutations buffer is flushed - self._cache.delete(key) def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -624,21 +546,11 @@ def _iterkeys(self) -> Iterator[bytes]: prefix_end = self._get_partition_prefix(partition + 1) row_set.add_row_range_from_keys(prefix_start, prefix_end) - found_mutations = set() - for k, mut in self._cache._mutations.items(): - if mut[1] is not None: - yield self._get_faust_key(k) - found_mutations.add(k) - for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): - if row.row_key in found_mutations: - continue - if self._cache._value_cache is not None: data = self.bigtable_exrtact_row_data(row) - # We don't want to set mutations here self._cache._value_cache[row.row_key] = data preload_id = self._cache._preload_id_from_key(row.row_key) self._cache._finished_preloads.add(preload_id) @@ -733,7 +645,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: return None def set_persisted_offset( - self, tp: TP, offset: int, recovery=False + self, tp: TP, offset: int ) -> None: """Set the last persisted offset for this table. @@ -743,12 +655,10 @@ def set_persisted_offset( we were not an active replica. """ try: - if recovery or self._cache.flush_if_timer_over(tp): - offset_key = self.get_offset_key(tp).encode() - self._bigtable_set( - offset_key, str(offset).encode(), persist_offset=True - ) - + offset_key = self.get_offset_key(tp).encode() + self._bigtable_set( + offset_key, str(offset).encode() + ) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index be5030947..9bab6a08e 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -196,9 +196,6 @@ def test_default__init__(self): test_manager = BigTableCacheManager(MagicMock(), {}, bigtable_mock) assert test_manager.bt_table == bigtable_mock assert test_manager._value_cache is None - assert test_manager._mut_freq == 0 - assert test_manager._last_flush == {} - assert test_manager._mutations == {} assert test_manager._finished_preloads == set() def test_iscomplete__init__(self): @@ -217,9 +214,6 @@ def test_iscomplete__init__(self): ) assert test_manager.bt_table == bigtable_mock assert isinstance(test_manager._value_cache, BigTableValueCache) - assert test_manager._mut_freq == 0 - assert test_manager._last_flush == {} - assert test_manager._mutations == {} assert test_manager._finished_preloads == set() assert test_manager.get_preload_prefix_len == get_preload_prefix_len @@ -246,7 +240,6 @@ def manager(self, bt_imports): options = { BigTableStore.VALUE_CACHE_ENABLE_KEY: True, - BigTableStore.BT_MUTATION_FREQ_KEY: 600, } manager = BigTableCacheManager( MagicMock(), options, bigtable_mock @@ -307,14 +300,6 @@ def test_fill_if_empty_with_pre_and_suffix(self, manager): assert manager._finished_preloads == {b"\x13PP", b"\x10XX"} assert manager.contains(key) - def test_fill_if_empty_with_mutation(self, manager): - key = b"\x13AAA" - manager.bt_table.add_test_data({key}) - manager._mutations = {key: (MagicMock(), "some_row_mutation")} - manager._fill_if_empty({key}) - assert manager.contains(key) - assert manager.get(key) == "some_row_mutation" - def test_get(self, manager): # Adding the key here is sufficient, because the cache gets filled key_in = b"\x13AAA" @@ -337,32 +322,26 @@ def test_get(self, manager): assert res is None def test_set(self, manager): - manager._set_mutation = MagicMock() key_1 = b"\x13AAA" key_2 = b"\x13ABB" manager.set(key_1, key_1) - manager._set_mutation.assert_called_once_with(key_1, key_1) assert manager.contains(key_1) assert manager.contains(key_2) is False manager.set(key_2, key_2) - manager._set_mutation.assert_called_with(key_2, key_2) assert manager.contains(key_1) assert manager.contains(key_2) assert manager.get(key_1) == key_1 assert manager.get(key_2) == key_2 def test_delete(self, manager): - manager._set_mutation = MagicMock() key_1 = b"\x13AAA" key_2 = b"\x13ABB" manager.set(key_1, key_1) assert manager.contains(key_1) manager.delete(key_1) - manager._set_mutation.assert_called_with(key_1, None) assert not manager.contains(key_1) manager.delete(key_2) - manager._set_mutation.assert_called_with(key_2, None) def test_partition_cache(self, manager): key = b"aaa" @@ -392,12 +371,6 @@ def test_contains(self, manager): assert manager.contains(key_not_in) is None manager._fill_if_empty.assert_not_called() - manager._mutations = {key_in: (key_in, key_in)} - assert manager.contains(key_in) is True - manager._fill_if_empty.assert_not_called() - assert manager.contains(key_not_in) is None - manager._fill_if_empty.assert_not_called() - def test_contains_any(self, manager): # Adding the key here is sufficient, because the cache gets filled key_in = b"\x13AAA" @@ -417,103 +390,6 @@ def test_contains_any(self, manager): assert manager.contains_any({key_not_in}) is None manager._fill_if_empty.assert_not_called() - manager._mutations = {key_in: (key_in, key_in)} - assert manager.contains_any({key_in, key_not_in}) is True - manager._fill_if_empty.assert_not_called() - assert manager.contains_any({key_not_in}) is None - manager._fill_if_empty.assert_not_called() - - def test_flush_if_timer_over(self, manager): - tp = TP("a_topic", partition=19) - tp2 = TP("a_topic", partition=0) - time.time = MagicMock(return_value=0) - manager.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(404)] - ) - - row_mock = MagicMock() - row_mock.row_key = b"\x13AAA" - manager._mutations = { - row_mock.row_key: (row_mock, "some_row_mutation") - } - - with patch( - "faust.stores.bigtable.time.time", MagicMock(return_value=0) - ): - assert manager.flush_if_timer_over(tp) is True - assert manager._last_flush == {tp.partition: 0} - assert manager.flush_if_timer_over(tp) is False - - with patch( - "faust.stores.bigtable.time.time", - MagicMock(return_value=manager._mut_freq), - ): - assert manager.flush_if_timer_over(tp2) is True - assert manager._last_flush == { - tp2.partition: manager._mut_freq, - tp.partition: 0, - } - - assert manager.flush_if_timer_over(tp) is True - assert len(manager._mutations) == 1 # Not dropped, due to ERR. 404 - assert manager._last_flush == { - tp2.partition: manager._mut_freq, - tp.partition: manager._mut_freq, - } - assert manager.flush_if_timer_over(tp) is False - - manager._last_flush = {} - manager.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(0)] - ) - - with patch( - "faust.stores.bigtable.time.time", - MagicMock(return_value=manager._mut_freq), - ): - assert manager.flush_if_timer_over(tp) is True - assert manager._last_flush == {tp.partition: manager._mut_freq} - assert len(manager._mutations) == 0 - - def test_flush_if_timer_over_on_max_count(self, manager): - tp = TP("a_topic", partition=19) - row_mock = MagicMock() - row_mock.row_key = b"\x13AAA" - manager._mutations = { - row_mock.row_key: (row_mock, "some_row_mutation") - } - manager._max_mutations = 1 - manager._last_flush = {tp.partition: 999999999999} - manager.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(0)] - ) - with patch( - "faust.stores.bigtable.time.time", MagicMock(return_value=0) - ): - assert manager.flush_if_timer_over(tp) is True - - def test_set_mutation(self, manager): - row_mock = MagicMock() - row_mock.delete = MagicMock() - row_mock.set_cell = MagicMock() - row_mock.row_key = b"\x13AAA" - - manager.bt_table.direct_row = MagicMock(return_value=row_mock) - - assert len(manager._mutations) == 0 - manager._set_mutation(row_mock.row_key, "new_value") - manager.bt_table.direct_row.assert_called_once_with(row_mock.row_key) - row_mock.set_cell.assert_called_once_with( - "FaustColumnFamily", "DATA", "new_value" - ) - assert manager._mutations[row_mock.row_key][1] == "new_value" - assert len(manager._mutations) == 1 - - manager._set_mutation(row_mock.row_key, None) - row_mock.delete.assert_called_once() - assert manager._mutations[row_mock.row_key][1] is None - assert len(manager._mutations) == 1 - def test_fill_if_empty_and_yield(self, manager): manager.bt_table.add_test_data({b"\x13AAA"}) @@ -539,11 +415,9 @@ def test_delete_partition(self, manager): manager.set_partition(row_mock.row_key[1:], partition) manager.delete_partition(3) assert len(manager._value_cache) == 1 - assert len(manager._mutations) == 1 assert len(manager._partition_cache) == 1 manager.delete_partition(partition) assert len(manager._value_cache) == 0 - assert len(manager._mutations) == 0 assert len(manager._partition_cache) == 0 # Delete something that does not exist yet should not do anything manager.delete_partition(999999) @@ -704,7 +578,6 @@ def test_bigtable_bigtable_get_on_empty(self, store): ) assert return_value is None - def test_bigtable_bigtable_get_cache_miss(self, store): store._cache.contains = MagicMock(return_value=None) store.bt_table.add_test_data([self.TEST_KEY1]) @@ -844,20 +717,17 @@ def test_bigtable_set(self, store): store._cache.set = MagicMock(return_value=None) store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) - store._bigtable_set( - self.TEST_KEY1, self.TEST_KEY1, persist_offset=True - ) + store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) - store.bt_table.direct_row.assert_called_once_with(self.TEST_KEY1) - store._cache.set.assert_called_once_with( - self.TEST_KEY1, self.TEST_KEY1 - ) - row_mock.set_cell.assert_called_once_with( + store.bt_table.direct_row.assert_called_with(self.TEST_KEY1) + store._cache.set.assert_called_with(self.TEST_KEY1, self.TEST_KEY1) + row_mock.set_cell.assert_called_with( store.column_family_id, store.column_name, self.TEST_KEY1, ) - row_mock.commit.assert_called_once() + row_mock.commit.assert_called() + assert row_mock.commit.call_count == 2 def test_maybe_get_partition_from_message(self, store): event_mock = MagicMock() @@ -1057,24 +927,6 @@ def test_iterkeys(self, store): self.TEST_KEY3, ] - def test_iterkeys_with_mutattions(self, store): - store._active_partitions = MagicMock(return_value=[1, 3]) - keys_in_store = [] - k1 = store._get_bigtable_key(self.TEST_KEY1, 1) - k2 = store._get_bigtable_key(self.TEST_KEY2, 2) - k3 = store._get_bigtable_key(self.TEST_KEY3, 3) - keys_in_store.append(k1) - keys_in_store.append(k2) - keys_in_store.append(k3) - store.bt_table.add_test_data(keys_in_store) - store._cache._mutations = { - k1: (k1, None), - k3: (k3, "HAS SOME VALUE"), - } - - all_res = sorted(store._iterkeys()) - assert all_res == [self.TEST_KEY3] - def test_itervalues(self, store): keys_in_store = [] keys_in_store.append(store._get_bigtable_key(self.TEST_KEY1, 1)) @@ -1143,30 +995,11 @@ def test_set_persisted_offset(self, store): tp = TP("a_topic", 19) store._bigtable_set = MagicMock() - - # Scenario 0: No recovery && no flush - recovery = False - store._cache.flush_if_timer_over = MagicMock(return_value=False) - expected_offset_key = store.get_offset_key(tp).encode() - store.set_persisted_offset(tp, 123, recovery=recovery) - store._bigtable_set.assert_not_called() - - # Scenario 1: Recovery - recovery = True store._cache.flush_if_timer_over = MagicMock(return_value=False) expected_offset_key = store.get_offset_key(tp).encode() - store.set_persisted_offset(tp, 123, recovery=recovery) - store._bigtable_set.assert_called_once_with( - expected_offset_key, str(123).encode(), persist_offset=True - ) - - # Scenario 2: Mutattion buffer flush - recovery = False - store._cache.flush_if_timer_over = MagicMock(return_value=True) - expected_offset_key = store.get_offset_key(tp).encode() - store.set_persisted_offset(tp, 123, recovery=recovery) + store.set_persisted_offset(tp, 123) store._bigtable_set.assert_called_with( - expected_offset_key, str(123).encode(), persist_offset=True + expected_offset_key, str(123).encode() ) def test_persist_changelog_batch(self, store): @@ -1345,9 +1178,7 @@ def __init__(self, message): tp_offsets = store._persist_changelog_batch.call_args_list[0].args[1] assert tp_offsets == {tp: 3, tp2: 4} - def test_modification_with_mutation_buffer(self, store): - # Mocks TEST_TP = TP("a", 0) TEST_OFFSET = 0 @@ -1367,20 +1198,16 @@ def real_del_scenario(key, offset): def assert_offset_persisted(offset): store._bigtable_set.assert_called_with( - OFFSET_KEY, str(offset).encode(), persist_offset=True + OFFSET_KEY, str(offset).encode() ) - row_mock = MagicMock() row_mock.row_key = b"\x00TEST_KEY1" - store._cache.bt_table.direct_row = MagicMock(return_value=row_mock) - store._cache.bt_table.mutate_rows = MagicMock( + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store.bt_table.mutate_rows = MagicMock( return_value=[MyTestResponse(0)] * 1 ) - store._cache._set_mutation = MagicMock(wraps=store._cache._set_mutation) - - time.time = MagicMock(return_value=0) store._bigtable_set = MagicMock(wraps=store._bigtable_set) partition = 0 faust.stores.bigtable.get_current_partition = MagicMock( @@ -1388,39 +1215,30 @@ def assert_offset_persisted(offset): ) store._cache.set_partition = MagicMock() # Flush every 10 seconds - store._cache._mut_freq = 10 - store._cache._last_flush = {TEST_TP.partition: 0} res = store._contains(self.TEST_KEY1) store._bigtable_set.assert_not_called() assert res is False TEST_OFFSET = real_set_scenario(self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET) res = store._contains(self.TEST_KEY1) + assert_offset_persisted(TEST_OFFSET - 1) assert res is True - assert store._cache._set_mutation.call_count == TEST_OFFSET - assert len(store._cache._mutations) == 1 - store._bigtable_set.assert_not_called() - TEST_OFFSET = real_set_scenario(self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET) + TEST_OFFSET = real_set_scenario( + self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET + ) res = store._contains(self.TEST_KEY1) assert res is True - assert store._cache._set_mutation.call_count == TEST_OFFSET - assert len(store._cache._mutations) == 1 - store._bigtable_set.assert_not_called() - - time.time = MagicMock(return_value=20) # Now we should flush + assert_offset_persisted(TEST_OFFSET - 1) TEST_OFFSET = real_del_scenario(self.TEST_KEY1, TEST_OFFSET) res = store._contains(self.TEST_KEY1) assert res is False - assert_offset_persisted(TEST_OFFSET - 1) - assert len(store._cache._mutations) == 0 - assert store._cache._set_mutation.call_count == TEST_OFFSET - TEST_OFFSET = real_set_scenario(self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET) + TEST_OFFSET = real_set_scenario( + self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET + ) res = store._contains(self.TEST_KEY1) - store._bigtable_set.assert_not_called() - assert store._cache._set_mutation.call_count == TEST_OFFSET - assert len(store._cache._mutations) == 1 + assert_offset_persisted(TEST_OFFSET - 1) assert res is True From 38cb345237ea491feedb20912613c145901c3a0f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Apr 2023 18:13:08 +0200 Subject: [PATCH 367/616] removed recovery flag --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1490737ee..cf17b0b51 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -673,7 +673,7 @@ def _persist_changelog_batch(self, row_mutations, tp_offsets): self.log.error("Row number {} failed to write".format(i)) for tp, offset in tp_offsets.items(): - self.set_persisted_offset(tp, offset, recovery=True) + self.set_persisted_offset(tp, offset) def apply_changelog_batch( self, From 57e38390518b7b47e58a3f51effabf7e291cad17 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Apr 2023 18:14:48 +0200 Subject: [PATCH 368/616] fixed tests with recovery flag --- tests/unit/stores/test_bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 9bab6a08e..fa3401b28 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1026,7 +1026,7 @@ def test_persist_changelog_batch(self, store): ) assert store.set_persisted_offset.call_count == len(offset_batch) - store.set_persisted_offset.assert_called_with(tp3, 333, recovery=True) + store.set_persisted_offset.assert_called_with(tp3, 333) store.log.error.assert_not_called() # Scenario 2: all failure From 4f2e937e5c31a447cb6fdbf823722e3459278f61 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 24 Apr 2023 18:41:31 +0200 Subject: [PATCH 369/616] only return true if in cache otherwise None --- faust/stores/bigtable.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index cf17b0b51..3c05d7cdd 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -186,8 +186,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: if self._value_cache is not None: self._fill_if_empty({bt_key}) found = bt_key in self._value_cache.keys() - if self.is_complete: - return found return True if found else None return None @@ -196,10 +194,7 @@ def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: if self._value_cache is not None: self._fill_if_empty(key_set) found = not self._value_cache.keys().isdisjoint(key_set) - if self.is_complete: - return found return True if found else None - return None def delete_partition(self, partition: int): @@ -363,17 +358,11 @@ def _bigtable_get_range( self, keys: Set[bytes] ) -> Tuple[Optional[bytes], Optional[bytes]]: # first search cache: - found_delete = False for key in keys: is_cached = self._cache.contains(key) if is_cached is True: value = self._cache.get(key) return key, value - elif is_cached is False: - found_delete = True - - if found_delete: - return None, None rows = BT.RowSet() for key in keys: From e850b42be3519dfa46372a599a1561137d87111b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 25 Apr 2023 15:16:10 +0200 Subject: [PATCH 370/616] reduced technical dept --- faust/stores/bigtable.py | 168 +++++++++----------- tests/unit/stores/test_bigtable.py | 236 +++++++---------------------- 2 files changed, 125 insertions(+), 279 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3c05d7cdd..287b28ef6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -99,67 +99,46 @@ class BigTableCacheManager: def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table - self.get_preload_prefix_len = options.get( - BigTableStore.CACHE_PRELOAD_PREFIX_LEN_FUN_KEY, - lambda _: 1, # Default partition only - ) self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) - self._finished_preloads: Set[bytes] = set() - - def _fill_if_empty(self, bt_keys): - # This is a hack, that enables iterating over all results - # without saving them in memory - deque(self._fill_if_empty_and_yield(bt_keys), maxlen=0) + self.filled_partitions = set() - def _preload_id_from_key(self, bt_key): - prefix = bt_key[: self.get_preload_prefix_len(bt_key)] - return prefix - - def _get_preload_rowset_and_filter(self, preload_ids): + def _get_preload_rowset(self, partitions: Set[int]): row_set = BT.RowSet() row_filter = CellsColumnLimitFilter(1) - for preload_id in preload_ids: + for partition in partitions: + preload_id = partition.to_bytes(1, "little") row_set.add_row_range_from_keys( start_key=preload_id, end_key=preload_id, end_inclusive=True ) return row_set, row_filter - def _fill_if_empty_and_yield(self, bt_keys: Set[bytes]): - preload_ids = set() - for k in bt_keys: - preload_ids.add(self._preload_id_from_key(bt_key=k)) - preload_ids_todo = preload_ids.difference(self._finished_preloads) - if len(preload_ids_todo) == 0: + def fill(self, partitions: Set[int]): + start = time.time() + partitions = partitions - self.filled_partitions + if len(partitions) == 0: return - start = time.time() if self._value_cache is not None: try: - row_set, row_filter = self._get_preload_rowset_and_filter( - preload_ids_todo - ) + row_set, row_filter = self._get_preload_rowset(partitions) for row in self.bt_table.read_rows( row_set=row_set, filter_=row_filter ): value = BigTableStore.bigtable_exrtact_row_data(row) self._value_cache[row.row_key] = value - yield row.row_key except Exception as e: - self.log.info( - f"BigTableStore fill failed for {preload_ids_todo=}, {bt_keys=}" - ) + self.log.info(f"BigTableStore fill failed for {partitions=}") raise e + self.filled_partitions.update(partitions) end = time.time() self.log.info( "BigTableStore: Finished fill for table" - f"{self.bt_table.name}:{preload_ids_todo} in {end-start}s" + f"{self.bt_table.name}:{partitions} in {end-start}s" ) - self._finished_preloads.update(preload_ids_todo) def get(self, bt_key: bytes) -> Optional[bytes]: if self._value_cache is not None: - self._fill_if_empty({bt_key}) if bt_key in self._value_cache.keys(): return self._value_cache[bt_key] return None @@ -172,6 +151,11 @@ def delete(self, bt_key: bytes) -> None: if self._value_cache is not None: del self._value_cache[bt_key] + def items(self) -> Iterable[Tuple[bytes, bytes]]: + if self._value_cache is not None: + return self._value_cache.data.items() + return [] + def get_partition(self, user_key: bytes) -> int: return self._partition_cache[user_key] @@ -184,16 +168,18 @@ def contains(self, bt_key: bytes) -> Optional[bool]: about the current key can be made. """ if self._value_cache is not None: - self._fill_if_empty({bt_key}) found = bt_key in self._value_cache.keys() + if self._value_cache.is_complete: + return found return True if found else None return None def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: if self._value_cache is not None: - self._fill_if_empty(key_set) found = not self._value_cache.keys().isdisjoint(key_set) + if self._value_cache.is_complete: + return found return True if found else None return None @@ -235,11 +221,9 @@ class BigTableStore(base.SerializedStore): BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_PROJECT_KEY = "bt_project_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" - BT_CUSTOM_KEY_TRANSLATOR_KEY = "bt_custom_key_translator_key" VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" VALUE_CACHE_SIZE_KEY = "value_cache_size_key" VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" - CACHE_PRELOAD_PREFIX_LEN_FUN_KEY = "cache_preload_prefix_len_fun_key" def __init__( self, @@ -254,6 +238,8 @@ def __init__( try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) + self.batcher = self.bt_table.mutations_batcher(flush_count=300) + self.commit_next_offset = False except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -265,10 +251,6 @@ def default_translator(user_key): return user_key def _set_options(self, options) -> None: - self._transform_key_to_bt, self._transform_key_from_bt = options.get( - BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY, - (self.default_translator, self.default_translator), - ) self._all_options = options self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name @@ -333,6 +315,9 @@ def _bigtable_contains(self, key: bytes) -> bool: if cache_contains is not None: return cache_contains + self.batcher.flush() + self.commit_next_offset = True + row = self.bt_table.read_row(key, filter_=self.row_filter) if row is not None: return True @@ -343,6 +328,9 @@ def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: if cache_contains is not None: return cache_contains + self.batcher.flush() + self.commit_next_offset = True + rows = BT.RowSet() for key in keys: rows.add_row_key(key) @@ -386,14 +374,13 @@ def _bigtable_set(self, bt_key: bytes, value: Optional[bytes]): self.column_name, value, ) - row.commit() + self.batcher.mutate(row) def _bigtable_del(self, bt_key: bytes): self._cache.delete(bt_key) row = self.bt_table.direct_row(bt_key) row.delete() - row.commit() - + self.batcher.mutate(row) def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -411,15 +398,13 @@ def _get_partition_prefix(self, partition: int) -> bytes: return b"".join([partition_bytes]) def _get_faust_key(self, key: bytes) -> bytes: - key_with_no_partition = key[1:] - new_key = self._transform_key_from_bt(key_with_no_partition) - return new_key + faust_key = key[1:] + return faust_key def _get_bigtable_key(self, key: bytes, partition: int) -> bytes: - key = self._transform_key_to_bt(key) prefix = self._get_partition_prefix(partition) - new_key = prefix + key - return new_key + bt_key = prefix + key + return bt_key def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: @@ -502,19 +487,24 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - row_set = BT.RowSet() - for partition in self._active_partitions(): - prefix_start = self._get_partition_prefix(partition) - prefix_end = self._get_partition_prefix(partition + 1) - row_set.add_row_range_from_keys(prefix_start, prefix_end) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - yield ( - self._get_faust_key(row.row_key), - self.bigtable_exrtact_row_data(row), - ) + if self._cache._value_cache is not None: + self._cache.fill(set(self._active_partitions())) + for key, value in self._cache.items(): + yield self._get_faust_key(key), value + else: + row_set = BT.RowSet() + for partition in self._active_partitions(): + prefix_start = self._get_partition_prefix(partition) + prefix_end = self._get_partition_prefix(partition + 1) + row_set.add_row_range_from_keys(prefix_start, prefix_end) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + yield ( + self._get_faust_key(row.row_key), + self.bigtable_exrtact_row_data(row), + ) except Exception as ex: self.log.error( f"FaustBigtableException Error " @@ -525,33 +515,8 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: def _iterkeys(self) -> Iterator[bytes]: try: - start = time.time() - partitions = self._active_partitions() - - self.log.info(f"Start iterkeys for {self.table_name}") - row_set = BT.RowSet() - for partition in partitions: - prefix_start = self._get_partition_prefix(partition) - prefix_end = self._get_partition_prefix(partition + 1) - row_set.add_row_range_from_keys(prefix_start, prefix_end) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - if self._cache._value_cache is not None: - data = self.bigtable_exrtact_row_data(row) - self._cache._value_cache[row.row_key] = data - preload_id = self._cache._preload_id_from_key(row.row_key) - self._cache._finished_preloads.add(preload_id) - partition = row.row_key[0] - key = self._get_faust_key(row.row_key) - self._cache.set_partition(key, partition) - yield key - - end = time.time() - self.log.info( - f"Finished iterkeys for {self.table_name} in {end - start}s" - ) + for row in self._iteritems(): + yield row[0] except Exception as ex: self.log.error( f"FaustBigtableException Error in _iterkeys " @@ -634,7 +599,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: return None def set_persisted_offset( - self, tp: TP, offset: int + self, tp: TP, offset: int, recovery: bool = False ) -> None: """Set the last persisted offset for this table. @@ -644,10 +609,14 @@ def set_persisted_offset( we were not an active replica. """ try: - offset_key = self.get_offset_key(tp).encode() - self._bigtable_set( - offset_key, str(offset).encode() - ) + if recovery or self.commit_next_offset or len(self.batcher.rows) == 0: + offset_key = self.get_offset_key(tp).encode() + self._bigtable_set( + offset_key, str(offset).encode() + ) + self.batcher.flush() + self.commit_next_offset = False + except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" @@ -741,6 +710,11 @@ def revoke_partitions(self, tps: Set[TP]) -> None: self._cache.delete_partition(tp.partition) gc.collect() + def assign_partitions(self, tps: Set[TP]) -> None: + self.batcher.flush() + self.commit_next_offset = True + self._cache.fill({tp.partition for tp in tps}) + async def on_rebalance( self, assigned: Set[TP], @@ -758,7 +732,5 @@ async def on_rebalance( generation_id: the metadata generation identifier for the re-balance """ async with self._db_lock: - self.logger.info( - f"BigTableStore: Rebalancing {revoked=}, {newly_assigned=}" - ) self.revoke_partitions(revoked) + self.assign_partitions(newly_assigned) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index fa3401b28..5d394327b 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -196,7 +196,6 @@ def test_default__init__(self): test_manager = BigTableCacheManager(MagicMock(), {}, bigtable_mock) assert test_manager.bt_table == bigtable_mock assert test_manager._value_cache is None - assert test_manager._finished_preloads == set() def test_iscomplete__init__(self): bigtable_mock = BigTableMock() @@ -206,7 +205,6 @@ def test_iscomplete__init__(self): time.time = MagicMock(return_value=0) options = { BigTableStore.VALUE_CACHE_ENABLE_KEY: True, - BigTableStore.CACHE_PRELOAD_PREFIX_LEN_FUN_KEY: get_preload_prefix_len, } test_manager = BigTableCacheManager( @@ -214,8 +212,6 @@ def test_iscomplete__init__(self): ) assert test_manager.bt_table == bigtable_mock assert isinstance(test_manager._value_cache, BigTableValueCache) - assert test_manager._finished_preloads == set() - assert test_manager.get_preload_prefix_len == get_preload_prefix_len @pytest.fixture() def bt_imports(self): @@ -247,57 +243,21 @@ def manager(self, bt_imports): manager._partition_cache = {} return manager - def test_fill_if_empty(self, manager): + def test_fill(self, manager): key = b"\x13AAA" manager.bt_table.add_test_data({key}) # Scenario 1: Everything empty - manager._fill_if_empty({key}) + manager.fill({19}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13"} + assert manager.filled_partitions == {19} - manager._fill_if_empty({key}) + manager.fill({19}) assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13"} + assert manager.filled_partitions == {19} - manager._fill_if_empty({b"\x10XXX"}) + manager.fill({16}) assert manager.bt_table.read_rows.call_count == 2 - assert manager._finished_preloads == {b"\x13", b"\x10"} - assert manager.contains(key) - - def test_fill_if_empty(self, manager): - key = b"\x13AAA" - manager.bt_table.add_test_data({key}) - # Scenario 1: Everything empty - manager._fill_if_empty({key}) - assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13"} - - manager._fill_if_empty({key}) - assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13"} - - manager._fill_if_empty({b"\x10XXX"}) - assert manager.bt_table.read_rows.call_count == 2 - assert manager._finished_preloads == {b"\x13", b"\x10"} - assert manager.contains(key) - - def test_fill_if_empty_with_pre_and_suffix(self, manager): - manager.get_preload_prefix_len = lambda _: 3 - - key = b"\x13PPAAAAAAAA" - manager.bt_table.add_test_data({key}) - # Scenario 1: Everything empty - manager._fill_if_empty({key}) - assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13PP"} - - manager._fill_if_empty({key}) - assert manager.bt_table.read_rows.call_count == 1 - assert manager._finished_preloads == {b"\x13PP"} - - manager._fill_if_empty({b"\x10XXX"}) - assert manager.bt_table.read_rows.call_count == 2 - assert manager._finished_preloads == {b"\x13PP", b"\x10XX"} + assert manager.filled_partitions == {19, 16} assert manager.contains(key) def test_get(self, manager): @@ -305,20 +265,19 @@ def test_get(self, manager): key_in = b"\x13AAA" key_not_in = b"\x13BBB" manager.bt_table.add_test_data({key_in}) - manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) res = manager.get(key_in) - manager._fill_if_empty.assert_called_once_with({key_in}) + assert res is None + + manager.fill({19}) + res = manager.get(key_in) assert res == key_in res = manager.get(key_not_in) - manager._fill_if_empty.assert_called_with({key_not_in}) assert res is None - manager._fill_if_empty.reset_mock() manager._value_cache = None res = manager.get(key_in) - manager._fill_if_empty.assert_not_called() assert res is None def test_set(self, manager): @@ -357,51 +316,36 @@ def test_contains(self, manager): key_in = b"\x13AAA" key_not_in = b"\x13BBB" manager.bt_table.add_test_data({key_in}) - manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) + manager.fill({19}) assert manager.contains(key_in) is True - manager._fill_if_empty.assert_called_with({key_in}) assert manager.contains(key_not_in) is False - manager._fill_if_empty.assert_called_with({key_not_in}) + + manager._value_cache.is_complete = False + assert manager.contains(key_in) is True + assert manager.contains(key_not_in) is None manager._value_cache = None - manager._fill_if_empty.reset_mock() assert manager.contains(key_in) is None - manager._fill_if_empty.assert_not_called() assert manager.contains(key_not_in) is None - manager._fill_if_empty.assert_not_called() def test_contains_any(self, manager): # Adding the key here is sufficient, because the cache gets filled key_in = b"\x13AAA" key_not_in = b"\x13BBB" manager.bt_table.add_test_data({key_in}) - manager._fill_if_empty = MagicMock(wraps=manager._fill_if_empty) + manager.fill({19}) assert manager.contains_any({key_in, key_not_in}) is True - manager._fill_if_empty.assert_called_with({key_in, key_not_in}) assert manager.contains_any({key_not_in}) is False - manager._fill_if_empty.assert_called_with({key_not_in}) - manager._value_cache = None - manager._fill_if_empty.reset_mock() - assert manager.contains_any({key_in, key_not_in}) is None - manager._fill_if_empty.assert_not_called() + manager._value_cache.is_complete = False + assert manager.contains_any({key_in, key_not_in}) is True assert manager.contains_any({key_not_in}) is None - manager._fill_if_empty.assert_not_called() - - def test_fill_if_empty_and_yield(self, manager): - manager.bt_table.add_test_data({b"\x13AAA"}) - - res = list(manager._fill_if_empty_and_yield({b"\x13AAA"})) - manager.bt_table.read_rows.assert_called() - assert res == [b"\x13AAA"] manager._value_cache = None - manager.bt_table.read_rows.reset_mock() - res = list(manager._fill_if_empty_and_yield({b"\x13AAA"})) - assert res == [] - manager.bt_table.read_rows.assert_not_called() + assert manager.contains_any({key_in, key_not_in}) is None + assert manager.contains_any({key_not_in}) is None def test_delete_partition(self, manager): partition = 19 @@ -476,18 +420,12 @@ def from_bt_key(key): BigTableStore.BT_TABLE_NAME_GENERATOR_KEY: name_lambda, BigTableStore.BT_OFFSET_KEY_PREFIX: "offset_test", BigTableStore.BT_COLUMN_NAME_KEY: "name_test", - BigTableStore.BT_CUSTOM_KEY_TRANSLATOR_KEY: ( - to_bt_key, - from_bt_key, - ), } BigTableStore._set_options(self_mock, options) assert self_mock.column_name == "name_test" assert self_mock.offset_key_prefix == "offset_test" assert self_mock.row_filter == "a_filter" assert self_mock.table_name_generator == name_lambda - assert self_mock._transform_key_to_bt == to_bt_key - assert self_mock._transform_key_from_bt == from_bt_key @pytest.mark.asyncio async def test_bigtable_setup(self, bt_imports): @@ -712,9 +650,11 @@ def test_bigtable_delete(self, store): def test_bigtable_set(self, store): row_mock = MagicMock() row_mock.set_cell = MagicMock() - row_mock.commit = MagicMock() + store.bt_table.direct_row = MagicMock(return_value=row_mock) store._cache.set = MagicMock(return_value=None) + store.batcher = MagicMock() + store.batcher.mutate = MagicMock() store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) @@ -726,8 +666,7 @@ def test_bigtable_set(self, store): store.column_name, self.TEST_KEY1, ) - row_mock.commit.assert_called() - assert row_mock.commit.call_count == 2 + assert store.batcher.mutate.call_count == 2 def test_maybe_get_partition_from_message(self, store): event_mock = MagicMock() @@ -903,14 +842,26 @@ def test_iteritems(self, store): keys_in_store.append(store._get_bigtable_key(self.TEST_KEY1, 1)) keys_in_store.append(store._get_bigtable_key(self.TEST_KEY2, 2)) keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) + # Fill cache + store._cache._value_cache.data = { + keys_in_store[0]: "CACHE_VALUE1", + keys_in_store[2]: "CACHE_VALUE3", + } store.bt_table.add_test_data(keys_in_store) store._active_partitions = MagicMock(return_value=[1, 3]) + store._cache.fill = MagicMock() + store.bt_table.read_rows = MagicMock(wrap=store.bt_table.read_rows) all_res = sorted(store._iteritems()) + store._cache.fill.assert_called_once_with({1, 3}) assert all_res == [ - (self.TEST_KEY1, keys_in_store[0]), - (self.TEST_KEY3, keys_in_store[2]), + (self.TEST_KEY1, "CACHE_VALUE1"), + (self.TEST_KEY3, "CACHE_VALUE3"), ] + store._cache.fill.reset_mock() + store._cache._value_cache = None + all_res = sorted(store._iteritems()) + assert store.bt_table.read_rows.call_count == 1 def test_iterkeys(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) @@ -921,6 +872,12 @@ def test_iterkeys(self, store): keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) store.bt_table.add_test_data(keys_in_store) + # Fill cache + store._cache._value_cache.data = { + keys_in_store[0]: keys_in_store[0], + keys_in_store[2]: keys_in_store[2], + } + all_res = sorted(store._iterkeys()) assert all_res == [ self.TEST_KEY1, @@ -934,6 +891,11 @@ def test_itervalues(self, store): keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) store.bt_table.add_test_data(keys_in_store) + # Fill cache + store._cache._value_cache.data = { + keys_in_store[0]: keys_in_store[0], + keys_in_store[2]: keys_in_store[2], + } store._active_partitions = MagicMock(return_value=[1, 3]) all_res = sorted(store._itervalues()) assert all_res == [keys_in_store[0], keys_in_store[2]] @@ -1090,95 +1052,7 @@ def test_revoke_partitions(self, store): store._cache.delete_partition.assert_any_call(1) store._cache.delete_partition.assert_any_call(2) - def test__fill_with_custom_key_prefix(self, store): - store._cache.get_preload_prefix_len = get_preload_prefix_len - store._transform_key_from_bt = from_bt_key - store._transform_key_to_bt = to_bt_key - - partition = 0 - for k in [self.TEST_KEY4, self.TEST_KEY5]: - res = store._get_bigtable_key(k, partition) - expected_preload_id = b"\x00" + k[-24:] - preload_id = store._cache._preload_id_from_key(res) - assert preload_id == expected_preload_id - assert res == expected_preload_id + k - assert k == store._transform_key_from_bt( - store._transform_key_to_bt(k) - ) - - res = store._get_bigtable_key(self.TEST_KEY6, partition) - expected_preload_id = b"\x00" + self.TEST_KEY6 - preload_id = store._cache._preload_id_from_key(res) - assert preload_id == expected_preload_id - assert res == expected_preload_id - assert self.TEST_KEY6 == store._transform_key_from_bt( - store._transform_key_to_bt(self.TEST_KEY6) - ) - - def test_contains_with_unknown_partition_and_key_transform(self, store): - store._cache.get_preload_prefix_len = get_preload_prefix_len - store._transform_key_from_bt = from_bt_key - store._transform_key_to_bt = to_bt_key - - store.app.conf.store_check_exists = True - store._maybe_get_partition_from_message = MagicMock(return_value=None) - store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) - store._cache.contains_any = MagicMock(wraps=store._cache.contains_any) - store._bigtable_contains_any = MagicMock( - wraps=store._bigtable_contains_any - ) - keys_to_search = set() - keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 1)) - keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 3)) - keys_to_search.add(store._get_bigtable_key(self.TEST_KEY4, 19)) - - res = store._contains(self.TEST_KEY4) - res_contains = store._bigtable_contains_any.assert_called_once_with( - keys_to_search - ) - assert res_contains is None - assert res is False - - def test_apply_changelog_batch_with_key_transform(self, store): - store._transform_key_from_bt = from_bt_key - store._transform_key_to_bt = to_bt_key - - row_mock = MagicMock() - row_mock.delete = MagicMock() - row_mock.set_cell = MagicMock() - store.bt_table.direct_row = MagicMock(return_value=row_mock) - store.bt_table.mutate_rows = MagicMock() - store._persist_changelog_batch = MagicMock() - - class TestMessage: - def __init__(self, value, key, tp, offset): - self.value = value - self.key = key - self.tp = tp - self.offset = offset - - class TestEvent: - def __init__(self, message): - self.message = message - - tp = TP("a", 19) - tp2 = TP("b", 19) - messages = [ - TestEvent(TestMessage("a", self.TEST_KEY4, tp, 0)), - TestEvent(TestMessage(None, self.TEST_KEY4, tp, 1)), # Delete - TestEvent(TestMessage("a", self.TEST_KEY4, tp, 3)), # Out of order - TestEvent(TestMessage("b", self.TEST_KEY4, tp2, 4)), - TestEvent(TestMessage("a", self.TEST_KEY4, tp, 2)), - ] - store.apply_changelog_batch(messages, lambda x: x, lambda x: x) - assert store.bt_table.direct_row.call_count == 5 - row_mock.delete.assert_called_once() - assert row_mock.set_cell.call_count == 4 - store._persist_changelog_batch.assert_called_once() - tp_offsets = store._persist_changelog_batch.call_args_list[0].args[1] - assert tp_offsets == {tp: 3, tp2: 4} - - def test_modification_with_mutation_buffer(self, store): + def test_mmutation_flush(self, store): # Mocks TEST_TP = TP("a", 0) TEST_OFFSET = 0 @@ -1205,9 +1079,7 @@ def assert_offset_persisted(offset): row_mock.row_key = b"\x00TEST_KEY1" store.bt_table.direct_row = MagicMock(return_value=row_mock) - store.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(0)] * 1 - ) + store.batcher.mutate = MagicMock(wraps=store.batcher.mutate) store._bigtable_set = MagicMock(wraps=store._bigtable_set) partition = 0 faust.stores.bigtable.get_current_partition = MagicMock( @@ -1219,7 +1091,9 @@ def assert_offset_persisted(offset): store._bigtable_set.assert_not_called() assert res is False - TEST_OFFSET = real_set_scenario(self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET) + TEST_OFFSET = real_set_scenario( + self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET + ) res = store._contains(self.TEST_KEY1) assert_offset_persisted(TEST_OFFSET - 1) assert res is True From b55a926f6b0de2258708a755ad90f94e97dd3ab1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 25 Apr 2023 15:50:43 +0200 Subject: [PATCH 371/616] instantly batch recovery --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 287b28ef6..e6601b7a2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -631,7 +631,7 @@ def _persist_changelog_batch(self, row_mutations, tp_offsets): self.log.error("Row number {} failed to write".format(i)) for tp, offset in tp_offsets.items(): - self.set_persisted_offset(tp, offset) + self.set_persisted_offset(tp, offset, recovery=True) def apply_changelog_batch( self, From 0cfd7c6f47aafdd887e493db811016c2148a188f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 25 Apr 2023 15:52:11 +0200 Subject: [PATCH 372/616] fixed tests --- tests/unit/stores/test_bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 5d394327b..81db09db6 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -988,7 +988,7 @@ def test_persist_changelog_batch(self, store): ) assert store.set_persisted_offset.call_count == len(offset_batch) - store.set_persisted_offset.assert_called_with(tp3, 333) + store.set_persisted_offset.assert_called_with(tp3, 333, recovery=True) store.log.error.assert_not_called() # Scenario 2: all failure From fbd0ce41ce24452c8189a2c6c4ffc98d1bd26476 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 25 Apr 2023 17:10:46 +0200 Subject: [PATCH 373/616] fixed fill for end inclusive rowsets --- faust/stores/bigtable.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e6601b7a2..d77ba8eb1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -4,8 +4,17 @@ import logging import time import traceback -from collections import deque -from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + Optional, + Set, + Tuple, + Union, +) try: # pragma: no cover from google.cloud.bigtable import column_family @@ -109,7 +118,7 @@ def _get_preload_rowset(self, partitions: Set[int]): for partition in partitions: preload_id = partition.to_bytes(1, "little") row_set.add_row_range_from_keys( - start_key=preload_id, end_key=preload_id, end_inclusive=True + start_key=preload_id, end_key=preload_id + b"\xff" ) return row_set, row_filter @@ -609,11 +618,13 @@ def set_persisted_offset( we were not an active replica. """ try: - if recovery or self.commit_next_offset or len(self.batcher.rows) == 0: + if ( + recovery + or self.commit_next_offset + or len(self.batcher.rows) == 0 + ): offset_key = self.get_offset_key(tp).encode() - self._bigtable_set( - offset_key, str(offset).encode() - ) + self._bigtable_set(offset_key, str(offset).encode()) self.batcher.flush() self.commit_next_offset = False From 03c668c446ed0b959bdbaa901af65dbfca91e77a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 25 Apr 2023 19:12:43 +0200 Subject: [PATCH 374/616] fixed contains --- faust/stores/bigtable.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d77ba8eb1..b80b5b97e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -247,7 +247,7 @@ def __init__( try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) - self.batcher = self.bt_table.mutations_batcher(flush_count=300) + self.batcher = self.bt_table.mutations_batcher(flush_count=1000) self.commit_next_offset = False except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") @@ -311,9 +311,10 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: # in the cache is either None or exists return self._cache.get(key) else: + self.batcher.flush() + self.commit_next_offset = True res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: - self.log.error(f"{key=} not found in {self.table_name}") value = None else: value = self.bigtable_exrtact_row_data(res) @@ -361,6 +362,9 @@ def _bigtable_get_range( value = self._cache.get(key) return key, value + self.batcher.flush() + self.commit_next_offset = True + rows = BT.RowSet() for key in keys: rows.add_row_key(key) @@ -553,21 +557,8 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - partition = self._maybe_get_partition_from_message() - if partition is not None: - key_with_partition = self._get_bigtable_key( - key, partition=partition - ) - return self._bigtable_contains(key_with_partition) - else: - keys_to_search = set() - for partition in self._partitions_for_key(key): - key_with_partition = self._get_bigtable_key( - key, partition=partition - ) - keys_to_search.add(key_with_partition) + return self._get(key) is not None - return self._bigtable_contains_any(keys_to_search) except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " @@ -626,6 +617,8 @@ def set_persisted_offset( offset_key = self.get_offset_key(tp).encode() self._bigtable_set(offset_key, str(offset).encode()) self.batcher.flush() + if not self.commit_next_offset: + self.log.info(f"Committed offset {offset} for {self.table.name}") self.commit_next_offset = False except Exception as e: From f2fe3a969358adad90145e3ab7545a2bda13d2a7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 26 Apr 2023 13:47:38 +0200 Subject: [PATCH 375/616] fixed delete in table --- faust/stores/bigtable.py | 149 +++++++------------ tests/unit/stores/test_bigtable.py | 223 +++++++---------------------- 2 files changed, 101 insertions(+), 271 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b80b5b97e..540ed99ac 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -82,10 +82,9 @@ def __getitem__(self, key): return res def __setitem__(self, key, value) -> None: - if value is not None: - self._maybe_ttl_clear() - if not self.ttl_over: - self.data[key] = value + self._maybe_ttl_clear() + if not self.ttl_over: + self.data[key] = value def __delitem__(self, key): self.data.pop(key, None) @@ -148,18 +147,16 @@ def fill(self, partitions: Set[int]): def get(self, bt_key: bytes) -> Optional[bytes]: if self._value_cache is not None: - if bt_key in self._value_cache.keys(): - return self._value_cache[bt_key] - return None + return self._value_cache[bt_key] + raise NotImplementedError( + f"get is not implemented for {self.__class__} with no value cache" + ) + def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: self._value_cache[bt_key] = value - def delete(self, bt_key: bytes) -> None: - if self._value_cache is not None: - del self._value_cache[bt_key] - def items(self) -> Iterable[Tuple[bytes, bytes]]: if self._value_cache is not None: return self._value_cache.data.items() @@ -177,20 +174,14 @@ def contains(self, bt_key: bytes) -> Optional[bool]: about the current key can be made. """ if self._value_cache is not None: - found = bt_key in self._value_cache.keys() - if self._value_cache.is_complete: - return found - return True if found else None - - return None + return bt_key in self._value_cache.keys() + return False def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: if self._value_cache is not None: found = not self._value_cache.keys().isdisjoint(key_set) - if self._value_cache.is_complete: - return found - return True if found else None - return None + return found + return False def delete_partition(self, partition: int): if self._value_cache is not None: @@ -248,7 +239,6 @@ def __init__( self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) self.batcher = self.bt_table.mutations_batcher(flush_count=1000) - self.commit_next_offset = False except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -305,70 +295,35 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value - def _bigtable_get(self, key: bytes) -> Optional[bytes]: - if self._cache.contains(key) is not None: - # This means that we are sure that the value - # in the cache is either None or exists - return self._cache.get(key) + def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: + if self._cache.contains(bt_key): + return self._cache.get(bt_key) else: + # We want to be sure that we don't have any pending writes self.batcher.flush() - self.commit_next_offset = True - res = self.bt_table.read_row(key, filter_=self.row_filter) + res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: value = None else: value = self.bigtable_exrtact_row_data(res) + # Has no effect if value_cace is None + self._cache.set(bt_key, value) return value - def _bigtable_contains(self, key: bytes) -> bool: - cache_contains = self._cache.contains(key) - if cache_contains is not None: - return cache_contains - - self.batcher.flush() - self.commit_next_offset = True - - row = self.bt_table.read_row(key, filter_=self.row_filter) - if row is not None: - return True - return False - - def _bigtable_contains_any(self, keys: Set[bytes]) -> bool: - cache_contains = self._cache.contains_any(keys) - if cache_contains is not None: - return cache_contains - - self.batcher.flush() - self.commit_next_offset = True - - rows = BT.RowSet() - for key in keys: - rows.add_row_key(key) - - for _row in self.bt_table.read_rows( - row_set=rows, filter_=BT.CellsColumnLimitFilter(1) - ): - # First hit will return - return True - return False def _bigtable_get_range( - self, keys: Set[bytes] + self, bt_keys: Set[bytes] ) -> Tuple[Optional[bytes], Optional[bytes]]: # first search cache: - for key in keys: - is_cached = self._cache.contains(key) - if is_cached is True: - value = self._cache.get(key) - return key, value - - self.batcher.flush() - self.commit_next_offset = True - rows = BT.RowSet() - for key in keys: - rows.add_row_key(key) + for bt_key in bt_keys: + if self._cache.contains(bt_key): + value = self._cache.get(bt_key) + return bt_key, value + else: + rows.add_row_key(bt_key) + self.batcher.flush() for row in self.bt_table.read_rows( row_set=rows, filter_=BT.CellsColumnLimitFilter(1) ): @@ -390,7 +345,7 @@ def _bigtable_set(self, bt_key: bytes, value: Optional[bytes]): self.batcher.mutate(row) def _bigtable_del(self, bt_key: bytes): - self._cache.delete(bt_key) + self._cache.set(bt_key, None) row = self.bt_table.direct_row(bt_key) row.delete() self.batcher.mutate(row) @@ -500,24 +455,29 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: + partitions_to_fill = set(self._active_partitions()) if self._cache._value_cache is not None: - self._cache.fill(set(self._active_partitions())) + partitions_to_fill -= self._cache.filled_partitions for key, value in self._cache.items(): yield self._get_faust_key(key), value - else: - row_set = BT.RowSet() - for partition in self._active_partitions(): - prefix_start = self._get_partition_prefix(partition) - prefix_end = self._get_partition_prefix(partition + 1) - row_set.add_row_range_from_keys(prefix_start, prefix_end) - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - yield ( - self._get_faust_key(row.row_key), - self.bigtable_exrtact_row_data(row), - ) + if len(partitions_to_fill) == 0: + return + + row_set = BT.RowSet() + for partition in partitions_to_fill: + prefix_start = self._get_partition_prefix(partition) + prefix_end = self._get_partition_prefix(partition + 1) + row_set.add_row_range_from_keys(prefix_start, prefix_end) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + faust_key = self._get_faust_key(row.row_key) + value = self.bigtable_exrtact_row_data(row) + if self._cache._value_cache is not None: + self._cache.set(row.row_key, value) + yield faust_key, value except Exception as ex: self.log.error( f"FaustBigtableException Error " @@ -609,18 +569,8 @@ def set_persisted_offset( we were not an active replica. """ try: - if ( - recovery - or self.commit_next_offset - or len(self.batcher.rows) == 0 - ): - offset_key = self.get_offset_key(tp).encode() - self._bigtable_set(offset_key, str(offset).encode()) - self.batcher.flush() - if not self.commit_next_offset: - self.log.info(f"Committed offset {offset} for {self.table.name}") - self.commit_next_offset = False - + offset_key = self.get_offset_key(tp).encode() + self._bigtable_set(offset_key, str(offset).encode()) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" @@ -716,7 +666,6 @@ def revoke_partitions(self, tps: Set[TP]) -> None: def assign_partitions(self, tps: Set[TP]) -> None: self.batcher.flush() - self.commit_next_offset = True self._cache.fill({tp.partition for tp in tps}) async def on_rebalance( diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 81db09db6..12f12cf33 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -261,24 +261,23 @@ def test_fill(self, manager): assert manager.contains(key) def test_get(self, manager): - # Adding the key here is sufficient, because the cache gets filled key_in = b"\x13AAA" key_not_in = b"\x13BBB" - manager.bt_table.add_test_data({key_in}) - res = manager.get(key_in) - assert res is None + manager.bt_table.add_test_data({key_in}) + with pytest.raises(KeyError): + manager.get(key_in) manager.fill({19}) res = manager.get(key_in) assert res == key_in - res = manager.get(key_not_in) - assert res is None + with pytest.raises(KeyError): + manager.get(key_not_in) manager._value_cache = None - res = manager.get(key_in) - assert res is None + with pytest.raises(NotImplementedError): + manager.get(key_in) def test_set(self, manager): key_1 = b"\x13AAA" @@ -293,15 +292,6 @@ def test_set(self, manager): assert manager.get(key_1) == key_1 assert manager.get(key_2) == key_2 - def test_delete(self, manager): - key_1 = b"\x13AAA" - key_2 = b"\x13ABB" - manager.set(key_1, key_1) - assert manager.contains(key_1) - manager.delete(key_1) - assert not manager.contains(key_1) - manager.delete(key_2) - def test_partition_cache(self, manager): key = b"aaa" with pytest.raises(KeyError): @@ -323,11 +313,11 @@ def test_contains(self, manager): manager._value_cache.is_complete = False assert manager.contains(key_in) is True - assert manager.contains(key_not_in) is None + assert manager.contains(key_not_in) is False manager._value_cache = None - assert manager.contains(key_in) is None - assert manager.contains(key_not_in) is None + assert manager.contains(key_in) is False + assert manager.contains(key_not_in) is False def test_contains_any(self, manager): # Adding the key here is sufficient, because the cache gets filled @@ -341,11 +331,11 @@ def test_contains_any(self, manager): manager._value_cache.is_complete = False assert manager.contains_any({key_in, key_not_in}) is True - assert manager.contains_any({key_not_in}) is None + assert manager.contains_any({key_not_in}) is False manager._value_cache = None - assert manager.contains_any({key_in, key_not_in}) is None - assert manager.contains_any({key_not_in}) is None + assert manager.contains_any({key_in, key_not_in}) is False + assert manager.contains_any({key_not_in}) is False def test_delete_partition(self, manager): partition = 19 @@ -499,13 +489,20 @@ def store(self, bt_imports): "bigtable://", MagicMock(), MagicMock(), options=options ) store.bt_table = BigTableMock() + store.batcher.mutate = MagicMock(wraps=store.batcher.mutate) + store.batcher.flush = MagicMock(wraps=store.batcher.flush) return store def test_bigtable_bigtable_get_on_empty(self, store): store._cache.contains = MagicMock(return_value=False) + store._cache.set = MagicMock() return_value = store._bigtable_get(self.TEST_KEY1) store._cache.contains.assert_called_with(self.TEST_KEY1) - store.bt_table.read_row.assert_not_called() + store.batcher.flush.assert_called_once() + store.bt_table.read_row.assert_called_once_with( + self.TEST_KEY1, filter_="a_filter" + ) + store._cache.set.assert_called_with(self.TEST_KEY1, None) assert return_value is None store._cache.contains = MagicMock(return_value=None) @@ -535,11 +532,13 @@ def test_bigtable_bigtable_get_cache_hit(self, store): assert return_value == b"cache_res" store._cache.contains = MagicMock(return_value=False) - store._cache.get = MagicMock(return_value=b"cache_res") + store._cache.get = MagicMock() return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_called_once_with(self.TEST_KEY1) - store.bt_table.read_row.assert_not_called() - assert return_value == b"cache_res" + store._cache.get.assert_not_called() + store.bt_table.read_row.assert_called_once_with( + self.TEST_KEY1, filter_="a_filter" + ) + assert return_value == self.TEST_KEY1 def test_bigtable_get_range_cache_miss(self, store): store._cache.contains = MagicMock(return_value=None) @@ -579,73 +578,15 @@ def test_bigtable_get_range_cache_hit(self, store): store.bt_table.read_rows.assert_not_called assert result_value == (self.TEST_KEY1, "cache_res") - def test_bigtable_contains(self, store): - store._cache.contains = MagicMock(return_value=None) - - store.bt_table.add_test_data([self.TEST_KEY1]) - return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_called_with( - self.TEST_KEY1, filter_="a_filter" - ) - assert return_value is True - - return_value = store._bigtable_contains(self.TEST_KEY2) - store.bt_table.read_row.assert_called_with( - self.TEST_KEY2, filter_="a_filter" - ) - - store.bt_table.read_row.reset_mock() - - store._cache.contains = MagicMock(return_value=True) - return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_not_called() - assert return_value is True - - store._cache.contains = MagicMock(return_value=False) - return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_not_called() - assert return_value is False - - store._cache.contains = MagicMock(return_value=None) - return_value = store._bigtable_contains(self.TEST_KEY1) - store.bt_table.read_row.assert_called_with( - self.TEST_KEY1, filter_="a_filter" - ) - assert return_value is True - - def test_bigtable_contains_any(self, store): - store.bt_table.add_test_data([self.TEST_KEY1]) - store._cache.contains_any = MagicMock(return_value=None) - - test_keys_in = {self.TEST_KEY1, self.TEST_KEY3} - test_keys_not_in = { - self.TEST_KEY2, - } - - return_value = store._bigtable_contains_any(test_keys_not_in) - store.bt_table.read_rows.assert_called() - store.bt_table.read_rows.reset_mock() - assert return_value is False - - return_value = store._bigtable_contains_any(test_keys_in) - store.bt_table.read_rows.assert_called() - store.bt_table.read_rows.reset_mock() - assert return_value is True - - store._cache.contains_any = MagicMock(return_value=True) - return_value = store._bigtable_contains_any(test_keys_not_in) - store.bt_table.read_rows.assert_not_called() - assert return_value == store._cache.contains_any() - def test_bigtable_delete(self, store): row_mock = MagicMock() row_mock.commit = MagicMock() row_mock.delete = MagicMock() store.bt_table.direct_row = MagicMock(return_value=row_mock) - store._cache.delete = MagicMock(return_value=None) + store._cache.set = MagicMock() store._bigtable_del(self.TEST_KEY1) - store._cache.delete.assert_called_once_with(self.TEST_KEY1) + store._cache.set.assert_called_once_with(self.TEST_KEY1, None) def test_bigtable_set(self, store): row_mock = MagicMock() @@ -839,110 +780,51 @@ def test_active_partitions(self, store): def test_iteritems(self, store): keys_in_store = [] - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY1, 1)) - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY2, 2)) - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) + k1 = store._get_bigtable_key(self.TEST_KEY1, 1) + k3 = store._get_bigtable_key(self.TEST_KEY3, 3) # Fill cache - store._cache._value_cache.data = { - keys_in_store[0]: "CACHE_VALUE1", - keys_in_store[2]: "CACHE_VALUE3", + cache_data = { + k1: "CACHE_VALUE1", + k3: "CACHE_VALUE3", } + store._cache._value_cache.data = cache_data store.bt_table.add_test_data(keys_in_store) store._active_partitions = MagicMock(return_value=[1, 3]) store._cache.fill = MagicMock() - store.bt_table.read_rows = MagicMock(wrap=store.bt_table.read_rows) + store.bt_table.read_rows = MagicMock() + all_res = sorted(store._iteritems()) - store._cache.fill.assert_called_once_with({1, 3}) + store._cache.fill.assert_not_called() assert all_res == [ (self.TEST_KEY1, "CACHE_VALUE1"), (self.TEST_KEY3, "CACHE_VALUE3"), ] store._cache.fill.reset_mock() - store._cache._value_cache = None all_res = sorted(store._iteritems()) - assert store.bt_table.read_rows.call_count == 1 - - def test_iterkeys(self, store): - store._active_partitions = MagicMock(return_value=[1, 3]) - store._cache._partition_cache.limit = 3 - keys_in_store = [] - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY1, 1)) - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY2, 2)) - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) - store.bt_table.add_test_data(keys_in_store) + assert store.bt_table.read_rows.call_count == 2 - # Fill cache - store._cache._value_cache.data = { - keys_in_store[0]: keys_in_store[0], - keys_in_store[2]: keys_in_store[2], - } + store._cache.filled_partitions = {1, 3} + store._active_partitions = MagicMock(return_value={1, 3}) + all_res = sorted(store._iteritems()) + # No new calls, we just return what's in the cache + assert store.bt_table.read_rows.call_count == 2 + def test_iterkeys(self, store): + values = [("K1", "V1"), ("K2", "V2")] + store._iteritems = MagicMock(return_value=values) all_res = sorted(store._iterkeys()) - assert all_res == [ - self.TEST_KEY1, - self.TEST_KEY3, - ] + assert all_res == ["K1", "K2"] def test_itervalues(self, store): - keys_in_store = [] - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY1, 1)) - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY2, 2)) - keys_in_store.append(store._get_bigtable_key(self.TEST_KEY3, 3)) - - store.bt_table.add_test_data(keys_in_store) - # Fill cache - store._cache._value_cache.data = { - keys_in_store[0]: keys_in_store[0], - keys_in_store[2]: keys_in_store[2], - } - store._active_partitions = MagicMock(return_value=[1, 3]) + values = [("K1", "V1"), ("K2", "V2")] + store._iteritems = MagicMock(return_value=values) all_res = sorted(store._itervalues()) - assert all_res == [keys_in_store[0], keys_in_store[2]] + assert all_res == ["V1", "V2"] def test_size(self, store): assert 0 == store._size() - def test_contains_without_store_check_exists(self, store): - store._bigtable_contains = MagicMock() - store._bigtable_contains_any = MagicMock() - store.app.conf.store_check_exists = False - - res = store._contains(self.TEST_KEY1) - - assert res is True - store._bigtable_contains_any.assert_not_called() - store._bigtable_contains.assert_not_called() - - def test_contains_with_known_partition(self, store): - store.app.conf.store_check_exists = True - store._bigtable_contains_any = MagicMock() - store._maybe_get_partition_from_message = MagicMock(return_value=19) - - # Scenario1: Found - store._bigtable_contains = MagicMock(return_value="TRUE_OR_FALSE") - key_w_partition = store._get_bigtable_key(self.TEST_KEY1, 19) - res = store._contains(self.TEST_KEY1) - store._bigtable_contains.assert_called_once_with(key_w_partition) - assert res == "TRUE_OR_FALSE" - - def test_contains_with_unknown_partition(self, store): - store.app.conf.store_check_exists = True - store._bigtable_contains_any = MagicMock() - store._maybe_get_partition_from_message = MagicMock(return_value=None) - store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) - - store._bigtable_contains_any = MagicMock(return_value="TRUE_OR_FALSE") - keys_to_search = set() - keys_to_search.add(store._get_bigtable_key(self.TEST_KEY1, 1)) - keys_to_search.add(store._get_bigtable_key(self.TEST_KEY1, 3)) - keys_to_search.add(store._get_bigtable_key(self.TEST_KEY1, 19)) - - res = store._contains(self.TEST_KEY1) - - store._bigtable_contains_any.assert_called_once_with(keys_to_search) - assert res == "TRUE_OR_FALSE" - def test_get_offset_key(self, store): tp = TP("AAAA", 19) assert store.get_offset_key(tp)[-2:] == "19" @@ -1052,7 +934,7 @@ def test_revoke_partitions(self, store): store._cache.delete_partition.assert_any_call(1) store._cache.delete_partition.assert_any_call(2) - def test_mmutation_flush(self, store): + def test_mutation_flush(self, store): # Mocks TEST_TP = TP("a", 0) TEST_OFFSET = 0 @@ -1086,7 +968,6 @@ def assert_offset_persisted(offset): return_value=partition ) store._cache.set_partition = MagicMock() - # Flush every 10 seconds res = store._contains(self.TEST_KEY1) store._bigtable_set.assert_not_called() assert res is False From 0fd98dd226ab30ae2dcf2cb00eeac02d638636ad Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 26 Apr 2023 15:10:04 +0200 Subject: [PATCH 376/616] fixed iteritems --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 540ed99ac..2dbe9c075 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -459,7 +459,8 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: if self._cache._value_cache is not None: partitions_to_fill -= self._cache.filled_partitions for key, value in self._cache.items(): - yield self._get_faust_key(key), value + if value is not None: + yield self._get_faust_key(key), value if len(partitions_to_fill) == 0: return From 8eff6631af44f12596993d034cbe644a5ac1daba Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 26 Apr 2023 15:32:48 +0200 Subject: [PATCH 377/616] added logging for animals of one organisation --- faust/stores/bigtable.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2dbe9c075..cbc428af8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -152,7 +152,6 @@ def get(self, bt_key: bytes) -> Optional[bytes]: f"get is not implemented for {self.__class__} with no value cache" ) - def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: self._value_cache[bt_key] = value @@ -310,6 +309,16 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: self._cache.set(bt_key, value) return value + def __getitem__(self, key: bytes) -> bytes: + if b"6274106275ced82d58f3c3be" in key: + self.log.info("GET ", key) + self.log.info("ENCODED ", self._encode_key(key)) + value = self._get(self._encode_key(key)) + if b"6274106275ced82d58f3c3be" in key: + self.log.info("GOT ", key, value) + if value is None: + raise KeyError(key) + return self._decode_value(value) def _bigtable_get_range( self, bt_keys: Set[bytes] From 0fd055221a39ac923fcf8ee7f9d83e0a42130aed Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 26 Apr 2023 16:09:47 +0200 Subject: [PATCH 378/616] added logs --- faust/stores/bigtable.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index cbc428af8..e68a93bf1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -300,6 +300,8 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: else: # We want to be sure that we don't have any pending writes self.batcher.flush() + if b"6274106275ced82d58f3c3be" in bt_key: + self.log.info(f"Reading {bt_key=} from BigTable") res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: value = None @@ -309,17 +311,6 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: self._cache.set(bt_key, value) return value - def __getitem__(self, key: bytes) -> bytes: - if b"6274106275ced82d58f3c3be" in key: - self.log.info("GET ", key) - self.log.info("ENCODED ", self._encode_key(key)) - value = self._get(self._encode_key(key)) - if b"6274106275ced82d58f3c3be" in key: - self.log.info("GOT ", key, value) - if value is None: - raise KeyError(key) - return self._decode_value(value) - def _bigtable_get_range( self, bt_keys: Set[bytes] ) -> Tuple[Optional[bytes], Optional[bytes]]: @@ -391,6 +382,9 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: + + if b"6274106275ced82d58f3c3be" in key: + self.log.info("GET ", key) partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_bigtable_key( @@ -400,6 +394,8 @@ def _get(self, key: bytes) -> Optional[bytes]: value = self._bigtable_get(key_with_partition) if value is not None: self._cache.set_partition(key, partition) + if b"6274106275ced82d58f3c3be" in key: + self.log.info("GOT ", key, value is not None) return value else: keys = set() @@ -413,7 +409,12 @@ def _get(self, key: bytes) -> Optional[bytes]: if value is not None: partition = key_with_partition[0] self._cache.set_partition(key, partition) + if b"6274106275ced82d58f3c3be" in key: + self.log.info("GOT ", key, value is not None) return value + + if b"6274106275ced82d58f3c3be" in key: + self.log.info("GOT ", key, False) return None except Exception as ex: self.log.error( From 10c5f7c36e5cd1987caa6a5330741dabf8a82227 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 26 Apr 2023 16:45:21 +0200 Subject: [PATCH 379/616] added logging and fixed tests --- faust/stores/bigtable.py | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e68a93bf1..ec3c3a490 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -300,8 +300,7 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: else: # We want to be sure that we don't have any pending writes self.batcher.flush() - if b"6274106275ced82d58f3c3be" in bt_key: - self.log.info(f"Reading {bt_key=} from BigTable") + self.log.info(f"Reading {bt_key=} from BigTable") res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: value = None @@ -383,8 +382,7 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: - if b"6274106275ced82d58f3c3be" in key: - self.log.info("GET ", key) + self.log.info("GET ", key) partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_bigtable_key( @@ -394,8 +392,7 @@ def _get(self, key: bytes) -> Optional[bytes]: value = self._bigtable_get(key_with_partition) if value is not None: self._cache.set_partition(key, partition) - if b"6274106275ced82d58f3c3be" in key: - self.log.info("GOT ", key, value is not None) + self.log.info("GOT ", key, value is not None) return value else: keys = set() @@ -409,12 +406,10 @@ def _get(self, key: bytes) -> Optional[bytes]: if value is not None: partition = key_with_partition[0] self._cache.set_partition(key, partition) - if b"6274106275ced82d58f3c3be" in key: - self.log.info("GOT ", key, value is not None) + self.log.info("GOT ", key, value is not None) return value - if b"6274106275ced82d58f3c3be" in key: - self.log.info("GOT ", key, False) + self.log.info("GOT ", key, False) return None except Exception as ex: self.log.error( @@ -614,30 +609,21 @@ def apply_changelog_batch( of a changelog event. """ tp_offsets: Dict[TP, int] = {} - row_mutations = [] for event in batch: tp, offset = event.message.tp, event.message.offset tp_offsets[tp] = ( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - offset_key = self._get_bigtable_key( - msg.key, partition=tp.partition - ) - row = self.bt_table.direct_row(offset_key) + bt_key = self._get_bigtable_key(msg.key, partition=tp.partition) if msg.value is None: - row.delete() + self._bigtable_del(bt_key) else: - row.set_cell( - self.column_family_id, - self.column_name, - msg.value, - ) - row_mutations.append(row) - self._persist_changelog_batch( - row_mutations, - tp_offsets, - ) + self._bigtable_set(bt_key, msg.value) + + for tp, offset in tp_offsets.items(): + self.set_persisted_offset(tp, offset) + async def backup_partition( self, From bb502df01958fe6fa89ee96fb256ad12a7924c4d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 06:58:54 +0200 Subject: [PATCH 380/616] retutn correct offset --- faust/stores/bigtable.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ec3c3a490..e7306a855 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -381,7 +381,6 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: - self.log.info("GET ", key) partition = self._maybe_get_partition_from_message() if partition is not None: @@ -558,10 +557,9 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp) - row_res = self.bt_table.read_row(offset_key, filter_=self.row_filter) - if row_res is not None: - offset = int(self.bigtable_exrtact_row_data(row_res)) - return offset + offset = self._bigtable_get(offset_key) + if offset is not None: + return int(offset) return None def set_persisted_offset( From 8e96922a8d713ee93daec946562d2cd972ab4aaa Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 06:59:27 +0200 Subject: [PATCH 381/616] removed additional plus --- tests/unit/stores/test_bigtable.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 12f12cf33..2a583a8c6 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -20,7 +20,7 @@ def to_bt_key(key): if len_prefix + 1 + len_first_id + 1 >= len_total: # This happens if there is e.g. no organisation id return key - len_second_id = key[len_prefix + 1 + +len_first_id + 1] // 2 + len_second_id = key[len_prefix + 1 + len_first_id + 1] // 2 key_prefix = key[len_total - len_second_id :] return key_prefix + key @@ -893,7 +893,7 @@ def test_apply_changelog_batch(self, store): row_mock.set_cell = MagicMock() store.bt_table.direct_row = MagicMock(return_value=row_mock) store.bt_table.mutate_rows = MagicMock() - store._persist_changelog_batch = MagicMock() + store.set_persisted_offset = MagicMock() class TestMessage: def __init__(self, value, key, tp, offset): @@ -919,9 +919,7 @@ def __init__(self, message): assert store.bt_table.direct_row.call_count == 5 row_mock.delete.assert_called_once() assert row_mock.set_cell.call_count == 4 - store._persist_changelog_batch.assert_called_once() - tp_offsets = store._persist_changelog_batch.call_args_list[0].args[1] - assert tp_offsets == {tp: 3, tp2: 4} + assert store.set_persisted_offset.call_count == 2 def test_revoke_partitions(self, store): store._cache.delete_partition = MagicMock() From 496901b835b9dba39d5430879b89aba3cf5a19b8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 07:11:21 +0200 Subject: [PATCH 382/616] removed logs --- faust/stores/bigtable.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e7306a855..97fb77f95 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -300,7 +300,6 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: else: # We want to be sure that we don't have any pending writes self.batcher.flush() - self.log.info(f"Reading {bt_key=} from BigTable") res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: value = None @@ -381,7 +380,6 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: - self.log.info("GET ", key) partition = self._maybe_get_partition_from_message() if partition is not None: key_with_partition = self._get_bigtable_key( @@ -391,7 +389,6 @@ def _get(self, key: bytes) -> Optional[bytes]: value = self._bigtable_get(key_with_partition) if value is not None: self._cache.set_partition(key, partition) - self.log.info("GOT ", key, value is not None) return value else: keys = set() @@ -405,10 +402,8 @@ def _get(self, key: bytes) -> Optional[bytes]: if value is not None: partition = key_with_partition[0] self._cache.set_partition(key, partition) - self.log.info("GOT ", key, value is not None) return value - self.log.info("GOT ", key, False) return None except Exception as ex: self.log.error( From 32261ee19970f72a90cfe1a92845099012e8c0b6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 07:53:11 +0200 Subject: [PATCH 383/616] put flush in own function --- faust/stores/bigtable.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 97fb77f95..e0e2222be 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -299,7 +299,6 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: return self._cache.get(bt_key) else: # We want to be sure that we don't have any pending writes - self.batcher.flush() res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: value = None @@ -321,7 +320,7 @@ def _bigtable_get_range( else: rows.add_row_key(bt_key) - self.batcher.flush() + self._flush_mutations() for row in self.bt_table.read_rows( row_set=rows, filter_=BT.CellsColumnLimitFilter(1) ): @@ -551,7 +550,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: """Return the last persisted offset. See :meth:`set_persisted_offset`. """ - offset_key = self.get_offset_key(tp) + offset_key = self.get_offset_key(tp).encode() offset = self._bigtable_get(offset_key) if offset is not None: return int(offset) @@ -654,8 +653,12 @@ def revoke_partitions(self, tps: Set[TP]) -> None: self._cache.delete_partition(tp.partition) gc.collect() + def _flush_mutations(self): + if self.batcher.total_size > 0: + self.batcher.flush() + def assign_partitions(self, tps: Set[TP]) -> None: - self.batcher.flush() + self._flush_mutations() self._cache.fill({tp.partition for tp in tps}) async def on_rebalance( From 6d9211c6fe179903bae58902f99d185729a8aa40 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 07:54:22 +0200 Subject: [PATCH 384/616] flush on revoked partition --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e0e2222be..732fbdc5e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -650,6 +650,7 @@ def revoke_partitions(self, tps: Set[TP]) -> None: be serving data for. """ for tp in tps: + self._flush_mutations() self._cache.delete_partition(tp.partition) gc.collect() From 67137b358d61fbab43cef561d0ed8b1600f406ca Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 08:54:53 +0200 Subject: [PATCH 385/616] removed offsets from cache --- faust/stores/bigtable.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 732fbdc5e..24e899c63 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -551,14 +551,13 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() - offset = self._bigtable_get(offset_key) - if offset is not None: - return int(offset) - return None + row = self.bt_table.read_row(offset_key, filter_=self.row_filter) + if row is None: + return None + else: + return int(self.bigtable_exrtact_row_data(row)) - def set_persisted_offset( - self, tp: TP, offset: int, recovery: bool = False - ) -> None: + def set_persisted_offset(self, tp: TP, offset: int) -> None: """Set the last persisted offset for this table. This will remember the last offset that we wrote to BigTableStore, @@ -568,7 +567,14 @@ def set_persisted_offset( """ try: offset_key = self.get_offset_key(tp).encode() - self._bigtable_set(offset_key, str(offset).encode()) + row = self.bt_table.direct_row(offset_key) + row.set_cell( + self.column_family_id, + self.column_name, + str(offset).encode(), + ) + self.batcher.mutate(row) + except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" @@ -583,7 +589,7 @@ def _persist_changelog_batch(self, row_mutations, tp_offsets): self.log.error("Row number {} failed to write".format(i)) for tp, offset in tp_offsets.items(): - self.set_persisted_offset(tp, offset, recovery=True) + self.set_persisted_offset(tp, offset) def apply_changelog_batch( self, @@ -616,7 +622,6 @@ def apply_changelog_batch( for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) - async def backup_partition( self, tp: Union[TP, int], From 9dd45b11c9d76fa7ac5719534cdc0fbe48cc9209 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 08:56:44 +0200 Subject: [PATCH 386/616] removed some functions --- faust/stores/bigtable.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 24e899c63..c085f2494 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -582,15 +582,6 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: f"TRACEBACK: {traceback.format_exc()}" ) - def _persist_changelog_batch(self, row_mutations, tp_offsets): - response = self.bt_table.mutate_rows(row_mutations) - for i, status in enumerate(response): - if status.code != 0: - self.log.error("Row number {} failed to write".format(i)) - - for tp, offset in tp_offsets.items(): - self.set_persisted_offset(tp, offset) - def apply_changelog_batch( self, batch: Iterable[EventT], From 9e98cb186e0184f1d5a724d5bb2614338f82c7d4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 09:30:06 +0200 Subject: [PATCH 387/616] added value to check exception on set --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c085f2494..4ca366c1a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -421,8 +421,8 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " - f"table {self.table_name} exception {ex} key {key} " - f"Traceback: {traceback.format_exc()}" + f"table {self.table_name} exception {ex} key {key=} " + f"{value=} Traceback: {traceback.format_exc()}" ) raise ex From f43b84ca51ce452da1d8cd008e14e221fee7fc41 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 09:52:03 +0200 Subject: [PATCH 388/616] turn off batcher --- faust/stores/bigtable.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4ca366c1a..e50881357 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -320,7 +320,7 @@ def _bigtable_get_range( else: rows.add_row_key(bt_key) - self._flush_mutations() + # self._flush_mutations() for row in self.bt_table.read_rows( row_set=rows, filter_=BT.CellsColumnLimitFilter(1) ): @@ -339,13 +339,13 @@ def _bigtable_set(self, bt_key: bytes, value: Optional[bytes]): self.column_name, value, ) - self.batcher.mutate(row) + row.commit() def _bigtable_del(self, bt_key: bytes): self._cache.set(bt_key, None) row = self.bt_table.direct_row(bt_key) row.delete() - self.batcher.mutate(row) + row.commit() def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -573,7 +573,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: self.column_name, str(offset).encode(), ) - self.batcher.mutate(row) + row.commit() except Exception as e: self.log.error( @@ -646,7 +646,7 @@ def revoke_partitions(self, tps: Set[TP]) -> None: be serving data for. """ for tp in tps: - self._flush_mutations() + # self._flush_mutations() self._cache.delete_partition(tp.partition) gc.collect() @@ -655,7 +655,7 @@ def _flush_mutations(self): self.batcher.flush() def assign_partitions(self, tps: Set[TP]) -> None: - self._flush_mutations() + # self._flush_mutations() self._cache.fill({tp.partition for tp in tps}) async def on_rebalance( From f5af3065659ce2918e885dd4d3012b28dd2adcbe Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 27 Apr 2023 11:50:33 +0200 Subject: [PATCH 389/616] added batcher again with different mutation approach --- faust/stores/bigtable.py | 67 +++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e50881357..44deb57aa 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -103,6 +103,7 @@ def keys(self): class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[BigTableValueCache] + _mutation_cache: Dict[bytes, Optional[bytes]] def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) @@ -121,6 +122,19 @@ def _get_preload_rowset(self, partitions: Set[int]): ) return row_set, row_filter + def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: + self._mutation_cache[bt_key] = value + + def has_mutation(self, bt_key) -> bool: + return bt_key in self._mutation_cache + + def get_mutation(self, bt_key: bytes) -> Optional[bytes]: + return self._mutation_cache.get(bt_key) + + def clear_mutations(self) -> None: + self._mutation_cache.clear() + + def fill(self, partitions: Set[int]): start = time.time() partitions = partitions - self.filled_partitions @@ -146,6 +160,8 @@ def fill(self, partitions: Set[int]): ) def get(self, bt_key: bytes) -> Optional[bytes]: + if self.has_mutation(bt_key): + return self.get_mutation(bt_key) if self._value_cache is not None: return self._value_cache[bt_key] raise NotImplementedError( @@ -172,11 +188,15 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ + if self.has_mutation(bt_key): + return True if self._value_cache is not None: return bt_key in self._value_cache.keys() return False def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: + if not self._mutation_cache.keys().isdisjoint(key_set): + return True if self._value_cache is not None: found = not self._value_cache.keys().isdisjoint(key_set) return found @@ -237,7 +257,7 @@ def __init__( try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) - self.batcher = self.bt_table.mutations_batcher(flush_count=1000) + self.batcher = self.bt_table.mutations_batcher(flush_count=5000) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -298,7 +318,7 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: if self._cache.contains(bt_key): return self._cache.get(bt_key) else: - # We want to be sure that we don't have any pending writes + res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: value = None @@ -314,9 +334,13 @@ def _bigtable_get_range( # first search cache: rows = BT.RowSet() for bt_key in bt_keys: + if self._cache.has_mutation(bt_key): + return bt_key, self._cache.get_mutation(bt_key) + if self._cache.contains(bt_key): value = self._cache.get(bt_key) return bt_key, value + else: rows.add_row_key(bt_key) @@ -331,21 +355,23 @@ def _bigtable_get_range( # Not found return None, None - def _bigtable_set(self, bt_key: bytes, value: Optional[bytes]): + def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): + self._cache.submit_mutation(bt_key, value) self._cache.set(bt_key, value) - row = self.bt_table.direct_row(bt_key) - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) - row.commit() - def _bigtable_del(self, bt_key: bytes): - self._cache.set(bt_key, None) row = self.bt_table.direct_row(bt_key) - row.delete() - row.commit() + if value is None: + row.delete() + else: + row.set_cell( + self.column_family_id, + self.column_name, + value, + ) + self.batcher.mutate(row) + if self.batcher.total_mutation_count == 0: + self.log.info("Flushed mutations") + self._cache.clear_mutations() def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -416,7 +442,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: key_with_partition = self._get_bigtable_key( key, partition=partition ) - self._bigtable_set(key_with_partition, value) + self._bigtable_mutate(key_with_partition, value) self._cache.set_partition(key, partition) except Exception as ex: self.log.error( @@ -432,7 +458,7 @@ def _del(self, key: bytes) -> None: key_with_partition = self._get_bigtable_key( key, partition=partition ) - self._bigtable_del(key_with_partition) + self._bigtable_mutate(key_with_partition, None) self._cache._partition_cache.pop(key, None) except Exception as ex: self.log.error( @@ -573,7 +599,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: self.column_name, str(offset).encode(), ) - row.commit() + self.batcher.mutate(row) except Exception as e: self.log.error( @@ -605,10 +631,7 @@ def apply_changelog_batch( ) msg = event.message bt_key = self._get_bigtable_key(msg.key, partition=tp.partition) - if msg.value is None: - self._bigtable_del(bt_key) - else: - self._bigtable_set(bt_key, msg.value) + self._bigtable_mutate(bt_key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) @@ -646,7 +669,7 @@ def revoke_partitions(self, tps: Set[TP]) -> None: be serving data for. """ for tp in tps: - # self._flush_mutations() + self.batcher.flush() self._cache.delete_partition(tp.partition) gc.collect() From e747b54a932aff0cd07f897e8d4e328f2c2329df Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Apr 2023 10:37:52 +0200 Subject: [PATCH 390/616] revival of the mutation buffer --- faust/stores/bigtable.py | 105 +++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 44deb57aa..d525cd18b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -53,6 +53,9 @@ def get_current_partition(): return event.message.partition +COLUMN_FAMILY_ID = "FaustColumnFamily" +COLUMN_NAME = "DATA" + class BigTableValueCache: """ This is a dictionary which is only filled once, after that, every @@ -70,7 +73,6 @@ def __init__(self, ttl=-1, size: Optional[int] = None) -> None: self.ttl = ttl self.ttl_over = False self.init_ts = int(time.time()) - self.is_complete = (ttl == -1) and (size is None) def __len__(self): return len(self.data) @@ -103,7 +105,8 @@ def keys(self): class BigTableCacheManager: _partition_cache: LRUCache[bytes, int] _value_cache: Optional[BigTableValueCache] - _mutation_cache: Dict[bytes, Optional[bytes]] + _mutation_values: Dict[bytes, Optional[bytes]] + _mutation_rows: Dict[bytes, BT.DirectRow] def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) @@ -111,6 +114,8 @@ def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self.filled_partitions = set() + self._last_flush = time.time() + self.total_mutation_count = 0 def _get_preload_rowset(self, partitions: Set[int]): row_set = BT.RowSet() @@ -123,17 +128,38 @@ def _get_preload_rowset(self, partitions: Set[int]): return row_set, row_filter def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: - self._mutation_cache[bt_key] = value - - def has_mutation(self, bt_key) -> bool: - return bt_key in self._mutation_cache - - def get_mutation(self, bt_key: bytes) -> Optional[bytes]: - return self._mutation_cache.get(bt_key) - - def clear_mutations(self) -> None: - self._mutation_cache.clear() - + row, _ = self.get_mutation(bt_key) + row = row if row else self.bt_table.direct_row(bt_key) + if value is None: + row.delete() + else: + row.set_cell( + COLUMN_FAMILY_ID, + COLUMN_NAME, + value, + ) + self._mutation_values[bt_key] = value + self._mutation_rows[bt_key] = row + self.total_mutation_count += 1 + self.flush_mutations_if_timer_over_or_full() + + def get_mutation( + self, bt_key: bytes + ) -> Tuple[Optional[BT.DirectRow], Optional[bytes]]: + row = self._mutation_rows.get(bt_key, None) + value = self._mutation_values.get(bt_key, None) + return row, value + + def flush_mutations_if_timer_over_or_full(self) -> None: + five_min = 5 * 60 + if ( + self._last_flush + five_min < time.time() + or self.total_mutation_count > 10_000 + ): + self.bt_table.mutate_rows(self._mutation_rows.values()) + self._mutation_values.clear() + self._mutation_rows.clear() + self.total_mutation_count = 0 def fill(self, partitions: Set[int]): start = time.time() @@ -160,8 +186,8 @@ def fill(self, partitions: Set[int]): ) def get(self, bt_key: bytes) -> Optional[bytes]: - if self.has_mutation(bt_key): - return self.get_mutation(bt_key) + if self._mutation_rows.get(bt_key) is not None: + return self._mutation_values[bt_key] if self._value_cache is not None: return self._value_cache[bt_key] raise NotImplementedError( @@ -173,6 +199,7 @@ def set(self, bt_key: bytes, value: Optional[bytes]) -> None: self._value_cache[bt_key] = value def items(self) -> Iterable[Tuple[bytes, bytes]]: + # Will very likely remove this method in the future if self._value_cache is not None: return self._value_cache.data.items() return [] @@ -188,18 +215,18 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - if self.has_mutation(bt_key): + if self._mutation_rows.get(bt_key, None) is not None: return True if self._value_cache is not None: return bt_key in self._value_cache.keys() return False def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: - if not self._mutation_cache.keys().isdisjoint(key_set): + if not self._mutation_rows.keys().isdisjoint(key_set): return True if self._value_cache is not None: - found = not self._value_cache.keys().isdisjoint(key_set) - return found + if not self._value_cache.keys().isdisjoint(key_set): + return True return False def delete_partition(self, partition: int): @@ -220,10 +247,8 @@ def _init_value_cache( ) size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) self._value_cache = BigTableValueCache(ttl=ttl, size=size) - self.is_complete = self._value_cache.is_complete else: self._value_cache = None - self.is_complete = False class BigTableStore(base.SerializedStore): @@ -273,9 +298,6 @@ def _set_options(self, options) -> None: self.table_name_generator = options.get( BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name ) - self.column_name = options.get( - BigTableStore.BT_COLUMN_NAME_KEY, "DATA" - ) self.row_filter = BT.CellsColumnLimitFilter(1) self.offset_key_prefix = options.get( BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" @@ -291,7 +313,6 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): options.get(BigTableStore.BT_INSTANCE_KEY) ) self.bt_table: BT.Table = self.instance.table(self.bt_table_name) - self.column_family_id = "FaustColumnFamily" if not self.bt_table.exists(): logging.getLogger(__name__).info( f"BigTableStore: Making new bigtablestore with {self.bt_table_name=} " @@ -299,7 +320,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): ) self.bt_table.create( column_families={ - self.column_family_id: BT.column_family.MaxVersionsGCRule( + COLUMN_FAMILY_ID: BT.column_family.MaxVersionsGCRule( 1 ) } @@ -334,9 +355,6 @@ def _bigtable_get_range( # first search cache: rows = BT.RowSet() for bt_key in bt_keys: - if self._cache.has_mutation(bt_key): - return bt_key, self._cache.get_mutation(bt_key) - if self._cache.contains(bt_key): value = self._cache.get(bt_key) return bt_key, value @@ -344,7 +362,6 @@ def _bigtable_get_range( else: rows.add_row_key(bt_key) - # self._flush_mutations() for row in self.bt_table.read_rows( row_set=rows, filter_=BT.CellsColumnLimitFilter(1) ): @@ -356,22 +373,10 @@ def _bigtable_get_range( return None, None def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): - self._cache.submit_mutation(bt_key, value) + # Update the value cache if any exists self._cache.set(bt_key, value) - - row = self.bt_table.direct_row(bt_key) - if value is None: - row.delete() - else: - row.set_cell( - self.column_family_id, - self.column_name, - value, - ) - self.batcher.mutate(row) - if self.batcher.total_mutation_count == 0: - self.log.info("Flushed mutations") - self._cache.clear_mutations() + # Update the bigtable. Mutations are batched + self._cache.submit_mutation(bt_key, value) def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -595,8 +600,8 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: offset_key = self.get_offset_key(tp).encode() row = self.bt_table.direct_row(offset_key) row.set_cell( - self.column_family_id, - self.column_name, + COLUMN_FAMILY_ID, + COLUMN_NAME, str(offset).encode(), ) self.batcher.mutate(row) @@ -669,16 +674,10 @@ def revoke_partitions(self, tps: Set[TP]) -> None: be serving data for. """ for tp in tps: - self.batcher.flush() self._cache.delete_partition(tp.partition) gc.collect() - def _flush_mutations(self): - if self.batcher.total_size > 0: - self.batcher.flush() - def assign_partitions(self, tps: Set[TP]) -> None: - # self._flush_mutations() self._cache.fill({tp.partition for tp in tps}) async def on_rebalance( From b2b0f8f0299c1e4f0cfe206c4bfee08de6e87092 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Apr 2023 10:46:17 +0200 Subject: [PATCH 391/616] fixed some stuff in mutation buffer --- faust/stores/bigtable.py | 42 +++++++++------------------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d525cd18b..156a874a0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -128,7 +128,7 @@ def _get_preload_rowset(self, partitions: Set[int]): return row_set, row_filter def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: - row, _ = self.get_mutation(bt_key) + row = self._mutation_rows.get(bt_key, None) row = row if row else self.bt_table.direct_row(bt_key) if value is None: row.delete() @@ -143,18 +143,15 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: self.total_mutation_count += 1 self.flush_mutations_if_timer_over_or_full() - def get_mutation( - self, bt_key: bytes - ) -> Tuple[Optional[BT.DirectRow], Optional[bytes]]: - row = self._mutation_rows.get(bt_key, None) - value = self._mutation_values.get(bt_key, None) - return row, value + def flush_mutations_if_timer_over_or_full(self, force=False) -> None: + if self.total_mutation_count == 0: + return - def flush_mutations_if_timer_over_or_full(self) -> None: five_min = 5 * 60 if ( self._last_flush + five_min < time.time() or self.total_mutation_count > 10_000 + or force ): self.bt_table.mutate_rows(self._mutation_rows.values()) self._mutation_values.clear() @@ -282,7 +279,6 @@ def __init__( try: self._bigtable_setup(table, options) self._cache = BigTableCacheManager(app, options, self.bt_table) - self.batcher = self.bt_table.mutations_batcher(flush_count=5000) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -484,18 +480,8 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - partitions_to_fill = set(self._active_partitions()) - if self._cache._value_cache is not None: - partitions_to_fill -= self._cache.filled_partitions - for key, value in self._cache.items(): - if value is not None: - yield self._get_faust_key(key), value - - if len(partitions_to_fill) == 0: - return - row_set = BT.RowSet() - for partition in partitions_to_fill: + for partition in self._active_partitions(): prefix_start = self._get_partition_prefix(partition) prefix_end = self._get_partition_prefix(partition + 1) row_set.add_row_range_from_keys(prefix_start, prefix_end) @@ -582,11 +568,8 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() - row = self.bt_table.read_row(offset_key, filter_=self.row_filter) - if row is None: - return None - else: - return int(self.bigtable_exrtact_row_data(row)) + offset = self._bigtable_get(offset_key) + return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: """Set the last persisted offset for this table. @@ -598,14 +581,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - row = self.bt_table.direct_row(offset_key) - row.set_cell( - COLUMN_FAMILY_ID, - COLUMN_NAME, - str(offset).encode(), - ) - self.batcher.mutate(row) - + self._bigtable_mutate(offset_key, str(offset).encode()) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From 782c2da03b894dec18778aeec6ea3e630bd42e9a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Apr 2023 10:47:06 +0200 Subject: [PATCH 392/616] remove items method --- faust/stores/bigtable.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 156a874a0..118678d70 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -195,12 +195,6 @@ def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: self._value_cache[bt_key] = value - def items(self) -> Iterable[Tuple[bytes, bytes]]: - # Will very likely remove this method in the future - if self._value_cache is not None: - return self._value_cache.data.items() - return [] - def get_partition(self, user_key: bytes) -> int: return self._partition_cache[user_key] From 64f60d311d14178d71d80e8d2811aa6ff86eb480 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Apr 2023 11:06:36 +0200 Subject: [PATCH 393/616] added mutatation rows and values in init --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 118678d70..64a95da59 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -115,6 +115,8 @@ def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self._init_value_cache(options) self.filled_partitions = set() self._last_flush = time.time() + self._mutation_values = {} + self._mutation_rows = {} self.total_mutation_count = 0 def _get_preload_rowset(self, partitions: Set[int]): From 18ceeb4c969e0f8ad39124a2891d008a8cad9a24 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Apr 2023 11:17:29 +0200 Subject: [PATCH 394/616] fixed list cast --- faust/stores/bigtable.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 64a95da59..987ff2c8e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -56,6 +56,7 @@ def get_current_partition(): COLUMN_FAMILY_ID = "FaustColumnFamily" COLUMN_NAME = "DATA" + class BigTableValueCache: """ This is a dictionary which is only filled once, after that, every @@ -155,7 +156,7 @@ def flush_mutations_if_timer_over_or_full(self, force=False) -> None: or self.total_mutation_count > 10_000 or force ): - self.bt_table.mutate_rows(self._mutation_rows.values()) + self.bt_table.mutate_rows(list(self._mutation_rows.values())) self._mutation_values.clear() self._mutation_rows.clear() self.total_mutation_count = 0 @@ -312,9 +313,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): ) self.bt_table.create( column_families={ - COLUMN_FAMILY_ID: BT.column_family.MaxVersionsGCRule( - 1 - ) + COLUMN_FAMILY_ID: BT.column_family.MaxVersionsGCRule(1) } ) else: From 2987e47257284068c927e251dda0e48785b58181 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 28 Apr 2023 11:18:18 +0200 Subject: [PATCH 395/616] removed force --- faust/stores/bigtable.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 987ff2c8e..19a303493 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -146,7 +146,7 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: self.total_mutation_count += 1 self.flush_mutations_if_timer_over_or_full() - def flush_mutations_if_timer_over_or_full(self, force=False) -> None: + def flush_mutations_if_timer_over_or_full(self) -> None: if self.total_mutation_count == 0: return @@ -154,7 +154,6 @@ def flush_mutations_if_timer_over_or_full(self, force=False) -> None: if ( self._last_flush + five_min < time.time() or self.total_mutation_count > 10_000 - or force ): self.bt_table.mutate_rows(list(self._mutation_rows.values())) self._mutation_values.clear() From 675afe5c2c8ac7bdaa9df8854b6a215959009232 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 23 May 2023 11:50:54 +0200 Subject: [PATCH 396/616] fixed tests and moved mutation buffer to cache --- faust/stores/bigtable.py | 19 ++-- tests/unit/stores/test_bigtable.py | 149 +++++++---------------------- 2 files changed, 44 insertions(+), 124 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 19a303493..a624b2213 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -146,19 +146,20 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: self.total_mutation_count += 1 self.flush_mutations_if_timer_over_or_full() - def flush_mutations_if_timer_over_or_full(self) -> None: - if self.total_mutation_count == 0: - return + def flush(self, ): + if self.total_mutation_count > 0: + self.bt_table.mutate_rows(list(self._mutation_rows.values())) + self._mutation_values.clear() + self._mutation_rows.clear() + self.total_mutation_count = 0 + def flush_mutations_if_timer_over_or_full(self) -> None: five_min = 5 * 60 if ( self._last_flush + five_min < time.time() or self.total_mutation_count > 10_000 ): - self.bt_table.mutate_rows(list(self._mutation_rows.values())) - self._mutation_values.clear() - self._mutation_rows.clear() - self.total_mutation_count = 0 + self.flush() def fill(self, partitions: Set[int]): start = time.time() @@ -480,13 +481,13 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: prefix_end = self._get_partition_prefix(partition + 1) row_set.add_row_range_from_keys(prefix_start, prefix_end) + # Write all mutations to bigtable + self._cache.flush() for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): faust_key = self._get_faust_key(row.row_key) value = self.bigtable_exrtact_row_data(row) - if self._cache._value_cache is not None: - self._cache.set(row.row_key, value) yield faust_key, value except Exception as ex: self.log.error( diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 2a583a8c6..2060aa1f6 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -9,6 +9,8 @@ BigTableCacheManager, BigTableStore, BigTableValueCache, + COLUMN_FAMILY_ID, + COLUMN_NAME, ) from faust.types.tuples import TP @@ -124,20 +126,17 @@ def test_init(self): assert cache.data == {} assert cache.ttl == -1 assert cache.ttl_over is False - assert cache.is_complete is True # Test with custom size cache = BigTableValueCache(size=123) assert isinstance(cache.data, LRUCache) assert cache.data.limit == 123 - assert cache.is_complete is False # Test with custom ttl cache = BigTableValueCache(ttl=123) assert cache.data == {} assert cache.ttl == 123 assert cache.ttl_over is False - assert cache.is_complete is False def test__set_del_len_and_getitem(self): cache = BigTableValueCache() @@ -311,7 +310,6 @@ def test_contains(self, manager): assert manager.contains(key_in) is True assert manager.contains(key_not_in) is False - manager._value_cache.is_complete = False assert manager.contains(key_in) is True assert manager.contains(key_not_in) is False @@ -329,7 +327,6 @@ def test_contains_any(self, manager): assert manager.contains_any({key_in, key_not_in}) is True assert manager.contains_any({key_not_in}) is False - manager._value_cache.is_complete = False assert manager.contains_any({key_in, key_not_in}) is True assert manager.contains_any({key_not_in}) is False @@ -381,7 +378,6 @@ async def test_bigtable_set_options_default(self, bt_imports): bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") BigTableStore._set_options(self_mock, options={}) - assert self_mock.column_name == "DATA" assert self_mock.offset_key_prefix == "offset_partitiion:" assert self_mock.row_filter == "a_filter" @@ -409,10 +405,8 @@ def from_bt_key(key): options = { BigTableStore.BT_TABLE_NAME_GENERATOR_KEY: name_lambda, BigTableStore.BT_OFFSET_KEY_PREFIX: "offset_test", - BigTableStore.BT_COLUMN_NAME_KEY: "name_test", } BigTableStore._set_options(self_mock, options) - assert self_mock.column_name == "name_test" assert self_mock.offset_key_prefix == "offset_test" assert self_mock.row_filter == "a_filter" assert self_mock.table_name_generator == name_lambda @@ -458,7 +452,6 @@ def table_name_gen(table): instance_mock.table.assert_called_once_with(self_mock.bt_table_name) table_mock.create.assert_not_called() - assert self_mock.column_family_id == "FaustColumnFamily" assert return_value is None # Test with no existing table @@ -473,9 +466,8 @@ def table_name_gen(table): ) instance_mock.table.assert_called_once_with(self_mock.bt_table_name) table_mock.create.assert_called_once_with( - column_families={self_mock.column_family_id: "a_rule"} + column_families={"FaustColumnFamily": "a_rule"} ) - assert self_mock.column_family_id == "FaustColumnFamily" assert return_value is None @pytest.fixture() @@ -489,8 +481,6 @@ def store(self, bt_imports): "bigtable://", MagicMock(), MagicMock(), options=options ) store.bt_table = BigTableMock() - store.batcher.mutate = MagicMock(wraps=store.batcher.mutate) - store.batcher.flush = MagicMock(wraps=store.batcher.flush) return store def test_bigtable_bigtable_get_on_empty(self, store): @@ -498,7 +488,6 @@ def test_bigtable_bigtable_get_on_empty(self, store): store._cache.set = MagicMock() return_value = store._bigtable_get(self.TEST_KEY1) store._cache.contains.assert_called_with(self.TEST_KEY1) - store.batcher.flush.assert_called_once() store.bt_table.read_row.assert_called_once_with( self.TEST_KEY1, filter_="a_filter" ) @@ -585,7 +574,7 @@ def test_bigtable_delete(self, store): store.bt_table.direct_row = MagicMock(return_value=row_mock) store._cache.set = MagicMock() - store._bigtable_del(self.TEST_KEY1) + store._bigtable_mutate(self.TEST_KEY1, None) store._cache.set.assert_called_once_with(self.TEST_KEY1, None) def test_bigtable_set(self, store): @@ -594,20 +583,13 @@ def test_bigtable_set(self, store): store.bt_table.direct_row = MagicMock(return_value=row_mock) store._cache.set = MagicMock(return_value=None) - store.batcher = MagicMock() - store.batcher.mutate = MagicMock() + store._cache.submit_mutation = MagicMock(return_value=None) + store._bigtable_mutate(self.TEST_KEY1, self.TEST_KEY1) + store._bigtable_mutate(self.TEST_KEY1, self.TEST_KEY1) - store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) - store._bigtable_set(self.TEST_KEY1, self.TEST_KEY1) - - store.bt_table.direct_row.assert_called_with(self.TEST_KEY1) store._cache.set.assert_called_with(self.TEST_KEY1, self.TEST_KEY1) - row_mock.set_cell.assert_called_with( - store.column_family_id, - store.column_name, - self.TEST_KEY1, - ) - assert store.batcher.mutate.call_count == 2 + store._cache.submit_mutation.assert_called_with(self.TEST_KEY1, self.TEST_KEY1) + def test_maybe_get_partition_from_message(self, store): event_mock = MagicMock() @@ -731,11 +713,11 @@ def test_set(self, store): faust.stores.bigtable.get_current_partition = MagicMock( return_value=partition ) - store._bigtable_set = MagicMock() + store._bigtable_mutate = MagicMock() store._cache.set_partition = MagicMock() store._set(self.TEST_KEY1, b"a_value") key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) - store._bigtable_set.assert_called_once_with( + store._bigtable_mutate.assert_called_once_with( key_with_partition, b"a_value" ) store._cache.set_partition.assert_called_once_with( @@ -745,14 +727,14 @@ def test_set(self, store): def test_del(self, store): store._cache._partition_cache = {self.TEST_KEY1: 19} store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) - store._bigtable_del = MagicMock() + store._bigtable_mutate = MagicMock() store._del(self.TEST_KEY1) calls = [ - call(store._get_bigtable_key(self.TEST_KEY1, 1)), - call(store._get_bigtable_key(self.TEST_KEY1, 3)), - call(store._get_bigtable_key(self.TEST_KEY1, 19)), + call(store._get_bigtable_key(self.TEST_KEY1, 1), None), + call(store._get_bigtable_key(self.TEST_KEY1, 3), None), + call(store._get_bigtable_key(self.TEST_KEY1, 19), None), ] - store._bigtable_del.assert_has_calls(calls) + store._bigtable_mutate.assert_has_calls(calls) assert store._cache._partition_cache == {} def test_active_partitions(self, store): @@ -779,36 +761,20 @@ def test_active_partitions(self, store): assert list(range(store.app.conf.topic_partitions)) == all_res def test_iteritems(self, store): - keys_in_store = [] - k1 = store._get_bigtable_key(self.TEST_KEY1, 1) - k3 = store._get_bigtable_key(self.TEST_KEY3, 3) - # Fill cache - cache_data = { - k1: "CACHE_VALUE1", - k3: "CACHE_VALUE3", - } - - store._cache._value_cache.data = cache_data - store.bt_table.add_test_data(keys_in_store) store._active_partitions = MagicMock(return_value=[1, 3]) - store._cache.fill = MagicMock() + store._cache.flush = MagicMock(wraps=store._cache.flush) store.bt_table.read_rows = MagicMock() - all_res = sorted(store._iteritems()) - store._cache.fill.assert_not_called() - assert all_res == [ - (self.TEST_KEY1, "CACHE_VALUE1"), - (self.TEST_KEY3, "CACHE_VALUE3"), - ] - store._cache.fill.reset_mock() - all_res = sorted(store._iteritems()) + _ = sorted(store._iteritems()) + store._cache.flush.assert_called_once() + _ = sorted(store._iteritems()) assert store.bt_table.read_rows.call_count == 2 store._cache.filled_partitions = {1, 3} store._active_partitions = MagicMock(return_value={1, 3}) - all_res = sorted(store._iteritems()) + _ = sorted(store._iteritems()) # No new calls, we just return what's in the cache - assert store.bt_table.read_rows.call_count == 2 + assert store.bt_table.read_rows.call_count == 3 def test_iterkeys(self, store): values = [("K1", "V1"), ("K2", "V2")] @@ -829,63 +795,16 @@ def test_get_offset_key(self, store): tp = TP("AAAA", 19) assert store.get_offset_key(tp)[-2:] == "19" - def test_persisted_offset(self, store): - tp = TP("AAAA", 19) - store.get_offset_key = MagicMock(return_value=123) - store.bt_table.add_test_data([123]) - assert store.persisted_offset(tp) == 123 - def test_set_persisted_offset(self, store): tp = TP("a_topic", 19) - - store._bigtable_set = MagicMock() + store._bigtable_mutate = MagicMock(wraps=store._bigtable_mutate) store._cache.flush_if_timer_over = MagicMock(return_value=False) expected_offset_key = store.get_offset_key(tp).encode() store.set_persisted_offset(tp, 123) - store._bigtable_set.assert_called_with( + store._bigtable_mutate.assert_called_with( expected_offset_key, str(123).encode() ) - - def test_persist_changelog_batch(self, store): - # Scenario 1: no failure - store.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(0)] * 10 - ) - store.log = MagicMock() - store.log.error = MagicMock() - store.set_persisted_offset = MagicMock() - tp1 = TP("offset1", 10) - tp2 = TP("offset2", 10) - tp3 = TP("offset3", 10) - offset_batch = { - tp1: 111, - tp2: 222, - tp3: 333, - } - store._persist_changelog_batch( - ["row1", "row2", "etc..."], offset_batch - ) - store.bt_table.mutate_rows.assert_called_with( - ["row1", "row2", "etc..."] - ) - - assert store.set_persisted_offset.call_count == len(offset_batch) - store.set_persisted_offset.assert_called_with(tp3, 333, recovery=True) - store.log.error.assert_not_called() - - # Scenario 2: all failure - store.set_persisted_offset.reset_mock() - store.bt_table.mutate_rows.reset_mock() - store.bt_table.mutate_rows = MagicMock( - return_value=[MyTestResponse(404)] - ) - store._persist_changelog_batch( - ["row1", "row2", "etc..."], offset_batch - ) - # FIXME: I'm not sure if we want that behaviour. - # Question: What should happen on a failed mutated row in recovery. - store.set_persisted_offset.assert_called() - store.log.error.assert_called() + assert store.persisted_offset(tp) == 123 def test_apply_changelog_batch(self, store): row_mock = MagicMock() @@ -893,7 +812,10 @@ def test_apply_changelog_batch(self, store): row_mock.set_cell = MagicMock() store.bt_table.direct_row = MagicMock(return_value=row_mock) store.bt_table.mutate_rows = MagicMock() + store._bigtable_mutate = MagicMock() store.set_persisted_offset = MagicMock() + store._cache.submit_mutation = MagicMock() + store._cache.set = MagicMock() class TestMessage: def __init__(self, value, key, tp, offset): @@ -916,9 +838,7 @@ def __init__(self, message): TestEvent(TestMessage("a", self.TEST_KEY1, tp, 2)), ] store.apply_changelog_batch(messages, lambda x: x, lambda x: x) - assert store.bt_table.direct_row.call_count == 5 - row_mock.delete.assert_called_once() - assert row_mock.set_cell.call_count == 4 + assert store._bigtable_mutate.call_count == 5 assert store.set_persisted_offset.call_count == 2 def test_revoke_partitions(self, store): @@ -940,18 +860,18 @@ def test_mutation_flush(self, store): def real_set_scenario(key, value, offset): store._set(key, value) - store._bigtable_set.reset_mock() + store._bigtable_mutate.reset_mock() store.set_persisted_offset(TEST_TP, offset) return offset + 1 def real_del_scenario(key, offset): store._del(key) - store._bigtable_set.reset_mock() + store._bigtable_mutate.reset_mock() store.set_persisted_offset(TEST_TP, offset) return offset + 1 def assert_offset_persisted(offset): - store._bigtable_set.assert_called_with( + store._bigtable_mutate.assert_called_with( OFFSET_KEY, str(offset).encode() ) @@ -959,15 +879,14 @@ def assert_offset_persisted(offset): row_mock.row_key = b"\x00TEST_KEY1" store.bt_table.direct_row = MagicMock(return_value=row_mock) - store.batcher.mutate = MagicMock(wraps=store.batcher.mutate) - store._bigtable_set = MagicMock(wraps=store._bigtable_set) + store._bigtable_mutate = MagicMock(wraps=store._bigtable_mutate) partition = 0 faust.stores.bigtable.get_current_partition = MagicMock( return_value=partition ) store._cache.set_partition = MagicMock() res = store._contains(self.TEST_KEY1) - store._bigtable_set.assert_not_called() + store._bigtable_mutate.assert_not_called() assert res is False TEST_OFFSET = real_set_scenario( From 0b7a24c51fa0d5ae0c4b00dc649cbce164763f9f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 24 May 2023 14:14:11 +0200 Subject: [PATCH 397/616] fixed flush mutations --- faust/stores/bigtable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a624b2213..af5f3c976 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -148,10 +148,13 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: def flush(self, ): if self.total_mutation_count > 0: - self.bt_table.mutate_rows(list(self._mutation_rows.values())) + mutation_list = list(self._mutation_rows.items()) + self.bt_table.mutate_rows(mutation_list) self._mutation_values.clear() self._mutation_rows.clear() + self.log.info(f"BigTableStore flushed {self.total_mutation_count} mutations") self.total_mutation_count = 0 + self._last_flush = time.time() def flush_mutations_if_timer_over_or_full(self) -> None: five_min = 5 * 60 From a91b65c1bb64ae0259797fd8cf564e62b11e6b54 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 24 May 2023 15:54:18 +0200 Subject: [PATCH 398/616] flush after apply changelog batch --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index af5f3c976..7e6668fb7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -614,6 +614,7 @@ def apply_changelog_batch( for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) + self._cache.flush() async def backup_partition( self, From 63013afc8c8d43623d2a98fd52370ee5e191e26d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 24 May 2023 16:13:33 +0200 Subject: [PATCH 399/616] added warning with traceback --- faust/tables/recovery.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/faust/tables/recovery.py b/faust/tables/recovery.py index 83e9f2051..ea2ca1990 100644 --- a/faust/tables/recovery.py +++ b/faust/tables/recovery.py @@ -853,7 +853,9 @@ async def detect_aborted_tx(): self._standbys_span = None self.tables.on_standbys_ready() except Exception as ex: - logger.warning(f"Error in recovery {ex}") + # logger.warning(f"Error in recovery {ex}") + # Write a warning with traceback to the log + logger.warning("Error in recovery", exc_info=ex) def flush_buffers(self) -> None: """Flush changelog buffers.""" From 63e830824af96f4e100f8751fe5a84ce7d0f9369 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 24 May 2023 16:26:43 +0200 Subject: [PATCH 400/616] fixed mutation rows --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7e6668fb7..1eebf96a1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -148,7 +148,7 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: def flush(self, ): if self.total_mutation_count > 0: - mutation_list = list(self._mutation_rows.items()) + mutation_list = list(self._mutation_rows.values()) self.bt_table.mutate_rows(mutation_list) self._mutation_values.clear() self._mutation_rows.clear() From f3d9ee191f84c7dd881d88817ae709de5e4b6e17 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 25 May 2023 10:11:20 +0200 Subject: [PATCH 401/616] changed min flush mutations --- faust/stores/bigtable.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1eebf96a1..239557654 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -147,12 +147,19 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: self.flush_mutations_if_timer_over_or_full() def flush(self, ): - if self.total_mutation_count > 0: + # TODO: Make this a setting. + # High values reduce the writes + if self.total_mutation_count > 200: mutation_list = list(self._mutation_rows.values()) + actual_row_count = len(mutation_list) self.bt_table.mutate_rows(mutation_list) self._mutation_values.clear() self._mutation_rows.clear() - self.log.info(f"BigTableStore flushed {self.total_mutation_count} mutations") + self.log.info( + f"BigTableStore: flushed {self.total_mutation_count}" + f"mutations for {self.bt_table.name} table" + f"({actual_row_count=})" + ) self.total_mutation_count = 0 self._last_flush = time.time() From bb66e30351d7826d13cab94c5fb8cbf648505eb4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 25 May 2023 11:09:37 +0200 Subject: [PATCH 402/616] refactored log --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 239557654..47236efd3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -157,8 +157,8 @@ def flush(self, ): self._mutation_rows.clear() self.log.info( f"BigTableStore: flushed {self.total_mutation_count}" - f"mutations for {self.bt_table.name} table" - f"({actual_row_count=})" + f" mutations for {self.bt_table.name} table" + f" ({actual_row_count=})" ) self.total_mutation_count = 0 self._last_flush = time.time() From c1736e42acfc69962a49e54e80cc92acc50f90c0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 25 May 2023 16:05:42 +0200 Subject: [PATCH 403/616] try faster iteritems --- faust/stores/bigtable.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 47236efd3..6f3ffdae1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -195,6 +195,11 @@ def fill(self, partitions: Set[int]): f"{self.bt_table.name}:{partitions} in {end-start}s" ) + def iteritems(self): + if self._value_cache is None or len(self.filled_partitions) == 0: + return [] + return self._value_cache.data.items() + def get(self, bt_key: bytes) -> Optional[bytes]: if self._mutation_rows.get(bt_key) is not None: return self._mutation_values[bt_key] @@ -485,20 +490,35 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: + + # Write all mutations to bigtable + start = time.time() + self._cache.flush() + if self._cache._value_cache is not None: + for k, v in self._cache.iteritems(): + faust_key = self._get_faust_key(k) + yield faust_key, v + + left_over_partitions = set(self._active_partitions()) + left_over_partitions.difference_update(self._cache.filled_partitions) row_set = BT.RowSet() - for partition in self._active_partitions(): + + for partition in left_over_partitions: prefix_start = self._get_partition_prefix(partition) prefix_end = self._get_partition_prefix(partition + 1) row_set.add_row_range_from_keys(prefix_start, prefix_end) - # Write all mutations to bigtable - self._cache.flush() for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): faust_key = self._get_faust_key(row.row_key) value = self.bigtable_exrtact_row_data(row) + self._cache.set(row.row_key, value) yield faust_key, value + self._cache.filled_partitions.update(left_over_partitions) + end = time.time() + self.log.info(f"Time taken for _iteritems {end - start}s") + except Exception as ex: self.log.error( f"FaustBigtableException Error " From 27977cb8c7ec90b20e22ffe5425ddfb6c8a87504 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 25 May 2023 16:50:35 +0200 Subject: [PATCH 404/616] don't write offsets in the cache --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6f3ffdae1..ce1878afb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -606,7 +606,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - self._bigtable_mutate(offset_key, str(offset).encode()) + self._cache.submit_mutation(offset_key, str(offset).encode()) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From e21b1587bf5cc0c8caa0e9d8e0fd3a5a9bfb4972 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 25 May 2023 17:10:43 +0200 Subject: [PATCH 405/616] removed set after get --- faust/stores/bigtable.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ce1878afb..2afdb41d7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -351,8 +351,6 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: value = None else: value = self.bigtable_exrtact_row_data(res) - # Has no effect if value_cace is None - self._cache.set(bt_key, value) return value def _bigtable_get_range( From 3ef552dc824298291901f6ccfa2b00e9ab252083 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 25 May 2023 18:40:50 +0200 Subject: [PATCH 406/616] fixed iterkeys --- faust/stores/bigtable.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2afdb41d7..f4b217128 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -491,14 +491,17 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: # Write all mutations to bigtable start = time.time() + partitions = set(self._active_partitions()) self._cache.flush() if self._cache._value_cache is not None: + self._cache.fill(partitions) for k, v in self._cache.iteritems(): faust_key = self._get_faust_key(k) yield faust_key, v - left_over_partitions = set(self._active_partitions()) - left_over_partitions.difference_update(self._cache.filled_partitions) + left_over_partitions = partitions.difference(self._cache.filled_partitions) + if len(left_over_partitions) == 0: + return row_set = BT.RowSet() for partition in left_over_partitions: From cdb75364707393e8c50a504ac16bec1590e04e4e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 26 May 2023 08:53:46 +0200 Subject: [PATCH 407/616] fixed iteritems again and added logs for revoking and assigning partitions --- faust/stores/bigtable.py | 52 ++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f4b217128..ab1e3445f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -172,7 +172,6 @@ def flush_mutations_if_timer_over_or_full(self) -> None: self.flush() def fill(self, partitions: Set[int]): - start = time.time() partitions = partitions - self.filled_partitions if len(partitions) == 0: return @@ -189,16 +188,6 @@ def fill(self, partitions: Set[int]): self.log.info(f"BigTableStore fill failed for {partitions=}") raise e self.filled_partitions.update(partitions) - end = time.time() - self.log.info( - "BigTableStore: Finished fill for table" - f"{self.bt_table.name}:{partitions} in {end-start}s" - ) - - def iteritems(self): - if self._value_cache is None or len(self.filled_partitions) == 0: - return [] - return self._value_cache.data.items() def get(self, bt_key: bytes) -> Optional[bytes]: if self._mutation_rows.get(bt_key) is not None: @@ -488,23 +477,15 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: - - # Write all mutations to bigtable start = time.time() partitions = set(self._active_partitions()) - self._cache.flush() - if self._cache._value_cache is not None: - self._cache.fill(partitions) - for k, v in self._cache.iteritems(): - faust_key = self._get_faust_key(k) - yield faust_key, v - - left_over_partitions = partitions.difference(self._cache.filled_partitions) - if len(left_over_partitions) == 0: - return row_set = BT.RowSet() - for partition in left_over_partitions: + if len(partitions) == 0: + return + + self._cache.flush() + for partition in partitions: prefix_start = self._get_partition_prefix(partition) prefix_end = self._get_partition_prefix(partition + 1) row_set.add_row_range_from_keys(prefix_start, prefix_end) @@ -516,9 +497,9 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: value = self.bigtable_exrtact_row_data(row) self._cache.set(row.row_key, value) yield faust_key, value - self._cache.filled_partitions.update(left_over_partitions) + self._cache.filled_partitions.update(partitions) end = time.time() - self.log.info(f"Time taken for _iteritems {end - start}s") + self.log.info(f"BigTableStore: Time taken for _iteritems {end - start}s") except Exception as ex: self.log.error( @@ -676,12 +657,25 @@ def revoke_partitions(self, tps: Set[TP]) -> None: tps: Set of topic partitions that we should no longer be serving data for. """ - for tp in tps: - self._cache.delete_partition(tp.partition) + partitions = {tp.partition for tp in tps} + for partition in partitions: + self._cache.delete_partition(partition) + + self.log.info( + f"BigTableStore: Revoked partitions {partitions=} for table" + f" {self.table_name}" + ) gc.collect() def assign_partitions(self, tps: Set[TP]) -> None: - self._cache.fill({tp.partition for tp in tps}) + start = time.time() + partitions = {tp.partition for tp in tps} + self._cache.fill(partitions) + end = time.time() + self.log.info( + "BigTableStore: Finished assign_partitions for table" + f" {self.table_name}:{partitions} in {end-start}s" + ) async def on_rebalance( self, From 50974bae4ee7682d175b6deff764faf8429cb77f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 26 May 2023 08:56:45 +0200 Subject: [PATCH 408/616] mitigate race conditions while logging --- faust/stores/bigtable.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ab1e3445f..6ba6067b9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -146,21 +146,19 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: self.total_mutation_count += 1 self.flush_mutations_if_timer_over_or_full() - def flush(self, ): + def flush(self): # TODO: Make this a setting. # High values reduce the writes if self.total_mutation_count > 200: - mutation_list = list(self._mutation_rows.values()) - actual_row_count = len(mutation_list) - self.bt_table.mutate_rows(mutation_list) - self._mutation_values.clear() - self._mutation_rows.clear() self.log.info( f"BigTableStore: flushed {self.total_mutation_count}" f" mutations for {self.bt_table.name} table" - f" ({actual_row_count=})" ) + mutation_list = list(self._mutation_rows.values()) self.total_mutation_count = 0 + self.bt_table.mutate_rows(mutation_list) + self._mutation_values.clear() + self._mutation_rows.clear() self._last_flush = time.time() def flush_mutations_if_timer_over_or_full(self) -> None: From e3dba5835152f3275ee98963a5b1ad3f7961b7f0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 26 May 2023 09:24:28 +0200 Subject: [PATCH 409/616] adjusted logging and faster iteritems if value cache is enabled --- faust/stores/bigtable.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6ba6067b9..81b6f29f1 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -151,7 +151,7 @@ def flush(self): # High values reduce the writes if self.total_mutation_count > 200: self.log.info( - f"BigTableStore: flushed {self.total_mutation_count}" + f"[^----BigTableStore: bigtable:] flushed {self.total_mutation_count}" f" mutations for {self.bt_table.name} table" ) mutation_list = list(self._mutation_rows.values()) @@ -477,12 +477,20 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: start = time.time() partitions = set(self._active_partitions()) - row_set = BT.RowSet() if len(partitions) == 0: return self._cache.flush() + if self._cache._value_cache is not None: + # If there is a value cache, we can return the values + self._cache.fill(partitions) + for k, v in self._cache._value_cache.data.items(): + faust_key = self._get_faust_key(k) + yield faust_key, v + return + + row_set = BT.RowSet() for partition in partitions: prefix_start = self._get_partition_prefix(partition) prefix_end = self._get_partition_prefix(partition + 1) @@ -497,7 +505,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: yield faust_key, value self._cache.filled_partitions.update(partitions) end = time.time() - self.log.info(f"BigTableStore: Time taken for _iteritems {end - start}s") + self.log.info(f"{self.table_name} _iteritems took {end - start}s") except Exception as ex: self.log.error( @@ -660,7 +668,7 @@ def revoke_partitions(self, tps: Set[TP]) -> None: self._cache.delete_partition(partition) self.log.info( - f"BigTableStore: Revoked partitions {partitions=} for table" + f"Revoked partitions {partitions=} for table" f" {self.table_name}" ) gc.collect() @@ -671,7 +679,7 @@ def assign_partitions(self, tps: Set[TP]) -> None: self._cache.fill(partitions) end = time.time() self.log.info( - "BigTableStore: Finished assign_partitions for table" + "Finished assign_partitions for table" f" {self.table_name}:{partitions} in {end-start}s" ) From 0ebe9c54ca67e6ea90d6e326bbc95eaaf858c306 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 26 May 2023 09:43:59 +0200 Subject: [PATCH 410/616] always flush. --- faust/stores/bigtable.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 81b6f29f1..62320c3df 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -147,9 +147,7 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: self.flush_mutations_if_timer_over_or_full() def flush(self): - # TODO: Make this a setting. - # High values reduce the writes - if self.total_mutation_count > 200: + if self.total_mutation_count > 0: self.log.info( f"[^----BigTableStore: bigtable:] flushed {self.total_mutation_count}" f" mutations for {self.bt_table.name} table" From 9744db4b8afd49a980e0196f3d09bf00ec1e1009 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 26 May 2023 09:44:24 +0200 Subject: [PATCH 411/616] move reset of total mutation count up --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 62320c3df..544d852a9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -152,8 +152,8 @@ def flush(self): f"[^----BigTableStore: bigtable:] flushed {self.total_mutation_count}" f" mutations for {self.bt_table.name} table" ) - mutation_list = list(self._mutation_rows.values()) self.total_mutation_count = 0 + mutation_list = list(self._mutation_rows.values()) self.bt_table.mutate_rows(mutation_list) self._mutation_values.clear() self._mutation_rows.clear() From 2d7984bd39554f3d3ea707bd71aa13e1ecab59a4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 26 May 2023 09:54:29 +0200 Subject: [PATCH 412/616] fixed logging for iteritems --- faust/stores/bigtable.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 544d852a9..31ab0894c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -486,22 +486,21 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: for k, v in self._cache._value_cache.data.items(): faust_key = self._get_faust_key(k) yield faust_key, v - return + else: + row_set = BT.RowSet() + for partition in partitions: + prefix_start = self._get_partition_prefix(partition) + prefix_end = self._get_partition_prefix(partition + 1) + row_set.add_row_range_from_keys(prefix_start, prefix_end) - row_set = BT.RowSet() - for partition in partitions: - prefix_start = self._get_partition_prefix(partition) - prefix_end = self._get_partition_prefix(partition + 1) - row_set.add_row_range_from_keys(prefix_start, prefix_end) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - faust_key = self._get_faust_key(row.row_key) - value = self.bigtable_exrtact_row_data(row) - self._cache.set(row.row_key, value) - yield faust_key, value - self._cache.filled_partitions.update(partitions) + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + faust_key = self._get_faust_key(row.row_key) + value = self.bigtable_exrtact_row_data(row) + self._cache.set(row.row_key, value) + yield faust_key, value + self._cache.filled_partitions.update(partitions) end = time.time() self.log.info(f"{self.table_name} _iteritems took {end - start}s") From f0ddb7da05c890827502b963cc0ff4ef595a0065 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 7 Jun 2023 09:48:07 +0200 Subject: [PATCH 413/616] in get range we want to be aware of the fact, that deletes are done on all partitions --- faust/stores/bigtable.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 31ab0894c..21fb4f581 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -209,8 +209,9 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - if self._mutation_rows.get(bt_key, None) is not None: + if self._mutation_rows.get(bt_key) is not None: return True + if self._value_cache is not None: return bt_key in self._value_cache.keys() return False @@ -218,12 +219,14 @@ def contains(self, bt_key: bytes) -> Optional[bool]: def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: if not self._mutation_rows.keys().isdisjoint(key_set): return True + if self._value_cache is not None: if not self._value_cache.keys().isdisjoint(key_set): return True return False def delete_partition(self, partition: int): + self.flush() if self._value_cache is not None: keys = set(self._value_cache.keys()) for k in keys: @@ -330,7 +333,7 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: if self._cache.contains(bt_key): return self._cache.get(bt_key) else: - + self.flush() res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: value = None @@ -343,13 +346,18 @@ def _bigtable_get_range( ) -> Tuple[Optional[bytes], Optional[bytes]]: # first search cache: rows = BT.RowSet() + found_deleted = False for bt_key in bt_keys: if self._cache.contains(bt_key): value = self._cache.get(bt_key) - return bt_key, value - + if value is not None: + return bt_key, value + else: + found_deleted = True else: rows.add_row_key(bt_key) + if found_deleted: + return None, None for row in self.bt_table.read_rows( row_set=rows, filter_=BT.CellsColumnLimitFilter(1) From 53b2baefc29e8ae38493eb15166a701c191baaf0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 7 Jun 2023 10:04:48 +0200 Subject: [PATCH 414/616] remove flush on cache miss --- faust/stores/bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 21fb4f581..cbf040c55 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -333,7 +333,6 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: if self._cache.contains(bt_key): return self._cache.get(bt_key) else: - self.flush() res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: value = None From 812bada9ecb80f19ee25b36413caf311621303f2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Jun 2023 11:52:01 +0200 Subject: [PATCH 415/616] hardcode partition cache --- faust/stores/bigtable.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index cbf040c55..f60b76805 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -112,7 +112,7 @@ class BigTableCacheManager: def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table - self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) + self._partition_cache = LRUCache(limit=10_000) self._init_value_cache(options) self.filled_partitions = set() self._last_flush = time.time() @@ -345,18 +345,21 @@ def _bigtable_get_range( ) -> Tuple[Optional[bytes], Optional[bytes]]: # first search cache: rows = BT.RowSet() - found_deleted = False - for bt_key in bt_keys: - if self._cache.contains(bt_key): - value = self._cache.get(bt_key) - if value is not None: - return bt_key, value - else: - found_deleted = True - else: - rows.add_row_key(bt_key) - if found_deleted: - return None, None + # found_deleted = False + # for bt_key in bt_keys: + # if self._cache.contains(bt_key): + # value = self._cache.get(bt_key) + # if value is not None: + # return bt_key, value + # else: + # found_deleted = True + # else: + # rows.add_row_key(bt_key) + # + # if found_deleted: + # return None, None + self._cache.flush() + self.log.info(f"BigTableStore: _bigtable_get_range {bt_keys=} for {self.table.name}") for row in self.bt_table.read_rows( row_set=rows, filter_=BT.CellsColumnLimitFilter(1) @@ -672,8 +675,7 @@ def revoke_partitions(self, tps: Set[TP]) -> None: self._cache.delete_partition(partition) self.log.info( - f"Revoked partitions {partitions=} for table" - f" {self.table_name}" + f"Revoked partitions {partitions=} for table" f" {self.table_name}" ) gc.collect() From 85c08fe31219ec0933808faf58a4efaaaea1df20 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Jun 2023 11:52:32 +0200 Subject: [PATCH 416/616] added todo --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f60b76805..7cf978f3f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -112,6 +112,7 @@ class BigTableCacheManager: def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table + # TODO: Use settings to configure self._partition_cache = LRUCache(limit=10_000) self._init_value_cache(options) self.filled_partitions = set() From a6e472036f0293b840dacb87fb83eb4f11fad183 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Jun 2023 13:13:23 +0200 Subject: [PATCH 417/616] removed logs --- faust/stores/bigtable.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7cf978f3f..56123fc3d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -149,10 +149,6 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: def flush(self): if self.total_mutation_count > 0: - self.log.info( - f"[^----BigTableStore: bigtable:] flushed {self.total_mutation_count}" - f" mutations for {self.bt_table.name} table" - ) self.total_mutation_count = 0 mutation_list = list(self._mutation_rows.values()) self.bt_table.mutate_rows(mutation_list) @@ -360,7 +356,7 @@ def _bigtable_get_range( # if found_deleted: # return None, None self._cache.flush() - self.log.info(f"BigTableStore: _bigtable_get_range {bt_keys=} for {self.table.name}") + # self.log.info(f"BigTableStore: _bigtable_get_range {bt_keys=} for {self.table.name}") for row in self.bt_table.read_rows( row_set=rows, filter_=BT.CellsColumnLimitFilter(1) From 929c9209fb770e09c8e3b46e36f453c265ee4904 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Jun 2023 13:48:23 +0200 Subject: [PATCH 418/616] fixed get partition from cache --- faust/stores/bigtable.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 56123fc3d..f85242a32 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -407,6 +407,12 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: partition = self._maybe_get_partition_from_message() + partitions_from_key = set() + if partition is None: + partitions_from_key = set(self._partitions_for_key(key)) + if len(partitions_from_key) == 1: + partition = partitions_from_key.pop() + if partition is not None: key_with_partition = self._get_bigtable_key( key, partition=partition @@ -418,7 +424,7 @@ def _get(self, key: bytes) -> Optional[bytes]: return value else: keys = set() - for partition in self._partitions_for_key(key): + for partition in partitions_from_key: key_with_partition = self._get_bigtable_key( key, partition=partition ) From 9054a9189fd519828f9bc35149267024cba98b8e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Jun 2023 14:15:26 +0200 Subject: [PATCH 419/616] fixed wrong use of partition cache --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f85242a32..a8c85f670 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -424,6 +424,7 @@ def _get(self, key: bytes) -> Optional[bytes]: return value else: keys = set() + partitions_from_key = set(self._partitions_for_key(key)) for partition in partitions_from_key: key_with_partition = self._get_bigtable_key( key, partition=partition From 600cd53c0ede3ac8646f11fb714ef6a657dd71fc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Jun 2023 14:56:21 +0200 Subject: [PATCH 420/616] don't use get many --- faust/stores/bigtable.py | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a8c85f670..b315f7eac 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -406,37 +406,21 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: + partitions = [] partition = self._maybe_get_partition_from_message() - partitions_from_key = set() - if partition is None: - partitions_from_key = set(self._partitions_for_key(key)) - if len(partitions_from_key) == 1: - partition = partitions_from_key.pop() - if partition is not None: + partitions = [partition] + else: + partitions = self._partitions_for_key(key) + + for partition in partitions: key_with_partition = self._get_bigtable_key( key, partition=partition ) - value = self._bigtable_get(key_with_partition) if value is not None: self._cache.set_partition(key, partition) return value - else: - keys = set() - partitions_from_key = set(self._partitions_for_key(key)) - for partition in partitions_from_key: - key_with_partition = self._get_bigtable_key( - key, partition=partition - ) - keys.add(key_with_partition) - - key_with_partition, value = self._bigtable_get_range(keys) - if value is not None: - partition = key_with_partition[0] - self._cache.set_partition(key, partition) - return value - return None except Exception as ex: self.log.error( From e656e8f83a78f67722401f54d8e2a0aab8d0f315 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 13 Jun 2023 15:24:39 +0200 Subject: [PATCH 421/616] faster get from cache and smaller bitable get function --- faust/stores/bigtable.py | 58 +++++++++++----------------------------- 1 file changed, 16 insertions(+), 42 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b315f7eac..0afe61295 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -327,46 +327,10 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: - if self._cache.contains(bt_key): - return self._cache.get(bt_key) - else: - res = self.bt_table.read_row(bt_key, filter_=self.row_filter) - if res is None: - value = None - else: - value = self.bigtable_exrtact_row_data(res) - return value - - def _bigtable_get_range( - self, bt_keys: Set[bytes] - ) -> Tuple[Optional[bytes], Optional[bytes]]: - # first search cache: - rows = BT.RowSet() - # found_deleted = False - # for bt_key in bt_keys: - # if self._cache.contains(bt_key): - # value = self._cache.get(bt_key) - # if value is not None: - # return bt_key, value - # else: - # found_deleted = True - # else: - # rows.add_row_key(bt_key) - # - # if found_deleted: - # return None, None - self._cache.flush() - # self.log.info(f"BigTableStore: _bigtable_get_range {bt_keys=} for {self.table.name}") - - for row in self.bt_table.read_rows( - row_set=rows, filter_=BT.CellsColumnLimitFilter(1) - ): - # First hit will return - val = self.bigtable_exrtact_row_data(row) - return row.row_key, val - - # Not found - return None, None + res = self.bt_table.read_row(bt_key, filter_=self.row_filter) + if res is None: + return None + return self.bigtable_exrtact_row_data(res) def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): # Update the value cache if any exists @@ -413,11 +377,21 @@ def _get(self, key: bytes) -> Optional[bytes]: else: partitions = self._partitions_for_key(key) + # First we search the cache for partition in partitions: - key_with_partition = self._get_bigtable_key( + bt_key = self._get_bigtable_key( + key, partition=partition + ) + if self._cache.contains(bt_key): + self._cache.set_partition(key, partition) + return self._cache.get(bt_key) + + # Then we search the bigtable + for partition in partitions: + bt_key = self._get_bigtable_key( key, partition=partition ) - value = self._bigtable_get(key_with_partition) + value = self._bigtable_get(bt_key) if value is not None: self._cache.set_partition(key, partition) return value From 71db27838a27e57f04e6450420d5f0fb5e1148cd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Jun 2023 11:21:53 +0200 Subject: [PATCH 422/616] adjust parameters and removed value cache completely --- tests/unit/stores/test_bigtable.py | 86 ++++-------------------------- 1 file changed, 9 insertions(+), 77 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 2060aa1f6..d13956edf 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -6,11 +6,8 @@ import faust from faust.stores.bigtable import ( - BigTableCacheManager, + BigTableCache, BigTableStore, - BigTableValueCache, - COLUMN_FAMILY_ID, - COLUMN_NAME, ) from faust.types.tuples import TP @@ -119,72 +116,7 @@ def add_test_data(self, keys): self.data[k] = k -class TestBigTableValueCache: - def test_init(self): - # Test defaults - cache = BigTableValueCache() - assert cache.data == {} - assert cache.ttl == -1 - assert cache.ttl_over is False - - # Test with custom size - cache = BigTableValueCache(size=123) - assert isinstance(cache.data, LRUCache) - assert cache.data.limit == 123 - - # Test with custom ttl - cache = BigTableValueCache(ttl=123) - assert cache.data == {} - assert cache.ttl == 123 - assert cache.ttl_over is False - - def test__set_del_len_and_getitem(self): - cache = BigTableValueCache() - # Scenario ttl not over and no clear - cache._maybe_ttl_clear = MagicMock() - assert len(cache) == 0 - cache["123"] = 123 - assert cache._maybe_ttl_clear.call_count == 1 - assert len(cache) == 1 - assert cache["123"] == 123 - assert cache._maybe_ttl_clear.call_count == 2 - del cache["123"] - assert cache._maybe_ttl_clear.call_count == 2 - assert len(cache) == 0 - - def test__set_del_len_and_getitem_after_tttl(self): - cache = BigTableValueCache() - # Scenario ttl over and clear - cache._maybe_ttl_clear = MagicMock() - cache.ttl_over = True - assert len(cache) == 0 - cache["123"] = 123 - assert cache._maybe_ttl_clear.call_count == 1 - assert len(cache) == 0 - assert "123" not in cache.keys() - assert cache._maybe_ttl_clear.call_count == 1 - del cache["123"] - assert cache._maybe_ttl_clear.call_count == 1 - assert len(cache) == 0 - - def test_maybe_ttl_clear(self): - time.time = MagicMock(return_value=0) - cache = BigTableValueCache(ttl=5) - assert cache.init_ts == 0 - - cache._maybe_ttl_clear() - assert cache.ttl_over is False # Nothing cleared - - time.time.return_value = 5 - cache._maybe_ttl_clear() - assert cache.ttl_over is False # Nothing cleared, edge case - - time.time.return_value = 6 - cache._maybe_ttl_clear() - assert cache.ttl_over is True # Nothing cleared, edge case - - -class TestBigTableCacheManager: +class TestBigTableCache: def test_default__init__(self): bigtable_mock = BigTableMock() app_mock = MagicMock() @@ -192,7 +124,7 @@ def test_default__init__(self): app_mock.conf.table_key_index_size = 123 time.time = MagicMock(return_value=0) - test_manager = BigTableCacheManager(MagicMock(), {}, bigtable_mock) + test_manager = BigTableCache(MagicMock(), {}, bigtable_mock) assert test_manager.bt_table == bigtable_mock assert test_manager._value_cache is None @@ -203,14 +135,14 @@ def test_iscomplete__init__(self): app_mock.conf.table_key_index_size = 2 time.time = MagicMock(return_value=0) options = { - BigTableStore.VALUE_CACHE_ENABLE_KEY: True, + BigTableStore.BT_VALUE_CACHE_ENABLE_KEY: True, } - test_manager = BigTableCacheManager( + test_manager = BigTableCache( MagicMock(), options, bigtable_mock ) assert test_manager.bt_table == bigtable_mock - assert isinstance(test_manager._value_cache, BigTableValueCache) + assert isinstance(test_manager._value_cache, dict) @pytest.fixture() def bt_imports(self): @@ -234,9 +166,9 @@ def manager(self, bt_imports): app_mock.conf.table_key_index_size = 123 options = { - BigTableStore.VALUE_CACHE_ENABLE_KEY: True, + BigTableStore.BT_VALUE_CACHE_ENABLE_KEY: True, } - manager = BigTableCacheManager( + manager = BigTableCache( MagicMock(), options, bigtable_mock ) manager._partition_cache = {} @@ -476,7 +408,7 @@ def store(self, bt_imports): options = {} options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" options[BigTableStore.BT_PROJECT_KEY] = "bt_project" - options[BigTableStore.VALUE_CACHE_ENABLE_KEY] = True + options[BigTableStore.BT_VALUE_CACHE_ENABLE_KEY] = True store = BigTableStore( "bigtable://", MagicMock(), MagicMock(), options=options ) From 43750a530eda9b7206fc0f46b7fbca5b08750eff Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Jun 2023 11:44:58 +0200 Subject: [PATCH 423/616] refactored settings --- faust/stores/bigtable.py | 100 +++++++++------------------------------ 1 file changed, 23 insertions(+), 77 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0afe61295..63be59fc3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -57,65 +57,22 @@ def get_current_partition(): COLUMN_NAME = "DATA" -class BigTableValueCache: - """ - This is a dictionary which is only filled once, after that, every - successful access to a key, will remove it. - """ - - data: Union[Dict, LRUCache] - - def __init__(self, ttl=-1, size: Optional[int] = None) -> None: - self.log = logging.getLogger(self.__class__.__name__) - if size is not None: - self.data = LRUCache(limit=size) - else: - self.data = {} - self.ttl = ttl - self.ttl_over = False - self.init_ts = int(time.time()) - - def __len__(self): - return len(self.data) - - def __getitem__(self, key): - if not self.ttl_over: - res = self.data[key] - self._maybe_ttl_clear() - return res - - def __setitem__(self, key, value) -> None: - self._maybe_ttl_clear() - if not self.ttl_over: - self.data[key] = value - - def __delitem__(self, key): - self.data.pop(key, None) - - def _maybe_ttl_clear(self): - if self.ttl != -1 and not self.ttl_over: - now = int(time.time()) - if now > self.init_ts + self.ttl: - self.data = {} - self.ttl_over = True - - def keys(self): - return self.data.keys() - - -class BigTableCacheManager: +class BigTableCache: _partition_cache: LRUCache[bytes, int] - _value_cache: Optional[BigTableValueCache] + _value_cache: Optional[Dict] _mutation_values: Dict[bytes, Optional[bytes]] _mutation_rows: Dict[bytes, BT.DirectRow] def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: self.log = logging.getLogger(__name__) self.bt_table: BT.Table = bt_table - # TODO: Use settings to configure - self._partition_cache = LRUCache(limit=10_000) + self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) self._init_value_cache(options) self.filled_partitions = set() + + self._flush_freq = options.get( + BigTableStore.BT_MUTATION_FLUSH_FREQ_SECONDS_KEY, 5 * 60 + ) self._last_flush = time.time() self._mutation_values = {} self._mutation_rows = {} @@ -133,7 +90,10 @@ def _get_preload_rowset(self, partitions: Set[int]): def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: row = self._mutation_rows.get(bt_key, None) - row = row if row else self.bt_table.direct_row(bt_key) + if row is None: + self.total_mutation_count += 1 + row = self.bt_table.direct_row(bt_key) + if value is None: row.delete() else: @@ -144,7 +104,6 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: ) self._mutation_values[bt_key] = value self._mutation_rows[bt_key] = row - self.total_mutation_count += 1 self.flush_mutations_if_timer_over_or_full() def flush(self): @@ -157,9 +116,8 @@ def flush(self): self._last_flush = time.time() def flush_mutations_if_timer_over_or_full(self) -> None: - five_min = 5 * 60 if ( - self._last_flush + five_min < time.time() + self._last_flush + self._flush_freq < time.time() or self.total_mutation_count > 10_000 ): self.flush() @@ -228,19 +186,12 @@ def delete_partition(self, partition: int): keys = set(self._value_cache.keys()) for k in keys: if k[0] == partition: - del self._value_cache[k] + self._value_cache.pop(k, None) self._partition_cache.pop(k[1:], None) - def _init_value_cache( - self, options - ) -> Optional[Union[LRUCache, BigTableValueCache]]: - enable = options.get(BigTableStore.VALUE_CACHE_ENABLE_KEY, False) - if enable: - ttl = options.get( - BigTableStore.VALUE_CACHE_INVALIDATION_TIME_KEY, -1 - ) - size = options.get(BigTableStore.VALUE_CACHE_SIZE_KEY, None) - self._value_cache = BigTableValueCache(ttl=ttl, size=size) + def _init_value_cache(self, options): + if options.get(BigTableStore.BT_VALUE_CACHE_ENABLE_KEY, False): + self._value_cache = {} else: self._value_cache = None @@ -251,7 +202,7 @@ class BigTableStore(base.SerializedStore): client: BT.Client instance: BT.Instance bt_table: BT.Table - _cache: BigTableCacheManager + _cache: BigTableCache _db_lock: asyncio.Lock BT_COLUMN_NAME_KEY = "bt_column_name_key" @@ -259,9 +210,8 @@ class BigTableStore(base.SerializedStore): BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_PROJECT_KEY = "bt_project_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" - VALUE_CACHE_INVALIDATION_TIME_KEY = "value_cache_invalidation_time_key" - VALUE_CACHE_SIZE_KEY = "value_cache_size_key" - VALUE_CACHE_ENABLE_KEY = "value_cache_enable_key" + BT_VALUE_CACHE_ENABLE_KEY = "bt_value_cache_enable_key" + BT_MUTATION_FLUSH_FREQ_SECONDS_KEY = "bt_mutation_flush_freq_seconds_key" def __init__( self, @@ -275,7 +225,7 @@ def __init__( self._log_counter = 0 try: self._bigtable_setup(table, options) - self._cache = BigTableCacheManager(app, options, self.bt_table) + self._cache = BigTableCache(app, options, self.bt_table) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -379,18 +329,14 @@ def _get(self, key: bytes) -> Optional[bytes]: # First we search the cache for partition in partitions: - bt_key = self._get_bigtable_key( - key, partition=partition - ) + bt_key = self._get_bigtable_key(key, partition=partition) if self._cache.contains(bt_key): self._cache.set_partition(key, partition) return self._cache.get(bt_key) # Then we search the bigtable for partition in partitions: - bt_key = self._get_bigtable_key( - key, partition=partition - ) + bt_key = self._get_bigtable_key(key, partition=partition) value = self._bigtable_get(bt_key) if value is not None: self._cache.set_partition(key, partition) @@ -455,7 +401,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: if self._cache._value_cache is not None: # If there is a value cache, we can return the values self._cache.fill(partitions) - for k, v in self._cache._value_cache.data.items(): + for k, v in self._cache._value_cache.items(): faust_key = self._get_faust_key(k) yield faust_key, v else: From 987b7820171f89ab3503e547617c9a2dbac6d0b2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Jun 2023 13:23:08 +0200 Subject: [PATCH 424/616] added options to logging --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 63be59fc3..805cd142c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -259,7 +259,7 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): if not self.bt_table.exists(): logging.getLogger(__name__).info( f"BigTableStore: Making new bigtablestore with {self.bt_table_name=} " - f"for {table.name}" + f"for {table.name} with {options=}" ) self.bt_table.create( column_families={ From 61d7e96f9a2e26cfab95432c39d3e75c1170921c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Jun 2023 13:35:36 +0200 Subject: [PATCH 425/616] added loggingh --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 805cd142c..60fac2a35 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -269,7 +269,8 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): else: logging.getLogger(__name__).info( "BigTableStore: Using existing " - f"bigtablestore with {self.bt_table_name=} for {table.name}" + f"bigtablestore with {self.bt_table_name=} for {table.name} " + f"with {options=}" ) @staticmethod From 7a979067d1dd51e018b73a48cd75895a37fef6d5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Jun 2023 14:12:40 +0200 Subject: [PATCH 426/616] removed log --- faust/stores/bigtable.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 60fac2a35..94e6811ed 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -141,13 +141,11 @@ def fill(self, partitions: Set[int]): self.filled_partitions.update(partitions) def get(self, bt_key: bytes) -> Optional[bytes]: - if self._mutation_rows.get(bt_key) is not None: + if self._mutation_rows.get(bt_key, None) is not None: return self._mutation_values[bt_key] + if self._value_cache is not None: return self._value_cache[bt_key] - raise NotImplementedError( - f"get is not implemented for {self.__class__} with no value cache" - ) def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: @@ -164,7 +162,7 @@ def contains(self, bt_key: bytes) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - if self._mutation_rows.get(bt_key) is not None: + if self._mutation_rows.get(bt_key, None) is not None: return True if self._value_cache is not None: From 33fabab6d65079e5d7c846dca49b021647164501 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 14 Jun 2023 14:54:00 +0200 Subject: [PATCH 427/616] fixed tests --- tests/unit/stores/test_bigtable.py | 121 ++++++----------------------- 1 file changed, 25 insertions(+), 96 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index d13956edf..2934a20e8 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -207,8 +207,7 @@ def test_get(self, manager): manager.get(key_not_in) manager._value_cache = None - with pytest.raises(NotImplementedError): - manager.get(key_in) + assert manager.get(key_in) is None def test_set(self, manager): key_1 = b"\x13AAA" @@ -416,89 +415,12 @@ def store(self, bt_imports): return store def test_bigtable_bigtable_get_on_empty(self, store): - store._cache.contains = MagicMock(return_value=False) - store._cache.set = MagicMock() return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.contains.assert_called_with(self.TEST_KEY1) store.bt_table.read_row.assert_called_once_with( self.TEST_KEY1, filter_="a_filter" ) - store._cache.set.assert_called_with(self.TEST_KEY1, None) assert return_value is None - store._cache.contains = MagicMock(return_value=None) - return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.contains.assert_called_with(self.TEST_KEY1) - store.bt_table.read_row.assert_called_with( - self.TEST_KEY1, filter_="a_filter" - ) - assert return_value is None - - def test_bigtable_bigtable_get_cache_miss(self, store): - store._cache.contains = MagicMock(return_value=None) - store.bt_table.add_test_data([self.TEST_KEY1]) - return_value = store._bigtable_get(self.TEST_KEY1) - store.bt_table.read_row.assert_called_once_with( - self.TEST_KEY1, filter_="a_filter" - ) - assert return_value == self.TEST_KEY1 - - def test_bigtable_bigtable_get_cache_hit(self, store): - store.bt_table.add_test_data([self.TEST_KEY1]) - store._cache.contains = MagicMock(return_value=True) - store._cache.get = MagicMock(return_value=b"cache_res") - return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_called_once_with(self.TEST_KEY1) - store.bt_table.read_row.assert_not_called() - assert return_value == b"cache_res" - - store._cache.contains = MagicMock(return_value=False) - store._cache.get = MagicMock() - return_value = store._bigtable_get(self.TEST_KEY1) - store._cache.get.assert_not_called() - store.bt_table.read_row.assert_called_once_with( - self.TEST_KEY1, filter_="a_filter" - ) - assert return_value == self.TEST_KEY1 - - def test_bigtable_get_range_cache_miss(self, store): - store._cache.contains = MagicMock(return_value=None) - - test_keys_in = [self.TEST_KEY1, self.TEST_KEY3] # order is important - test_keys_not_in = { - self.TEST_KEY2, - } - - return_value = store._bigtable_get_range(test_keys_not_in) - store.bt_table.read_rows.assert_called() - store.bt_table.read_rows.reset_mock() - assert return_value == (None, None) - - store.bt_table.add_test_data(test_keys_in) - return_value = store._bigtable_get_range(test_keys_in) - store.bt_table.read_rows.assert_called() - store.bt_table.read_rows.reset_mock() - assert return_value == ( - self.TEST_KEY1, - self.TEST_KEY1, - ) or return_value == ((self.TEST_KEY3, self.TEST_KEY3)) - - def test_bigtable_get_range_cache_hit(self, store): - store._cache.get = MagicMock(return_value="cache_res") - store._cache.contains = MagicMock(return_value=False) - result_value = store._bigtable_get_range( - [self.TEST_KEY1, self.TEST_KEY3] - ) - store.bt_table.read_rows.assert_not_called - assert result_value == (None, None) - - store._cache.contains = MagicMock(return_value=True) - result_value = store._bigtable_get_range( - [self.TEST_KEY1, self.TEST_KEY3] - ) - store.bt_table.read_rows.assert_not_called - assert result_value == (self.TEST_KEY1, "cache_res") - def test_bigtable_delete(self, store): row_mock = MagicMock() row_mock.commit = MagicMock() @@ -614,7 +536,7 @@ def test_get_with_known_partition(self, store): def test_get_with_unknown_partition(self, store): store._maybe_get_partition_from_message = MagicMock(return_value=None) - store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) + store._partitions_for_key = MagicMock(return_value=[19, 3, 19]) store._cache.set_partition = MagicMock() keys_searched = set() keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 1)) @@ -623,20 +545,19 @@ def test_get_with_unknown_partition(self, store): # Scenario: Found key_of_value = store._get_bigtable_key(self.TEST_KEY1, 19) - store._bigtable_get_range = MagicMock( - return_value=(key_of_value, b"a_value") + store._bigtable_get = MagicMock( + return_value=b"a_value" ) res = store._get(self.TEST_KEY1) store._partitions_for_key.assert_called_once_with(self.TEST_KEY1) - store._bigtable_get_range.assert_called_once_with(keys_searched) store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, 19) assert res == b"a_value" store._cache.set_partition.reset_mock() # Scenario: Not Found - store._bigtable_get_range = MagicMock(return_value=(None, None)) + store._bigtable_get = MagicMock(return_value=None) res = store._get(self.TEST_KEY1) - store._bigtable_get_range.assert_called_once_with(keys_searched) + assert store._bigtable_get.call_count == 3 store._cache.set_partition.assert_not_called() assert res is None @@ -692,21 +613,28 @@ def test_active_partitions(self, store): all_res = list(res) assert list(range(store.app.conf.topic_partitions)) == all_res - def test_iteritems(self, store): + def test_iteritems_with_cache(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) store._cache.flush = MagicMock(wraps=store._cache.flush) + store._cache.fill = MagicMock() store.bt_table.read_rows = MagicMock() _ = sorted(store._iteritems()) store._cache.flush.assert_called_once() + store._cache.fill.assert_called_once() _ = sorted(store._iteritems()) - assert store.bt_table.read_rows.call_count == 2 + store.bt_table.read_rows.assert_not_called() + + def test_iteritems(self, store): + store._active_partitions = MagicMock(return_value=[1, 3]) + store.bt_table.read_rows = MagicMock() + store._cache._value_cache = None + store._cache.flush = MagicMock(wraps=store._cache.flush) - store._cache.filled_partitions = {1, 3} - store._active_partitions = MagicMock(return_value={1, 3}) _ = sorted(store._iteritems()) - # No new calls, we just return what's in the cache - assert store.bt_table.read_rows.call_count == 3 + store._cache.flush.assert_called_once() + _ = sorted(store._iteritems()) + store.bt_table.read_rows.assert_called() def test_iterkeys(self, store): values = [("K1", "V1"), ("K2", "V2")] @@ -729,14 +657,13 @@ def test_get_offset_key(self, store): def test_set_persisted_offset(self, store): tp = TP("a_topic", 19) - store._bigtable_mutate = MagicMock(wraps=store._bigtable_mutate) + store._cache.submit_mutation = MagicMock(wraps=store._cache.submit_mutation) store._cache.flush_if_timer_over = MagicMock(return_value=False) expected_offset_key = store.get_offset_key(tp).encode() store.set_persisted_offset(tp, 123) - store._bigtable_mutate.assert_called_with( + store._cache.submit_mutation.assert_called_with( expected_offset_key, str(123).encode() ) - assert store.persisted_offset(tp) == 123 def test_apply_changelog_batch(self, store): row_mock = MagicMock() @@ -798,12 +725,12 @@ def real_set_scenario(key, value, offset): def real_del_scenario(key, offset): store._del(key) - store._bigtable_mutate.reset_mock() + store._cache.submit_mutation.reset_mock() store.set_persisted_offset(TEST_TP, offset) return offset + 1 def assert_offset_persisted(offset): - store._bigtable_mutate.assert_called_with( + store._cache.submit_mutation.assert_called_with( OFFSET_KEY, str(offset).encode() ) @@ -812,6 +739,8 @@ def assert_offset_persisted(offset): store.bt_table.direct_row = MagicMock(return_value=row_mock) store._bigtable_mutate = MagicMock(wraps=store._bigtable_mutate) + store._cache.submit_mutation = MagicMock() + partition = 0 faust.stores.bigtable.get_current_partition = MagicMock( return_value=partition From 4273b960b4218a67bd592555724d238ae5dfcdea Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Jun 2023 12:44:00 +0200 Subject: [PATCH 428/616] fixed get with wrong partitions --- faust/stores/bigtable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 94e6811ed..36a854c7e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -108,6 +108,7 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: def flush(self): if self.total_mutation_count > 0: + self.log.info(f"Flushing {self.total_mutation_count} mutations") self.total_mutation_count = 0 mutation_list = list(self._mutation_rows.values()) self.bt_table.mutate_rows(mutation_list) @@ -324,7 +325,7 @@ def _get(self, key: bytes) -> Optional[bytes]: if partition is not None: partitions = [partition] else: - partitions = self._partitions_for_key(key) + partitions = set(self._partitions_for_key(key)) # First we search the cache for partition in partitions: @@ -495,6 +496,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() + self._cache.flush() offset = self._bigtable_get(offset_key) return int(offset) if offset is not None else None From 235df0a7bd67e8e82e52b8f670960844d24784fb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Jun 2023 13:30:51 +0200 Subject: [PATCH 429/616] removed contains any --- faust/stores/bigtable.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 36a854c7e..57d0b7a08 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -170,15 +170,6 @@ def contains(self, bt_key: bytes) -> Optional[bool]: return bt_key in self._value_cache.keys() return False - def contains_any(self, key_set: Set[bytes]) -> Optional[bool]: - if not self._mutation_rows.keys().isdisjoint(key_set): - return True - - if self._value_cache is not None: - if not self._value_cache.keys().isdisjoint(key_set): - return True - return False - def delete_partition(self, partition: int): self.flush() if self._value_cache is not None: From 7ffad62cacb283a2befcf823bb379a34157d1e35 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 15 Jun 2023 13:37:47 +0200 Subject: [PATCH 430/616] fixed wrong return value if no partition was found and cache was accessed --- faust/stores/bigtable.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 57d0b7a08..6a70cc3be 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -158,15 +158,16 @@ def get_partition(self, user_key: bytes) -> int: def set_partition(self, user_key: bytes, partition: int): self._partition_cache[user_key] = partition - def contains(self, bt_key: bytes) -> Optional[bool]: + def contains(self, bt_key: bytes, with_delete=False) -> Optional[bool]: """ If we return None here, this means, that no assumption about the current key can be made. """ - if self._mutation_rows.get(bt_key, None) is not None: + if with_delete: + return self._mutation_rows.get(bt_key, None) is not None + elif self._mutation_values.get(bt_key, None) is not None: return True - - if self._value_cache is not None: + elif self._value_cache is not None: return bt_key in self._value_cache.keys() return False @@ -319,11 +320,18 @@ def _get(self, key: bytes) -> Optional[bytes]: partitions = set(self._partitions_for_key(key)) # First we search the cache + found_deleted = False for partition in partitions: bt_key = self._get_bigtable_key(key, partition=partition) if self._cache.contains(bt_key): - self._cache.set_partition(key, partition) - return self._cache.get(bt_key) + value = self._cache.get(bt_key) + if value is not None: + self._cache.set_partition(key, partition) + return value + else: + found_deleted = True + if found_deleted: + return None # Then we search the bigtable for partition in partitions: From fe901fbef2b50a9e79681c9d4a4dc9e08c6ba7ba Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 16 Jun 2023 08:55:26 +0200 Subject: [PATCH 431/616] catch exception on mutation flush --- faust/stores/bigtable.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6a70cc3be..51feadf38 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -109,12 +109,22 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: def flush(self): if self.total_mutation_count > 0: self.log.info(f"Flushing {self.total_mutation_count} mutations") - self.total_mutation_count = 0 + # Order is important here, we don't want to repeat mutations + self._last_flush = time.time() mutation_list = list(self._mutation_rows.values()) - self.bt_table.mutate_rows(mutation_list) + try: + self.bt_table.mutate_rows(mutation_list) + except Exception as e: + self.log.warning( + f"BigTableStore: flush failed with {e} " + "will try again on next flush. " + "No data is lost." + ) + return + + self.total_mutation_count = 0 self._mutation_values.clear() self._mutation_rows.clear() - self._last_flush = time.time() def flush_mutations_if_timer_over_or_full(self) -> None: if ( @@ -323,7 +333,7 @@ def _get(self, key: bytes) -> Optional[bytes]: found_deleted = False for partition in partitions: bt_key = self._get_bigtable_key(key, partition=partition) - if self._cache.contains(bt_key): + if self._cache.contains(bt_key, with_deleted=True): value = self._cache.get(bt_key) if value is not None: self._cache.set_partition(key, partition) From b14ade7c31d0a032db0e2fd5d012a396d5959487 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 16 Jun 2023 09:00:33 +0200 Subject: [PATCH 432/616] changed default partition name --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 51feadf38..fc7de71f3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -244,7 +244,7 @@ def _set_options(self, options) -> None: ) self.row_filter = BT.CellsColumnLimitFilter(1) self.offset_key_prefix = options.get( - BigTableStore.BT_OFFSET_KEY_PREFIX, "offset_partitiion:" + BigTableStore.BT_OFFSET_KEY_PREFIX, "==>offset_for_partition_" ) def _bigtable_setup(self, table, options: Dict[str, Any]): From 223bdbf541765e86bbc0055ba23b07bf19adfaf5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 16 Jun 2023 11:04:54 +0200 Subject: [PATCH 433/616] fixed contains call --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fc7de71f3..a9aa094ba 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -333,7 +333,7 @@ def _get(self, key: bytes) -> Optional[bytes]: found_deleted = False for partition in partitions: bt_key = self._get_bigtable_key(key, partition=partition) - if self._cache.contains(bt_key, with_deleted=True): + if self._cache.contains(bt_key, with_delete=True): value = self._cache.get(bt_key) if value is not None: self._cache.set_partition(key, partition) From 266e245d4adda9400b163950e467469ee1fce664 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 16 Jun 2023 11:11:58 +0200 Subject: [PATCH 434/616] =?UTF-8?q?adjusted=20tests=20=E2=9C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 2934a20e8..82d898f95 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -248,23 +248,6 @@ def test_contains(self, manager): assert manager.contains(key_in) is False assert manager.contains(key_not_in) is False - def test_contains_any(self, manager): - # Adding the key here is sufficient, because the cache gets filled - key_in = b"\x13AAA" - key_not_in = b"\x13BBB" - manager.bt_table.add_test_data({key_in}) - - manager.fill({19}) - assert manager.contains_any({key_in, key_not_in}) is True - assert manager.contains_any({key_not_in}) is False - - assert manager.contains_any({key_in, key_not_in}) is True - assert manager.contains_any({key_not_in}) is False - - manager._value_cache = None - assert manager.contains_any({key_in, key_not_in}) is False - assert manager.contains_any({key_not_in}) is False - def test_delete_partition(self, manager): partition = 19 row_mock = MagicMock() @@ -309,7 +292,7 @@ async def test_bigtable_set_options_default(self, bt_imports): bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") BigTableStore._set_options(self_mock, options={}) - assert self_mock.offset_key_prefix == "offset_partitiion:" + assert self_mock.offset_key_prefix == "==>offset_for_partition_" assert self_mock.row_filter == "a_filter" @pytest.mark.asyncio @@ -536,7 +519,7 @@ def test_get_with_known_partition(self, store): def test_get_with_unknown_partition(self, store): store._maybe_get_partition_from_message = MagicMock(return_value=None) - store._partitions_for_key = MagicMock(return_value=[19, 3, 19]) + store._partitions_for_key = MagicMock(return_value=[3, 19]) store._cache.set_partition = MagicMock() keys_searched = set() keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 1)) @@ -544,7 +527,6 @@ def test_get_with_unknown_partition(self, store): keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 19)) # Scenario: Found - key_of_value = store._get_bigtable_key(self.TEST_KEY1, 19) store._bigtable_get = MagicMock( return_value=b"a_value" ) @@ -557,7 +539,7 @@ def test_get_with_unknown_partition(self, store): # Scenario: Not Found store._bigtable_get = MagicMock(return_value=None) res = store._get(self.TEST_KEY1) - assert store._bigtable_get.call_count == 3 + assert store._bigtable_get.call_count == 2 store._cache.set_partition.assert_not_called() assert res is None @@ -739,7 +721,7 @@ def assert_offset_persisted(offset): store.bt_table.direct_row = MagicMock(return_value=row_mock) store._bigtable_mutate = MagicMock(wraps=store._bigtable_mutate) - store._cache.submit_mutation = MagicMock() + store._cache.submit_mutation = MagicMock(wraps=store._cache.submit_mutation) partition = 0 faust.stores.bigtable.get_current_partition = MagicMock( From 108497ea842e481bc433f142707e883630a186ec Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Jun 2023 08:30:46 +0200 Subject: [PATCH 435/616] flush on all submitted mutations --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a9aa094ba..37231daf4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -129,7 +129,8 @@ def flush(self): def flush_mutations_if_timer_over_or_full(self) -> None: if ( self._last_flush + self._flush_freq < time.time() - or self.total_mutation_count > 10_000 + # Now we try to flush all the time + or self.total_mutation_count > 0 ): self.flush() From e3a1175b90b8723f849b3ed28b3f5efc2dbc5bdc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Jun 2023 09:38:52 +0200 Subject: [PATCH 436/616] update mutation size to 10k --- faust/stores/bigtable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 37231daf4..26a668a9a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -129,8 +129,7 @@ def flush(self): def flush_mutations_if_timer_over_or_full(self) -> None: if ( self._last_flush + self._flush_freq < time.time() - # Now we try to flush all the time - or self.total_mutation_count > 0 + or self.total_mutation_count > 10_000 ): self.flush() @@ -160,7 +159,7 @@ def get(self, bt_key: bytes) -> Optional[bytes]: return self._value_cache[bt_key] def set(self, bt_key: bytes, value: Optional[bytes]) -> None: - if self._value_cache is not None: + if self._value_cache is not None and value is not None: self._value_cache[bt_key] = value def get_partition(self, user_key: bytes) -> int: From 1050b15204805b553186e850fb83a6b3c44550e6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Jun 2023 09:40:27 +0200 Subject: [PATCH 437/616] only set if value is not None in value set --- faust/stores/bigtable.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 26a668a9a..be6b2c6af 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -159,8 +159,11 @@ def get(self, bt_key: bytes) -> Optional[bytes]: return self._value_cache[bt_key] def set(self, bt_key: bytes, value: Optional[bytes]) -> None: - if self._value_cache is not None and value is not None: - self._value_cache[bt_key] = value + if self._value_cache is not None: + if value is None: + self._value_cache.pop(bt_key, None) + else: + self._value_cache[bt_key] = value def get_partition(self, user_key: bytes) -> int: return self._partition_cache[user_key] From 2ad775c4fb7fc3020263f3f7d950f4e8e16807f2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Jun 2023 10:48:20 +0200 Subject: [PATCH 438/616] changed to 1000 mutations --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index be6b2c6af..79b9af349 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -129,7 +129,7 @@ def flush(self): def flush_mutations_if_timer_over_or_full(self) -> None: if ( self._last_flush + self._flush_freq < time.time() - or self.total_mutation_count > 10_000 + or self.total_mutation_count > 1_000 ): self.flush() From 632341576c77896b61fb093a55b54f597b9c2b62 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Jun 2023 12:54:17 +0200 Subject: [PATCH 439/616] top --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 79b9af349..be6b2c6af 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -129,7 +129,7 @@ def flush(self): def flush_mutations_if_timer_over_or_full(self) -> None: if ( self._last_flush + self._flush_freq < time.time() - or self.total_mutation_count > 1_000 + or self.total_mutation_count > 10_000 ): self.flush() From 8fb9ce527059a38046b3181c1f49585bb65e40c5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Jun 2023 14:14:25 +0200 Subject: [PATCH 440/616] push max mutations to the max --- faust/stores/bigtable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index be6b2c6af..9782608ee 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -91,8 +91,8 @@ def _get_preload_rowset(self, partitions: Set[int]): def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: row = self._mutation_rows.get(bt_key, None) if row is None: - self.total_mutation_count += 1 row = self.bt_table.direct_row(bt_key) + self.total_mutation_count += 1 if value is None: row.delete() @@ -129,7 +129,8 @@ def flush(self): def flush_mutations_if_timer_over_or_full(self) -> None: if ( self._last_flush + self._flush_freq < time.time() - or self.total_mutation_count > 10_000 + # Google allows a maximum of 100_000 mutattions + or self.total_mutation_count > 99_000 ): self.flush() From 8cb56fa28c76d9bb690518b4b162668d868aa4ef Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 22 Jun 2023 15:10:05 +0200 Subject: [PATCH 441/616] submit directly --- faust/stores/bigtable.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9782608ee..eece439d9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -89,11 +89,8 @@ def _get_preload_rowset(self, partitions: Set[int]): return row_set, row_filter def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: - row = self._mutation_rows.get(bt_key, None) - if row is None: - row = self.bt_table.direct_row(bt_key) - self.total_mutation_count += 1 - + # Directly submit + row = self.bt_table.direct_row(bt_key) if value is None: row.delete() else: @@ -102,9 +99,23 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: COLUMN_NAME, value, ) - self._mutation_values[bt_key] = value - self._mutation_rows[bt_key] = row - self.flush_mutations_if_timer_over_or_full() + row.commit() + # row = self._mutation_rows.get(bt_key, None) + # if row is None: + # row = self.bt_table.direct_row(bt_key) + # self.total_mutation_count += 1 +# + # if value is None: + # row.delete() + # else: + # row.set_cell( + # COLUMN_FAMILY_ID, + # COLUMN_NAME, + # value, + # ) + # self._mutation_values[bt_key] = value + # self._mutation_rows[bt_key] = row + # self.flush_mutations_if_timer_over_or_full() def flush(self): if self.total_mutation_count > 0: From 70592187b9e122d20a1cb9e19e9d8af89489b68b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Jun 2023 09:41:46 +0200 Subject: [PATCH 442/616] fixed contains of value cache --- faust/stores/bigtable.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index eece439d9..76448033a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -188,10 +188,10 @@ def contains(self, bt_key: bytes, with_delete=False) -> Optional[bool]: If we return None here, this means, that no assumption about the current key can be made. """ - if with_delete: - return self._mutation_rows.get(bt_key, None) is not None - elif self._mutation_values.get(bt_key, None) is not None: - return True + if self._mutation_rows.get(bt_key, None) is not None: + if with_delete: + return True + return self._mutation_values.get(bt_key, None) is not None elif self._value_cache is not None: return bt_key in self._value_cache.keys() return False From d007287433bd776264cf471971b8c3c3d8a5eddb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Jun 2023 09:59:15 +0200 Subject: [PATCH 443/616] try different approach on contains --- faust/stores/bigtable.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 76448033a..23b431733 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -172,10 +172,7 @@ def get(self, bt_key: bytes) -> Optional[bytes]: def set(self, bt_key: bytes, value: Optional[bytes]) -> None: if self._value_cache is not None: - if value is None: - self._value_cache.pop(bt_key, None) - else: - self._value_cache[bt_key] = value + self._value_cache[bt_key] = value def get_partition(self, user_key: bytes) -> int: return self._partition_cache[user_key] @@ -183,15 +180,13 @@ def get_partition(self, user_key: bytes) -> int: def set_partition(self, user_key: bytes, partition: int): self._partition_cache[user_key] = partition - def contains(self, bt_key: bytes, with_delete=False) -> Optional[bool]: + def contains(self, bt_key: bytes) -> Optional[bool]: """ If we return None here, this means, that no assumption about the current key can be made. """ if self._mutation_rows.get(bt_key, None) is not None: - if with_delete: - return True - return self._mutation_values.get(bt_key, None) is not None + return True elif self._value_cache is not None: return bt_key in self._value_cache.keys() return False @@ -348,7 +343,7 @@ def _get(self, key: bytes) -> Optional[bytes]: found_deleted = False for partition in partitions: bt_key = self._get_bigtable_key(key, partition=partition) - if self._cache.contains(bt_key, with_delete=True): + if self._cache.contains(bt_key): value = self._cache.get(bt_key) if value is not None: self._cache.set_partition(key, partition) @@ -426,6 +421,8 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: # If there is a value cache, we can return the values self._cache.fill(partitions) for k, v in self._cache._value_cache.items(): + if v is None: + continue faust_key = self._get_faust_key(k) yield faust_key, v else: From 95640b444cf371e72b3f6fc98555cfa39982afae Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Jun 2023 10:21:13 +0200 Subject: [PATCH 444/616] remove cache access entirely --- faust/stores/bigtable.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 23b431733..20c9bf034 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -340,18 +340,18 @@ def _get(self, key: bytes) -> Optional[bytes]: partitions = set(self._partitions_for_key(key)) # First we search the cache - found_deleted = False - for partition in partitions: - bt_key = self._get_bigtable_key(key, partition=partition) - if self._cache.contains(bt_key): - value = self._cache.get(bt_key) - if value is not None: - self._cache.set_partition(key, partition) - return value - else: - found_deleted = True - if found_deleted: - return None + # found_deleted = False + # for partition in partitions: + # bt_key = self._get_bigtable_key(key, partition=partition) + # if self._cache.contains(bt_key): + # value = self._cache.get(bt_key) + # if value is not None: + # self._cache.set_partition(key, partition) + # return value + # else: + # found_deleted = True + # if found_deleted: + # return None # Then we search the bigtable for partition in partitions: From e12ec731c7758de27617b14ab855bab48ac5d4a0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Jun 2023 11:44:11 +0200 Subject: [PATCH 445/616] fix something in asssign partitions where the standby partition was also written into the cache --- faust/stores/bigtable.py | 75 +++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 20c9bf034..10e5b7908 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -89,8 +89,11 @@ def _get_preload_rowset(self, partitions: Set[int]): return row_set, row_filter def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: - # Directly submit - row = self.bt_table.direct_row(bt_key) + row = self._mutation_rows.get(bt_key, None) + if row is None: + row = self.bt_table.direct_row(bt_key) + self.total_mutation_count += 1 + if value is None: row.delete() else: @@ -99,23 +102,9 @@ def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: COLUMN_NAME, value, ) - row.commit() - # row = self._mutation_rows.get(bt_key, None) - # if row is None: - # row = self.bt_table.direct_row(bt_key) - # self.total_mutation_count += 1 -# - # if value is None: - # row.delete() - # else: - # row.set_cell( - # COLUMN_FAMILY_ID, - # COLUMN_NAME, - # value, - # ) - # self._mutation_values[bt_key] = value - # self._mutation_rows[bt_key] = row - # self.flush_mutations_if_timer_over_or_full() + self._mutation_values[bt_key] = value + self._mutation_rows[bt_key] = row + self.flush_mutations_if_timer_over_or_full() def flush(self): if self.total_mutation_count > 0: @@ -254,7 +243,7 @@ def _set_options(self, options) -> None: ) self.row_filter = BT.CellsColumnLimitFilter(1) self.offset_key_prefix = options.get( - BigTableStore.BT_OFFSET_KEY_PREFIX, "==>offset_for_partition_" + BigTableStore.BT_OFFSET_KEY_PREFIX, "==>offset_for_partition_" ) def _bigtable_setup(self, table, options: Dict[str, Any]): @@ -298,7 +287,17 @@ def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): # Update the value cache if any exists self._cache.set(bt_key, value) # Update the bigtable. Mutations are batched - self._cache.submit_mutation(bt_key, value) + row = self.bt_table.direct_row(bt_key) + if value is None: + row.delete() + else: + row.set_cell( + COLUMN_FAMILY_ID, + COLUMN_NAME, + value, + ) + row.commit() + # self._cache.submit_mutation(bt_key, value) def _maybe_get_partition_from_message(self) -> Optional[int]: event = current_event() @@ -340,18 +339,18 @@ def _get(self, key: bytes) -> Optional[bytes]: partitions = set(self._partitions_for_key(key)) # First we search the cache - # found_deleted = False - # for partition in partitions: - # bt_key = self._get_bigtable_key(key, partition=partition) - # if self._cache.contains(bt_key): - # value = self._cache.get(bt_key) - # if value is not None: - # self._cache.set_partition(key, partition) - # return value - # else: - # found_deleted = True - # if found_deleted: - # return None + found_deleted = False + for partition in partitions: + bt_key = self._get_bigtable_key(key, partition=partition) + if self._cache.contains(bt_key): + value = self._cache.get(bt_key) + if value is not None: + self._cache.set_partition(key, partition) + return value + else: + found_deleted = True + if found_deleted: + return None # Then we search the bigtable for partition in partitions: @@ -611,7 +610,13 @@ def revoke_partitions(self, tps: Set[TP]) -> None: def assign_partitions(self, tps: Set[TP]) -> None: start = time.time() - partitions = {tp.partition for tp in tps} + + standby_tps = self.app.assignor.assigned_standbys() + my_topics = self.table.changelog_topic.topics + partitions = set() + for tp in tps: + if tp.topic in my_topics and tp not in standby_tps: + partitions.add(tp.partition) self._cache.fill(partitions) end = time.time() self.log.info( @@ -637,4 +642,4 @@ async def on_rebalance( """ async with self._db_lock: self.revoke_partitions(revoked) - self.assign_partitions(newly_assigned) + self.assign_partitions(assigned) From 1b44c27ef2373b56399e073ecc97b32771689d1a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Jun 2023 13:10:45 +0200 Subject: [PATCH 446/616] added logging in get requests --- faust/stores/bigtable.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 10e5b7908..ce319c6ee 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -346,6 +346,7 @@ def _get(self, key: bytes) -> Optional[bytes]: value = self._cache.get(bt_key) if value is not None: self._cache.set_partition(key, partition) + self.log.info(f"Found value for key in cache {key=} {value=}") return value else: found_deleted = True @@ -357,6 +358,7 @@ def _get(self, key: bytes) -> Optional[bytes]: bt_key = self._get_bigtable_key(key, partition=partition) value = self._bigtable_get(bt_key) if value is not None: + self.log.info(f"Found value for key in table {key=} {value=}") self._cache.set_partition(key, partition) return value return None @@ -530,7 +532,13 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - self._cache.submit_mutation(offset_key, str(offset).encode()) + row = self.bt_table.direct_row(bt_key) + row.set_cell( + COLUMN_FAMILY_ID, + COLUMN_NAME, + offset, + ) + row.commit() except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From 25251fb55b010090c74e5519fb124d308ebfd3e6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Jun 2023 13:22:24 +0200 Subject: [PATCH 447/616] fix offset --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ce319c6ee..4c6dc29f2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -532,7 +532,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - row = self.bt_table.direct_row(bt_key) + row = self.bt_table.direct_row(offset_key) row.set_cell( COLUMN_FAMILY_ID, COLUMN_NAME, From 25055f891ee9d7a4e44bed68811cd45a1db552fe Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 23 Jun 2023 13:36:37 +0200 Subject: [PATCH 448/616] fixed wrong offset value --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4c6dc29f2..119d7a8cc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -536,7 +536,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: row.set_cell( COLUMN_FAMILY_ID, COLUMN_NAME, - offset, + str(offset).encode(), ) row.commit() except Exception as e: From cfeb4d1390fb2c46e3d9396218f5d9e13d01a855 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 5 Jul 2023 09:28:33 +0200 Subject: [PATCH 449/616] Squashed commit of the following: commit b2a1b5c07ea9d4f8a81b8c2b2c0b877166762639 Author: Johannes Pesenhofer Date: Wed Jul 5 09:21:23 2023 +0200 use partitioner instead of event --- faust/stores/bigtable.py | 60 ++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 119d7a8cc..2edfbe59f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -299,7 +299,7 @@ def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): row.commit() # self._cache.submit_mutation(bt_key, value) - def _maybe_get_partition_from_message(self) -> Optional[int]: + def _maybe_get_partition_from_message_or_key(self, key) -> Optional[int]: event = current_event() if ( event is not None @@ -307,6 +307,8 @@ def _maybe_get_partition_from_message(self) -> Optional[int]: and not self.table.use_partitioner ): return event.message.partition + elif self.table.use_partitioner: + return self.table.partition_for_key(key) else: return None @@ -331,36 +333,28 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: - partitions = [] - partition = self._maybe_get_partition_from_message() - if partition is not None: - partitions = [partition] - else: - partitions = set(self._partitions_for_key(key)) + partition = self.table.partition_for_key(key) + bt_key = self._get_bigtable_key(key, partition=partition) - # First we search the cache found_deleted = False - for partition in partitions: - bt_key = self._get_bigtable_key(key, partition=partition) - if self._cache.contains(bt_key): - value = self._cache.get(bt_key) - if value is not None: - self._cache.set_partition(key, partition) - self.log.info(f"Found value for key in cache {key=} {value=}") - return value - else: - found_deleted = True - if found_deleted: - return None - - # Then we search the bigtable - for partition in partitions: - bt_key = self._get_bigtable_key(key, partition=partition) - value = self._bigtable_get(bt_key) + if self._cache.contains(bt_key): + value = self._cache.get(bt_key) if value is not None: - self.log.info(f"Found value for key in table {key=} {value=}") self._cache.set_partition(key, partition) + self.log.info( + f"Found value for key in cache {key=} {value=}" + ) return value + else: + found_deleted = True + if found_deleted: + return None + + value = self._bigtable_get(bt_key) + if value is not None: + self.log.info(f"Found value for key in table {key=} {value=}") + self._cache.set_partition(key, partition) + return value return None except Exception as ex: self.log.error( @@ -370,7 +364,7 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - partition = get_current_partition() + partition = self.table.partition_for_key(key) key_with_partition = self._get_bigtable_key( key, partition=partition ) @@ -386,12 +380,12 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: - for partition in self._partitions_for_key(key): - key_with_partition = self._get_bigtable_key( - key, partition=partition - ) - self._bigtable_mutate(key_with_partition, None) - self._cache._partition_cache.pop(key, None) + partition = self.table.partition_for_key(key) + key_with_partition = self._get_bigtable_key( + key, partition=partition + ) + self._bigtable_mutate(key_with_partition, None) + self._cache._partition_cache.pop(key, None) except Exception as ex: self.log.error( f"FaustBigtableException Error in delete for " From ebcae09a9236c161bd997949036688e5b8471ab3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 5 Jul 2023 12:44:57 +0200 Subject: [PATCH 450/616] removed partition assigned and rebalance logic --- faust/stores/bigtable.py | 140 ++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 76 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2edfbe59f..f60b5fab9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -411,30 +411,18 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: if len(partitions) == 0: return - self._cache.flush() - if self._cache._value_cache is not None: - # If there is a value cache, we can return the values - self._cache.fill(partitions) - for k, v in self._cache._value_cache.items(): - if v is None: - continue - faust_key = self._get_faust_key(k) - yield faust_key, v - else: - row_set = BT.RowSet() - for partition in partitions: - prefix_start = self._get_partition_prefix(partition) - prefix_end = self._get_partition_prefix(partition + 1) - row_set.add_row_range_from_keys(prefix_start, prefix_end) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): - faust_key = self._get_faust_key(row.row_key) - value = self.bigtable_exrtact_row_data(row) - self._cache.set(row.row_key, value) - yield faust_key, value - self._cache.filled_partitions.update(partitions) + row_set = BT.RowSet() + for partition in partitions: + prefix_start = self._get_partition_prefix(partition) + prefix_end = self._get_partition_prefix(partition + 1) + row_set.add_row_range_from_keys(prefix_start, prefix_end) + + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): + faust_key = self._get_faust_key(row.row_key) + value = self.bigtable_exrtact_row_data(row) + yield faust_key, value end = time.time() self.log.info(f"{self.table_name} _iteritems took {end - start}s") @@ -593,55 +581,55 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") - def revoke_partitions(self, tps: Set[TP]) -> None: - """De-assign partitions used on this worker instance. - - Arguments: - table: The table that we store data for. - tps: Set of topic partitions that we should no longer - be serving data for. - """ - partitions = {tp.partition for tp in tps} - for partition in partitions: - self._cache.delete_partition(partition) - - self.log.info( - f"Revoked partitions {partitions=} for table" f" {self.table_name}" - ) - gc.collect() - - def assign_partitions(self, tps: Set[TP]) -> None: - start = time.time() - - standby_tps = self.app.assignor.assigned_standbys() - my_topics = self.table.changelog_topic.topics - partitions = set() - for tp in tps: - if tp.topic in my_topics and tp not in standby_tps: - partitions.add(tp.partition) - self._cache.fill(partitions) - end = time.time() - self.log.info( - "Finished assign_partitions for table" - f" {self.table_name}:{partitions} in {end-start}s" - ) - - async def on_rebalance( - self, - assigned: Set[TP], - revoked: Set[TP], - newly_assigned: Set[TP], - generation_id: int = 0, - ) -> None: - """Rebalance occurred. - - Arguments: - assigned: Set of all assigned topic partitions. - revoked: Set of newly revoked topic partitions. - newly_assigned: Set of newly assigned topic partitions, - for which we were not assigned the last time. - generation_id: the metadata generation identifier for the re-balance - """ - async with self._db_lock: - self.revoke_partitions(revoked) - self.assign_partitions(assigned) + # def revoke_partitions(self, tps: Set[TP]) -> None: + # """De-assign partitions used on this worker instance. +# + # Arguments: + # table: The table that we store data for. + # tps: Set of topic partitions that we should no longer + # be serving data for. + # """ + # partitions = {tp.partition for tp in tps} + # for partition in partitions: + # self._cache.delete_partition(partition) +# + # self.log.info( + # f"Revoked partitions {partitions=} for table" f" {self.table_name}" + # ) + # gc.collect() +# + # def assign_partitions(self, tps: Set[TP]) -> None: + # start = time.time() +# + # standby_tps = self.app.assignor.assigned_standbys() + # my_topics = self.table.changelog_topic.topics + # partitions = set() + # for tp in tps: + # if tp.topic in my_topics and tp not in standby_tps: + # partitions.add(tp.partition) + # self._cache.fill(partitions) + # end = time.time() + # self.log.info( + # "Finished assign_partitions for table" + # f" {self.table_name}:{partitions} in {end-start}s" + # ) +# + # async def on_rebalance( + # self, + # assigned: Set[TP], + # revoked: Set[TP], + # newly_assigned: Set[TP], + # generation_id: int = 0, + # ) -> None: + # """Rebalance occurred. +# + # Arguments: + # assigned: Set of all assigned topic partitions. + # revoked: Set of newly revoked topic partitions. + # newly_assigned: Set of newly assigned topic partitions, + # for which we were not assigned the last time. + # generation_id: the metadata generation identifier for the re-balance + # """ + # async with self._db_lock: + # self.revoke_partitions(revoked) + # self.assign_partitions(assigned) From 7cf68ee5ba883bb9e4a34921049f66694ebfeb9d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 5 Jul 2023 14:08:15 +0200 Subject: [PATCH 451/616] removed partition prefix --- faust/stores/bigtable.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f60b5fab9..5d6edd6ee 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -318,12 +318,12 @@ def _get_partition_prefix(self, partition: int) -> bytes: def _get_faust_key(self, key: bytes) -> bytes: faust_key = key[1:] - return faust_key + return key def _get_bigtable_key(self, key: bytes, partition: int) -> bytes: prefix = self._get_partition_prefix(partition) bt_key = prefix + key - return bt_key + return key def _partitions_for_key(self, key: bytes) -> Iterable[int]: try: @@ -406,23 +406,12 @@ def _active_partitions(self) -> Iterator[int]: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: start = time.time() - partitions = set(self._active_partitions()) - - if len(partitions) == 0: - return - - row_set = BT.RowSet() - for partition in partitions: - prefix_start = self._get_partition_prefix(partition) - prefix_end = self._get_partition_prefix(partition + 1) - row_set.add_row_range_from_keys(prefix_start, prefix_end) - - for row in self.bt_table.read_rows( - row_set=row_set, filter_=self.row_filter - ): + for row in self.bt_table.read_rows(filter_=self.row_filter): faust_key = self._get_faust_key(row.row_key) value = self.bigtable_exrtact_row_data(row) - yield faust_key, value + if self.offset_key_prefix in row.row_key: + continue + yield row.row_key, value end = time.time() self.log.info(f"{self.table_name} _iteritems took {end - start}s") From d34ca0eeccd526bd6c413ba8cab11daa331ebb6d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 7 Jul 2023 12:00:13 +0200 Subject: [PATCH 452/616] remove partitioning --- faust/stores/bigtable.py | 97 ++++++++-------------------------------- 1 file changed, 19 insertions(+), 78 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5d6edd6ee..38a5d3e42 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -203,7 +203,6 @@ class BigTableStore(base.SerializedStore): instance: BT.Instance bt_table: BT.Table _cache: BigTableCache - _db_lock: asyncio.Lock BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_INSTANCE_KEY = "bt_instance_key" @@ -222,7 +221,11 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) - self._log_counter = 0 + if table.use_partitioner is False: + raise ValueError( + "BigTableStore requires a partitioner to be set on the table" + ) + try: self._bigtable_setup(table, options) self._cache = BigTableCache(app, options, self.bt_table) @@ -230,7 +233,6 @@ def __init__( logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) - self._db_lock = asyncio.Lock() @staticmethod def default_translator(user_key): @@ -333,27 +335,19 @@ def _partitions_for_key(self, key: bytes) -> Iterable[int]: def _get(self, key: bytes) -> Optional[bytes]: try: - partition = self.table.partition_for_key(key) - bt_key = self._get_bigtable_key(key, partition=partition) - found_deleted = False - if self._cache.contains(bt_key): - value = self._cache.get(bt_key) + if self._cache.contains(key): + value = self._cache.get(key) if value is not None: - self._cache.set_partition(key, partition) - self.log.info( - f"Found value for key in cache {key=} {value=}" - ) return value else: found_deleted = True if found_deleted: return None - value = self._bigtable_get(bt_key) + value = self._bigtable_get(key) if value is not None: self.log.info(f"Found value for key in table {key=} {value=}") - self._cache.set_partition(key, partition) return value return None except Exception as ex: @@ -364,12 +358,7 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - partition = self.table.partition_for_key(key) - key_with_partition = self._get_bigtable_key( - key, partition=partition - ) - self._bigtable_mutate(key_with_partition, value) - self._cache.set_partition(key, partition) + self._bigtable_mutate(key, value) except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -379,42 +368,18 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: raise ex def _del(self, key: bytes) -> None: - try: - partition = self.table.partition_for_key(key) - key_with_partition = self._get_bigtable_key( - key, partition=partition - ) - self._bigtable_mutate(key_with_partition, None) - self._cache._partition_cache.pop(key, None) - except Exception as ex: - self.log.error( - f"FaustBigtableException Error in delete for " - f"table {self.table_name} exception {ex} key {key}" - ) - raise ex - - def _active_partitions(self) -> Iterator[int]: - actives = self.app.assignor.assigned_actives() - topic = self.table.changelog_topic_name - for partition in range(self.app.conf.topic_partitions): - tp = TP(topic=topic, partition=partition) - # for global tables, keys from all - # partitions are available. - if tp in actives or self.table.is_global: - yield partition + self._set(key, None) def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: start = time.time() for row in self.bt_table.read_rows(filter_=self.row_filter): - faust_key = self._get_faust_key(row.row_key) value = self.bigtable_exrtact_row_data(row) if self.offset_key_prefix in row.row_key: continue yield row.row_key, value end = time.time() self.log.info(f"{self.table_name} _iteritems took {end - start}s") - except Exception as ex: self.log.error( f"FaustBigtableException Error " @@ -424,27 +389,12 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: raise ex def _iterkeys(self) -> Iterator[bytes]: - try: - for row in self._iteritems(): - yield row[0] - except Exception as ex: - self.log.error( - f"FaustBigtableException Error in _iterkeys " - f"for table {self.table_name} exception {ex}" - ) - raise ex + for row in self._iteritems(): + yield row[0] def _itervalues(self) -> Iterator[bytes]: - try: - for row in self._iteritems(): - yield row[1] - except Exception as ex: - self.log.error( - f"FaustBigtableException Error " - f"in _itervalues for table {self.table_name}" - f" exception {ex}" - ) - raise ex + for row in self._iteritems(): + yield row[1] def _size(self) -> int: """Always returns 0 for Bigtable.""" @@ -489,8 +439,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() - self._cache.flush() - offset = self._bigtable_get(offset_key) + offset = self._get(offset_key) return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: @@ -503,13 +452,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - row = self.bt_table.direct_row(offset_key) - row.set_cell( - COLUMN_FAMILY_ID, - COLUMN_NAME, - str(offset).encode(), - ) - row.commit() + self._set(offset_key, str(offset).encode()) except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" @@ -539,8 +482,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - bt_key = self._get_bigtable_key(msg.key, partition=tp.partition) - self._bigtable_mutate(bt_key, msg.value) + self._bigtable_mutate(msg.key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) @@ -619,6 +561,5 @@ def restore_backup( # for which we were not assigned the last time. # generation_id: the metadata generation identifier for the re-balance # """ - # async with self._db_lock: - # self.revoke_partitions(revoked) - # self.assign_partitions(assigned) + # self.revoke_partitions(revoked) + # self.assign_partitions(assigned) From e2b5560b94951a2809f481b3c7b8775b0c989c9d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 7 Jul 2023 12:14:38 +0200 Subject: [PATCH 453/616] adjusted batch read for changelogs --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 38a5d3e42..3e6d89d31 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -482,7 +482,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - self._bigtable_mutate(msg.key, msg.value) + self._set(msg.key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From d258ac7164e7da1d060615315bdaf1d527263d1e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 7 Jul 2023 12:16:14 +0200 Subject: [PATCH 454/616] used delete in changelog recovery --- faust/stores/bigtable.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3e6d89d31..b377c9024 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -482,11 +482,13 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - self._set(msg.key, msg.value) + if msg.value is None: + self._del(msg.key) + else: + self._set(msg.key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) - self._cache.flush() async def backup_partition( self, From 140bc6a6dfa411b201b6524a45b56939af942136 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 7 Jul 2023 13:33:50 +0200 Subject: [PATCH 455/616] fixed wrong offset key for comparison --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b377c9024..a93afa363 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -373,9 +373,10 @@ def _del(self, key: bytes) -> None: def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: start = time.time() + offset_key = self.offset_key_prefix.encode() for row in self.bt_table.read_rows(filter_=self.row_filter): value = self.bigtable_exrtact_row_data(row) - if self.offset_key_prefix in row.row_key: + if offset_key in row.row_key: continue yield row.row_key, value end = time.time() From c16e892b83c1f5aa760a204e76bfc461a5edfa05 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 7 Jul 2023 14:16:28 +0200 Subject: [PATCH 456/616] dont cache all mutations --- faust/stores/bigtable.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a93afa363..78f21c46c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -286,9 +286,6 @@ def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: return self.bigtable_exrtact_row_data(res) def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): - # Update the value cache if any exists - self._cache.set(bt_key, value) - # Update the bigtable. Mutations are batched row = self.bt_table.direct_row(bt_key) if value is None: row.delete() From 10f318e088eb939ac42a65961c4b78d28496b2f0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 7 Jul 2023 14:58:48 +0200 Subject: [PATCH 457/616] remove cache and rebalance functions --- faust/stores/bigtable.py | 209 --------------------------------------- 1 file changed, 209 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 78f21c46c..97b63d2ef 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -57,145 +57,6 @@ def get_current_partition(): COLUMN_NAME = "DATA" -class BigTableCache: - _partition_cache: LRUCache[bytes, int] - _value_cache: Optional[Dict] - _mutation_values: Dict[bytes, Optional[bytes]] - _mutation_rows: Dict[bytes, BT.DirectRow] - - def __init__(self, app, options: Dict, bt_table: BT.Table) -> None: - self.log = logging.getLogger(__name__) - self.bt_table: BT.Table = bt_table - self._partition_cache = LRUCache(limit=app.conf.table_key_index_size) - self._init_value_cache(options) - self.filled_partitions = set() - - self._flush_freq = options.get( - BigTableStore.BT_MUTATION_FLUSH_FREQ_SECONDS_KEY, 5 * 60 - ) - self._last_flush = time.time() - self._mutation_values = {} - self._mutation_rows = {} - self.total_mutation_count = 0 - - def _get_preload_rowset(self, partitions: Set[int]): - row_set = BT.RowSet() - row_filter = CellsColumnLimitFilter(1) - for partition in partitions: - preload_id = partition.to_bytes(1, "little") - row_set.add_row_range_from_keys( - start_key=preload_id, end_key=preload_id + b"\xff" - ) - return row_set, row_filter - - def submit_mutation(self, bt_key: bytes, value: Optional[bytes]) -> None: - row = self._mutation_rows.get(bt_key, None) - if row is None: - row = self.bt_table.direct_row(bt_key) - self.total_mutation_count += 1 - - if value is None: - row.delete() - else: - row.set_cell( - COLUMN_FAMILY_ID, - COLUMN_NAME, - value, - ) - self._mutation_values[bt_key] = value - self._mutation_rows[bt_key] = row - self.flush_mutations_if_timer_over_or_full() - - def flush(self): - if self.total_mutation_count > 0: - self.log.info(f"Flushing {self.total_mutation_count} mutations") - # Order is important here, we don't want to repeat mutations - self._last_flush = time.time() - mutation_list = list(self._mutation_rows.values()) - try: - self.bt_table.mutate_rows(mutation_list) - except Exception as e: - self.log.warning( - f"BigTableStore: flush failed with {e} " - "will try again on next flush. " - "No data is lost." - ) - return - - self.total_mutation_count = 0 - self._mutation_values.clear() - self._mutation_rows.clear() - - def flush_mutations_if_timer_over_or_full(self) -> None: - if ( - self._last_flush + self._flush_freq < time.time() - # Google allows a maximum of 100_000 mutattions - or self.total_mutation_count > 99_000 - ): - self.flush() - - def fill(self, partitions: Set[int]): - partitions = partitions - self.filled_partitions - if len(partitions) == 0: - return - - if self._value_cache is not None: - try: - row_set, row_filter = self._get_preload_rowset(partitions) - for row in self.bt_table.read_rows( - row_set=row_set, filter_=row_filter - ): - value = BigTableStore.bigtable_exrtact_row_data(row) - self._value_cache[row.row_key] = value - except Exception as e: - self.log.info(f"BigTableStore fill failed for {partitions=}") - raise e - self.filled_partitions.update(partitions) - - def get(self, bt_key: bytes) -> Optional[bytes]: - if self._mutation_rows.get(bt_key, None) is not None: - return self._mutation_values[bt_key] - - if self._value_cache is not None: - return self._value_cache[bt_key] - - def set(self, bt_key: bytes, value: Optional[bytes]) -> None: - if self._value_cache is not None: - self._value_cache[bt_key] = value - - def get_partition(self, user_key: bytes) -> int: - return self._partition_cache[user_key] - - def set_partition(self, user_key: bytes, partition: int): - self._partition_cache[user_key] = partition - - def contains(self, bt_key: bytes) -> Optional[bool]: - """ - If we return None here, this means, that no assumption - about the current key can be made. - """ - if self._mutation_rows.get(bt_key, None) is not None: - return True - elif self._value_cache is not None: - return bt_key in self._value_cache.keys() - return False - - def delete_partition(self, partition: int): - self.flush() - if self._value_cache is not None: - keys = set(self._value_cache.keys()) - for k in keys: - if k[0] == partition: - self._value_cache.pop(k, None) - self._partition_cache.pop(k[1:], None) - - def _init_value_cache(self, options): - if options.get(BigTableStore.BT_VALUE_CACHE_ENABLE_KEY, False): - self._value_cache = {} - else: - self._value_cache = None - - class BigTableStore(base.SerializedStore): """Bigtable table storage.""" @@ -228,7 +89,6 @@ def __init__( try: self._bigtable_setup(table, options) - self._cache = BigTableCache(app, options, self.bt_table) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -296,7 +156,6 @@ def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): value, ) row.commit() - # self._cache.submit_mutation(bt_key, value) def _maybe_get_partition_from_message_or_key(self, key) -> Optional[int]: event = current_event() @@ -324,24 +183,8 @@ def _get_bigtable_key(self, key: bytes, partition: int) -> bytes: bt_key = prefix + key return key - def _partitions_for_key(self, key: bytes) -> Iterable[int]: - try: - return [self._cache.get_partition(key)] - except KeyError: - return self._active_partitions() - def _get(self, key: bytes) -> Optional[bytes]: try: - found_deleted = False - if self._cache.contains(key): - value = self._cache.get(key) - if value is not None: - return value - else: - found_deleted = True - if found_deleted: - return None - value = self._bigtable_get(key) if value is not None: self.log.info(f"Found value for key in table {key=} {value=}") @@ -511,55 +354,3 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") - - # def revoke_partitions(self, tps: Set[TP]) -> None: - # """De-assign partitions used on this worker instance. -# - # Arguments: - # table: The table that we store data for. - # tps: Set of topic partitions that we should no longer - # be serving data for. - # """ - # partitions = {tp.partition for tp in tps} - # for partition in partitions: - # self._cache.delete_partition(partition) -# - # self.log.info( - # f"Revoked partitions {partitions=} for table" f" {self.table_name}" - # ) - # gc.collect() -# - # def assign_partitions(self, tps: Set[TP]) -> None: - # start = time.time() -# - # standby_tps = self.app.assignor.assigned_standbys() - # my_topics = self.table.changelog_topic.topics - # partitions = set() - # for tp in tps: - # if tp.topic in my_topics and tp not in standby_tps: - # partitions.add(tp.partition) - # self._cache.fill(partitions) - # end = time.time() - # self.log.info( - # "Finished assign_partitions for table" - # f" {self.table_name}:{partitions} in {end-start}s" - # ) -# - # async def on_rebalance( - # self, - # assigned: Set[TP], - # revoked: Set[TP], - # newly_assigned: Set[TP], - # generation_id: int = 0, - # ) -> None: - # """Rebalance occurred. -# - # Arguments: - # assigned: Set of all assigned topic partitions. - # revoked: Set of newly revoked topic partitions. - # newly_assigned: Set of newly assigned topic partitions, - # for which we were not assigned the last time. - # generation_id: the metadata generation identifier for the re-balance - # """ - # self.revoke_partitions(revoked) - # self.assign_partitions(assigned) From b953e28290634a6022697017340b4fa9aa8d5285 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 7 Jul 2023 15:10:22 +0200 Subject: [PATCH 458/616] added cache --- faust/stores/bigtable.py | 44 ++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 97b63d2ef..95c651ead 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -63,7 +63,7 @@ class BigTableStore(base.SerializedStore): client: BT.Client instance: BT.Instance bt_table: BT.Table - _cache: BigTableCache + _cache: Optional[LRUCache] BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_INSTANCE_KEY = "bt_instance_key" @@ -87,6 +87,10 @@ def __init__( "BigTableStore requires a partitioner to be set on the table" ) + if options.get(BigTableStore.BT_VALUE_CACHE_ENABLE_KEY, True): + self._cache = LRUCache(limit=10000) + else: + self._cache = None try: self._bigtable_setup(table, options) except Exception as ex: @@ -157,34 +161,12 @@ def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): ) row.commit() - def _maybe_get_partition_from_message_or_key(self, key) -> Optional[int]: - event = current_event() - if ( - event is not None - and not self.table.is_global - and not self.table.use_partitioner - ): - return event.message.partition - elif self.table.use_partitioner: - return self.table.partition_for_key(key) - else: - return None - - def _get_partition_prefix(self, partition: int) -> bytes: - partition_bytes = partition.to_bytes(1, "little") - return b"".join([partition_bytes]) - - def _get_faust_key(self, key: bytes) -> bytes: - faust_key = key[1:] - return key - - def _get_bigtable_key(self, key: bytes, partition: int) -> bytes: - prefix = self._get_partition_prefix(partition) - bt_key = prefix + key - return key - def _get(self, key: bytes) -> Optional[bytes]: try: + if self._cache is not None: + if key in self._cache: + return self._cache.get(key) + value = self._bigtable_get(key) if value is not None: self.log.info(f"Found value for key in table {key=} {value=}") @@ -198,6 +180,9 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: + if self._cache is not None: + self._cache[key] = value + self._bigtable_mutate(key, value) except Exception as ex: self.log.error( @@ -245,6 +230,11 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True + + # Check cache + if self._cache is not None and key in self._cache: + return self._cache[key] is not None + return self._get(key) is not None except Exception as ex: From 703309fee2f8fd28c1d1b6eaab0a88aee249abff Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 7 Jul 2023 15:22:38 +0200 Subject: [PATCH 459/616] add value to cache for read item --- faust/stores/bigtable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 95c651ead..8263a64a0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -168,6 +168,10 @@ def _get(self, key: bytes) -> Optional[bytes]: return self._cache.get(key) value = self._bigtable_get(key) + + if self._cache is not None: + self._cache[key] = value + if value is not None: self.log.info(f"Found value for key in table {key=} {value=}") return value From 1c08130bfc7c44a0af0b0ea91bec714e69549632 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 09:39:19 +0200 Subject: [PATCH 460/616] added mutation buffer again --- faust/stores/bigtable.py | 73 ++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8263a64a0..e938ada02 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -4,17 +4,7 @@ import logging import time import traceback -from typing import ( - Any, - Callable, - Dict, - Iterable, - Iterator, - Optional, - Set, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union try: # pragma: no cover from google.cloud.bigtable import column_family @@ -71,7 +61,7 @@ class BigTableStore(base.SerializedStore): BT_PROJECT_KEY = "bt_project_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" BT_VALUE_CACHE_ENABLE_KEY = "bt_value_cache_enable_key" - BT_MUTATION_FLUSH_FREQ_SECONDS_KEY = "bt_mutation_flush_freq_seconds_key" + BT_MAX_MUTATIONS_PER_FLUSH_KEY = "bt_max_mutations_per_flush_key" def __init__( self, @@ -86,11 +76,6 @@ def __init__( raise ValueError( "BigTableStore requires a partitioner to be set on the table" ) - - if options.get(BigTableStore.BT_VALUE_CACHE_ENABLE_KEY, True): - self._cache = LRUCache(limit=10000) - else: - self._cache = None try: self._bigtable_setup(table, options) except Exception as ex: @@ -112,6 +97,23 @@ def _set_options(self, options) -> None: BigTableStore.BT_OFFSET_KEY_PREFIX, "==>offset_for_partition_" ) + if options.get(BigTableStore.BT_VALUE_CACHE_ENABLE_KEY, True): + # TODO - make this configurable + self._cache = LRUCache(limit=10_000) + else: + self._cache = None + + self._mutation_buffer_size = options.get( + BigTableStore.BT_MAX_MUTATIONS_PER_FLUSH_KEY, 0 + ) + + if self._mutation_buffer_size <= 0: + self._mutation_buffer = None + else: + self._mutation_buffer = {} + self._num_mutations = 0 + + def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) self.client: BT.Client = BT.Client( @@ -143,14 +145,21 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value - def _bigtable_get(self, bt_key: bytes) -> Optional[bytes]: - res = self.bt_table.read_row(bt_key, filter_=self.row_filter) + def _bigtable_get(self, key: bytes) -> Optional[bytes]: + if self._mutation_buffer is not None: + if key in self._mutation_buffer: + return self._mutation_buffer[key][1] + + res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: return None return self.bigtable_exrtact_row_data(res) - def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): - row = self.bt_table.direct_row(bt_key) + def _bigtable_mutate( + self, key: bytes, value: Optional[bytes] + ): + row = self.bt_table.direct_row(key) + if value is None: row.delete() else: @@ -159,7 +168,12 @@ def _bigtable_mutate(self, bt_key: bytes, value: Optional[bytes]): COLUMN_NAME, value, ) - row.commit() + + if self._mutation_buffer is not None: + self._mutation_buffer[key] = (row, value) + self._num_mutations += 1 + else: + row.commit() def _get(self, key: bytes) -> Optional[bytes]: try: @@ -288,6 +302,21 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: try: offset_key = self.get_offset_key(tp).encode() self._set(offset_key, str(offset).encode()) + + if ( + self._mutation_buffer is not None + and self._num_mutations > self._mutation_buffer_size + ): + mutations = [r[0] for r in self._mutation_buffer.values()] + response = self.bt_table.mutate_rows(mutations) + + for i, status in enumerate(response): + if status.code != 0: + raise Exception(f"Failed to commit mutation number {i}") + else: + self._mutation_buffer.pop(mutations[i].row_key, None) + self._num_mutations -= 1 + except Exception as e: self.log.error( f"Failed to commit offset for {self.table.name}" From 6829b53857f16bffc1415472817c98769615943e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 10:06:42 +0200 Subject: [PATCH 461/616] take mutation with iteritems --- faust/stores/bigtable.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e938ada02..b45782971 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -218,6 +218,17 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: start = time.time() offset_key = self.offset_key_prefix.encode() for row in self.bt_table.read_rows(filter_=self.row_filter): + + if self._mutation_buffer is not None: + mutation_row, mutation_val = self._mutation_buffer.get( + row.row_key, (None, None) + ) + if mutation_val is not None: + yield row.row_key, mutation_val + + if mutation_row is not None: + continue + value = self.bigtable_exrtact_row_data(row) if offset_key in row.row_key: continue From 0e1763ad0b37c06cb5583d0911fc8f06c710f4b2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 10:07:45 +0200 Subject: [PATCH 462/616] continue on beginning if offset key --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b45782971..729f208b3 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -218,6 +218,8 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: start = time.time() offset_key = self.offset_key_prefix.encode() for row in self.bt_table.read_rows(filter_=self.row_filter): + if offset_key in row.row_key: + continue if self._mutation_buffer is not None: mutation_row, mutation_val = self._mutation_buffer.get( @@ -230,8 +232,6 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: continue value = self.bigtable_exrtact_row_data(row) - if offset_key in row.row_key: - continue yield row.row_key, value end = time.time() self.log.info(f"{self.table_name} _iteritems took {end - start}s") From 8e3e4be56a70a90af8fc1768983f5aa0df870ce8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 10:24:14 +0200 Subject: [PATCH 463/616] HARDCODED MUTATION BUFFER SETTINGS --- faust/stores/bigtable.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 729f208b3..3c3b37ceb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -103,15 +103,9 @@ def _set_options(self, options) -> None: else: self._cache = None - self._mutation_buffer_size = options.get( - BigTableStore.BT_MAX_MUTATIONS_PER_FLUSH_KEY, 0 - ) - - if self._mutation_buffer_size <= 0: - self._mutation_buffer = None - else: - self._mutation_buffer = {} - self._num_mutations = 0 + self._mutation_buffer_size = 10_000 + self._mutation_buffer = {} + self._num_mutations = 0 def _bigtable_setup(self, table, options: Dict[str, Any]): From dc676f56887558d08a8707f46bdae8e198dfd293 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 10:30:43 +0200 Subject: [PATCH 464/616] added log to check if mutations are working --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3c3b37ceb..cdfaba2b7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -321,6 +321,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: else: self._mutation_buffer.pop(mutations[i].row_key, None) self._num_mutations -= 1 + self.log.info(f"Committed mutations to BigTableStore for table {self.name}") except Exception as e: self.log.error( From 228f42120934af893cece0859faf7aa80c68c8e2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 10:52:30 +0200 Subject: [PATCH 465/616] refactored if --- faust/stores/bigtable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index cdfaba2b7..07a819e81 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -140,9 +140,8 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes) -> Optional[bytes]: - if self._mutation_buffer is not None: - if key in self._mutation_buffer: - return self._mutation_buffer[key][1] + if self._mutation_buffer is not None and key in self._mutation_buffer: + return self._mutation_buffer[key][1] res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: From 8e5016aecde20c67345b7b949a9e2d7f0495894c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 11:46:28 +0200 Subject: [PATCH 466/616] more access to mutation buffer --- faust/stores/bigtable.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 07a819e81..2563b9500 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -140,8 +140,12 @@ def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _bigtable_get(self, key: bytes) -> Optional[bytes]: - if self._mutation_buffer is not None and key in self._mutation_buffer: - return self._mutation_buffer[key][1] + if self._mutation_buffer is not None: + mutation_row, mutation_val = self._mutation_buffer.get( + key, (None, None) + ) + if mutation_row is not None: + return mutation_val res = self.bt_table.read_row(key, filter_=self.row_filter) if res is None: @@ -151,7 +155,12 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: def _bigtable_mutate( self, key: bytes, value: Optional[bytes] ): - row = self.bt_table.direct_row(key) + row = None + if self._mutation_buffer is not None: + row = self._mutation_buffer.get(key, (None, None))[0] + + if row is None: + row = self.bt_table.direct_row(key) if value is None: row.delete() @@ -320,7 +329,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: else: self._mutation_buffer.pop(mutations[i].row_key, None) self._num_mutations -= 1 - self.log.info(f"Committed mutations to BigTableStore for table {self.name}") + self.log.info(f"Committed mutations to BigTableStore for table {self.table.name}") except Exception as e: self.log.error( From b8e69b5484d49d1abe7ca8584c8e215843e7400b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 11:47:23 +0200 Subject: [PATCH 467/616] formatted --- faust/stores/bigtable.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2563b9500..b57d37fa2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -4,7 +4,17 @@ import logging import time import traceback -from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + Optional, + Set, + Tuple, + Union, +) try: # pragma: no cover from google.cloud.bigtable import column_family @@ -107,7 +117,6 @@ def _set_options(self, options) -> None: self._mutation_buffer = {} self._num_mutations = 0 - def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) self.client: BT.Client = BT.Client( @@ -152,9 +161,7 @@ def _bigtable_get(self, key: bytes) -> Optional[bytes]: return None return self.bigtable_exrtact_row_data(res) - def _bigtable_mutate( - self, key: bytes, value: Optional[bytes] - ): + def _bigtable_mutate(self, key: bytes, value: Optional[bytes]): row = None if self._mutation_buffer is not None: row = self._mutation_buffer.get(key, (None, None))[0] @@ -325,11 +332,15 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: for i, status in enumerate(response): if status.code != 0: - raise Exception(f"Failed to commit mutation number {i}") + raise Exception( + f"Failed to commit mutation number {i}" + ) else: self._mutation_buffer.pop(mutations[i].row_key, None) self._num_mutations -= 1 - self.log.info(f"Committed mutations to BigTableStore for table {self.table.name}") + self.log.info( + f"Committed mutations to BigTableStore for table {self.table.name}" + ) except Exception as e: self.log.error( From 8728cd1c6db664e19a432df387c18a0a5898add9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 12:27:05 +0200 Subject: [PATCH 468/616] set to 0 manually and clear all mutations --- faust/stores/bigtable.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b57d37fa2..e68905fa6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -335,9 +335,8 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: raise Exception( f"Failed to commit mutation number {i}" ) - else: - self._mutation_buffer.pop(mutations[i].row_key, None) - self._num_mutations -= 1 + self._mutation_buffer = {} + self._num_mutations = 0 self.log.info( f"Committed mutations to BigTableStore for table {self.table.name}" ) From b38c2fdc47cbde057061dd8bb27cc21b1be69dcd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 10 Jul 2023 12:53:31 +0200 Subject: [PATCH 469/616] hardcoded cache --- faust/stores/bigtable.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e68905fa6..e2dc14b6a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -107,12 +107,8 @@ def _set_options(self, options) -> None: BigTableStore.BT_OFFSET_KEY_PREFIX, "==>offset_for_partition_" ) - if options.get(BigTableStore.BT_VALUE_CACHE_ENABLE_KEY, True): - # TODO - make this configurable - self._cache = LRUCache(limit=10_000) - else: - self._cache = None - + # TODO - make this a configurable option + self._cache = LRUCache(limit=10_000) self._mutation_buffer_size = 10_000 self._mutation_buffer = {} self._num_mutations = 0 From 97c73cdf0b71fcb4746244a7fe1d96ee1c24cb25 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 11 Jul 2023 08:20:20 +0200 Subject: [PATCH 470/616] copy mutation buffer --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e2dc14b6a..15925817f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -323,7 +323,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: self._mutation_buffer is not None and self._num_mutations > self._mutation_buffer_size ): - mutations = [r[0] for r in self._mutation_buffer.values()] + mutations = [r[0] for r in self._mutation_buffer.copy().values()] response = self.bt_table.mutate_rows(mutations) for i, status in enumerate(response): From d29728290b44364f90e58f41a57418bfc61d2984 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 11 Jul 2023 08:21:10 +0200 Subject: [PATCH 471/616] add everything to cache because we can --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 15925817f..c43999c2a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -237,6 +237,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: continue value = self.bigtable_exrtact_row_data(row) + self._cache[row.row_key] = value yield row.row_key, value end = time.time() self.log.info(f"{self.table_name} _iteritems took {end - start}s") From 10a52b23cfdda38814850e7b23dba30109754bdc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 12 Jul 2023 11:09:13 +0200 Subject: [PATCH 472/616] Squashed commit of the following: commit 9563aa0699b08854bc1ce3be064a4ee93bc8a729 Author: Johannes Pesenhofer Date: Wed Jul 12 10:58:04 2023 +0200 removed requirement for partitioner commit d16dff31295ed8460edd0fae2f0f1346263d930c Author: Johannes Pesenhofer Date: Wed Jul 12 10:48:38 2023 +0200 fixed wrong ordder in add and remove partition commit 31b271948d40a1542c41f819cdf7022aaa875765 Author: Johannes Pesenhofer Date: Wed Jul 12 10:34:34 2023 +0200 return correct key in iteritems commit 84bfd15ec320c736395ae7ca09ba477940b782fb Author: Johannes Pesenhofer Date: Wed Jul 12 08:27:55 2023 +0200 adjusted read in for changelogs and added not key transform flag to bigtable_del commit 2804a897906766af7322632e8d84717a780f7872 Author: Johannes Pesenhofer Date: Wed Jul 12 08:20:48 2023 +0200 used bigtable set and get without key transform for offsets commit bb052eb0a1e40fa35236d67c94ecb6964544f7f8 Author: Johannes Pesenhofer Date: Wed Jul 12 08:19:28 2023 +0200 added del function and commit 13071d423545286bb333b4d1219d759ac51d3d91 Author: Johannes Pesenhofer Date: Tue Jul 11 15:17:38 2023 +0200 adjusted iteritems. WIP for read and write offset commit a760fceb08a5018d601403d6b63ad2aa9259edde Author: Johannes Pesenhofer Date: Tue Jul 11 15:12:13 2023 +0200 adjusted mutate and get for partitions commit a9cb81a1ec8fb7d4c58a3fba553f5a073066e6fc Author: Johannes Pesenhofer Date: Tue Jul 11 14:48:49 2023 +0200 added first utility functions --- faust/stores/bigtable.py | 198 ++++++++++++++++++++++++++++++--------- 1 file changed, 156 insertions(+), 42 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c43999c2a..c54b163cb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -82,10 +82,6 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) - if table.use_partitioner is False: - raise ValueError( - "BigTableStore requires a partitioner to be set on the table" - ) try: self._bigtable_setup(table, options) except Exception as ex: @@ -140,24 +136,108 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): f"with {options=}" ) + def _add_partition_prefix_to_key( + self, key: bytes, partition: Optional[int] + ) -> bytes: + if partition is None: + return key + separator = b"_..._" + partition_bytes = str(partition).encode("utf-8") + return separator.join([partition_bytes, key]) + + def _remove_partition_prefix_from_bigtable_key(self, key: bytes) -> bytes: + separator = b"_..._" + key = key.rsplit(separator, 1)[-1] + return key + + def _get_partition_from_bigtable_key(self, key: bytes) -> int: + separator = b"_..._" + _, partition_bytes = key.rsplit(separator, 1) + return int(partition_bytes) + + def _active_partitions(self) -> Iterator[int]: + actives = self.app.assignor.assigned_actives() + topic = self.table.changelog_topic_name + for partition in range(self.app.conf.topic_partitions): + tp = TP(topic=topic, partition=partition) + # for global tables, keys from all + # partitions are available. + if tp in actives or self.table.is_global: + yield partition + + def _get_all_possible_partitions(self) -> Iterable[Optional[int]]: + if self.table.is_global or self.table.use_partitioner: + return [None] + return list(self._active_partitions()) + + def _get_current_partitions(self) -> Iterable[Optional[int]]: + if self.table.is_global or self.table.use_partitioner: + return [None] + event = current_event() + if event is not None: + partition = event.message.partition + return [partition] + return list(self._active_partitions()) + + def _get_possible_bt_keys(self, key: bytes) -> Iterable[bytes]: + partitions = self._get_current_partitions() + for partition in partitions: + yield self._add_partition_prefix_to_key(key, partition) + @staticmethod def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value - def _bigtable_get(self, key: bytes) -> Optional[bytes]: - if self._mutation_buffer is not None: - mutation_row, mutation_val = self._mutation_buffer.get( - key, (None, None) - ) - if mutation_row is not None: - return mutation_val + def _bigtable_get( + self, key: bytes, no_key_translation=False + ) -> Optional[bytes]: + keys = [key] if no_key_translation else self._get_possible_bt_keys(key) + for bt_key in keys: + if self._mutation_buffer is not None: + mutation_row, mutation_val = self._mutation_buffer.get( + bt_key, (None, None) + ) + if mutation_row is not None: + return mutation_val + + res = self.bt_table.read_row(bt_key, filter_=self.row_filter) + if res is None: + return None + return self.bigtable_exrtact_row_data(res) + + def _set_mutation( + self, key: bytes, row: DirectRow, value: Optional[bytes] + ): + self._mutation_buffer[key] = (row, value) + self._num_mutations += 1 + + def _bigtable_del(self, key: bytes, no_key_translation=False): + if no_key_translation: + keys = [key] + else: + partitions = self._get_all_possible_partitions() + keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] - res = self.bt_table.read_row(key, filter_=self.row_filter) - if res is None: - return None - return self.bigtable_exrtact_row_data(res) + for key in keys: + row = None + if self._mutation_buffer is not None: + row = self._mutation_buffer.get(key, (None, None))[0] + + if row is None: + row = self.bt_table.direct_row(key) - def _bigtable_mutate(self, key: bytes, value: Optional[bytes]): + row.delete() + if self._mutation_buffer is not None: + self._set_mutation(key, row, None) + else: + row.commit() + + def _bigtable_set( + self, key: bytes, value: bytes, no_key_translation=False + ): + keys = [key] if no_key_translation else self._get_possible_bt_keys(key) + assert len(keys) == 1 + key = keys[0] row = None if self._mutation_buffer is not None: row = self._mutation_buffer.get(key, (None, None))[0] @@ -165,18 +245,14 @@ def _bigtable_mutate(self, key: bytes, value: Optional[bytes]): if row is None: row = self.bt_table.direct_row(key) - if value is None: - row.delete() - else: - row.set_cell( - COLUMN_FAMILY_ID, - COLUMN_NAME, - value, - ) + row.set_cell( + COLUMN_FAMILY_ID, + COLUMN_NAME, + value, + ) if self._mutation_buffer is not None: - self._mutation_buffer[key] = (row, value) - self._num_mutations += 1 + self._set_mutation(key, row, value) else: row.commit() @@ -206,7 +282,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: if self._cache is not None: self._cache[key] = value - self._bigtable_mutate(key, value) + self._bigtable_set(key, value) except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -216,29 +292,58 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: raise ex def _del(self, key: bytes) -> None: - self._set(key, None) + try: + if self._cache is not None: + self._cache[key] = None + + self._bigtable_del(key) + except Exception as ex: + self.log.error( + f"FaustBigtableException Error in del for " + f"table {self.table_name} exception {ex} key {key=} " + f"Traceback: {traceback.format_exc()}" + ) + raise ex def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: try: start = time.time() - offset_key = self.offset_key_prefix.encode() - for row in self.bt_table.read_rows(filter_=self.row_filter): - if offset_key in row.row_key: - continue + active_partitions = list(self._active_partitions()) + + row_set = RowSet() + + if not (self.table.is_global or self.table.use_partitioner): + for partition in active_partitions: + row_set.add_row_range_from_keys( + start_key=self._add_partition_prefix_to_key( + b"", partition + ), + end_key=self._add_partition_prefix_to_key( + b"", partition + 1 + ), + ) + for row in self.bt_table.read_rows( + row_set=row_set, filter_=self.row_filter + ): if self._mutation_buffer is not None: + # Yield the mutation first if it exists mutation_row, mutation_val = self._mutation_buffer.get( row.row_key, (None, None) ) if mutation_val is not None: - yield row.row_key, mutation_val + key = self._remove_partition_prefix_from_bigtable_key(row.row_key) + yield key, mutation_val if mutation_row is not None: continue value = self.bigtable_exrtact_row_data(row) - self._cache[row.row_key] = value - yield row.row_key, value + key = self._remove_partition_prefix_from_bigtable_key(row.row_key) + if self._cache is not None: + self._cache[key] = value + yield key, value + end = time.time() self.log.info(f"{self.table_name} _iteritems took {end - start}s") except Exception as ex: @@ -305,7 +410,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() - offset = self._get(offset_key) + offset = self._bigtable_get(offset_key, no_key_translation=True) return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: @@ -318,13 +423,17 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - self._set(offset_key, str(offset).encode()) + self._bigtable_set( + offset_key, str(offset).encode(), no_key_translation=True + ) if ( self._mutation_buffer is not None and self._num_mutations > self._mutation_buffer_size ): - mutations = [r[0] for r in self._mutation_buffer.copy().values()] + mutations = [ + r[0] for r in self._mutation_buffer.copy().values() + ] response = self.bt_table.mutate_rows(mutations) for i, status in enumerate(response): @@ -333,10 +442,10 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: f"Failed to commit mutation number {i}" ) self._mutation_buffer = {} - self._num_mutations = 0 self.log.info( - f"Committed mutations to BigTableStore for table {self.table.name}" + f"Committed {self._num_mutations} mutations to BigTableStore for table {self.table.name}" ) + self._num_mutations = 0 except Exception as e: self.log.error( @@ -367,10 +476,15 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message + if not (self.table.is_global or self.table.use_partitioner): + key = self._add_partition_prefix_to_key(msg.key, tp.partition) + else: + key = msg.key + if msg.value is None: - self._del(msg.key) + self._bigtable_del(msg.key, no_key_translation=True) else: - self._set(msg.key, msg.value) + self._bigtable_set(msg.key, msg.value, no_key_translation=True) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From eeaa371b89fb3d09ef03dcc6692a91c708f47b36 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 12 Jul 2023 12:04:05 +0200 Subject: [PATCH 473/616] fixed rowset --- faust/stores/bigtable.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c54b163cb..151ebb989 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -311,16 +311,14 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: active_partitions = list(self._active_partitions()) row_set = RowSet() - if not (self.table.is_global or self.table.use_partitioner): for partition in active_partitions: + prefix = self._add_partition_prefix_to_key(b"", partition) + start_key = prefix + b"\x00" + end_key = prefix + b"\xff" + row_set.add_row_range_from_keys( - start_key=self._add_partition_prefix_to_key( - b"", partition - ), - end_key=self._add_partition_prefix_to_key( - b"", partition + 1 - ), + start_key=start_key, end_key=end_key ) for row in self.bt_table.read_rows( From 776eb6624ddf17da0862419e7f45eec1e208d5dc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 12 Jul 2023 12:25:17 +0200 Subject: [PATCH 474/616] fixed wrong iteration number --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 151ebb989..503720b91 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -235,7 +235,7 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): def _bigtable_set( self, key: bytes, value: bytes, no_key_translation=False ): - keys = [key] if no_key_translation else self._get_possible_bt_keys(key) + keys = [key] if no_key_translation else list(self._get_possible_bt_keys(key)) assert len(keys) == 1 key = keys[0] row = None From bbda9085b1e238ef25816b51bf12eecf037a4247 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 18 Jul 2023 08:59:41 +0200 Subject: [PATCH 475/616] fixed apply changelog in bigtable --- faust/stores/bigtable.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 503720b91..e0f76c5ff 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -260,6 +260,7 @@ def _get(self, key: bytes) -> Optional[bytes]: try: if self._cache is not None: if key in self._cache: + self.log.info(f"Found value for key in cache {key=} {value=}") return self._cache.get(key) value = self._bigtable_get(key) @@ -480,9 +481,9 @@ def apply_changelog_batch( key = msg.key if msg.value is None: - self._bigtable_del(msg.key, no_key_translation=True) + self._bigtable_del(msg.key) else: - self._bigtable_set(msg.key, msg.value, no_key_translation=True) + self._bigtable_set(msg.key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From afff96d619a231fa309ed31da49765e33ded6aa0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 13:25:50 +0200 Subject: [PATCH 476/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20bigtable=20delet?= =?UTF-8?q?e=20testcase?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 160 +---------------------------- 1 file changed, 4 insertions(+), 156 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 82d898f95..45643eeed 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -2,11 +2,9 @@ from unittest.mock import MagicMock, call, patch import pytest -from mode.utils.collections import LRUCache import faust from faust.stores.bigtable import ( - BigTableCache, BigTableStore, ) from faust.types.tuples import TP @@ -116,157 +114,6 @@ def add_test_data(self, keys): self.data[k] = k -class TestBigTableCache: - def test_default__init__(self): - bigtable_mock = BigTableMock() - app_mock = MagicMock() - app_mock.conf = MagicMock() - app_mock.conf.table_key_index_size = 123 - time.time = MagicMock(return_value=0) - - test_manager = BigTableCache(MagicMock(), {}, bigtable_mock) - assert test_manager.bt_table == bigtable_mock - assert test_manager._value_cache is None - - def test_iscomplete__init__(self): - bigtable_mock = BigTableMock() - app_mock = MagicMock() - app_mock.conf = MagicMock() - app_mock.conf.table_key_index_size = 2 - time.time = MagicMock(return_value=0) - options = { - BigTableStore.BT_VALUE_CACHE_ENABLE_KEY: True, - } - - test_manager = BigTableCache( - MagicMock(), options, bigtable_mock - ) - assert test_manager.bt_table == bigtable_mock - assert isinstance(test_manager._value_cache, dict) - - @pytest.fixture() - def bt_imports(self): - with patch("faust.stores.bigtable.BT") as bt: - bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - bt.column_family.MaxVersionsGCRule = MagicMock( - return_value="a_rule" - ) - bt.RowSet = MagicMock(return_value=RowSetMock()) - yield bt - - @pytest.fixture() - def manager(self, bt_imports): - with patch("faust.stores.bigtable.BT", bt_imports): - with patch( - "faust.stores.bigtable.time.time", MagicMock(return_value=0) - ): - bigtable_mock = BigTableMock() - app_mock = MagicMock() - app_mock.conf = MagicMock() - app_mock.conf.table_key_index_size = 123 - - options = { - BigTableStore.BT_VALUE_CACHE_ENABLE_KEY: True, - } - manager = BigTableCache( - MagicMock(), options, bigtable_mock - ) - manager._partition_cache = {} - return manager - - def test_fill(self, manager): - key = b"\x13AAA" - manager.bt_table.add_test_data({key}) - # Scenario 1: Everything empty - manager.fill({19}) - assert manager.bt_table.read_rows.call_count == 1 - assert manager.filled_partitions == {19} - - manager.fill({19}) - assert manager.bt_table.read_rows.call_count == 1 - assert manager.filled_partitions == {19} - - manager.fill({16}) - assert manager.bt_table.read_rows.call_count == 2 - assert manager.filled_partitions == {19, 16} - assert manager.contains(key) - - def test_get(self, manager): - key_in = b"\x13AAA" - key_not_in = b"\x13BBB" - - manager.bt_table.add_test_data({key_in}) - with pytest.raises(KeyError): - manager.get(key_in) - - manager.fill({19}) - res = manager.get(key_in) - assert res == key_in - - with pytest.raises(KeyError): - manager.get(key_not_in) - - manager._value_cache = None - assert manager.get(key_in) is None - - def test_set(self, manager): - key_1 = b"\x13AAA" - key_2 = b"\x13ABB" - manager.set(key_1, key_1) - assert manager.contains(key_1) - assert manager.contains(key_2) is False - - manager.set(key_2, key_2) - assert manager.contains(key_1) - assert manager.contains(key_2) - assert manager.get(key_1) == key_1 - assert manager.get(key_2) == key_2 - - def test_partition_cache(self, manager): - key = b"aaa" - with pytest.raises(KeyError): - manager.get_partition(key) - manager.set_partition(key, 13) - assert manager.get_partition(key) == 13 - manager.set_partition(key, 15) - assert manager.get_partition(key) == 15 - - def test_contains(self, manager): - # Adding the key here is sufficient, because the cache gets filled - key_in = b"\x13AAA" - key_not_in = b"\x13BBB" - manager.bt_table.add_test_data({key_in}) - manager.fill({19}) - - assert manager.contains(key_in) is True - assert manager.contains(key_not_in) is False - - assert manager.contains(key_in) is True - assert manager.contains(key_not_in) is False - - manager._value_cache = None - assert manager.contains(key_in) is False - assert manager.contains(key_not_in) is False - - def test_delete_partition(self, manager): - partition = 19 - row_mock = MagicMock() - row_mock.delete = MagicMock() - row_mock.set_cell = MagicMock() - row_mock.row_key = b"\x13AAA" - manager.bt_table.direct_row = MagicMock(return_value=row_mock) - manager.bt_table.add_test_data({b"\x13AAA"}) - manager.set(row_mock.row_key, row_mock) - manager.set_partition(row_mock.row_key[1:], partition) - manager.delete_partition(3) - assert len(manager._value_cache) == 1 - assert len(manager._partition_cache) == 1 - manager.delete_partition(partition) - assert len(manager._value_cache) == 0 - assert len(manager._partition_cache) == 0 - # Delete something that does not exist yet should not do anything - manager.delete_partition(999999) - class TestBigTableStore: TEST_KEY1 = b"TEST_KEY1" @@ -409,10 +256,11 @@ def test_bigtable_delete(self, store): row_mock.commit = MagicMock() row_mock.delete = MagicMock() store.bt_table.direct_row = MagicMock(return_value=row_mock) - store._cache.set = MagicMock() + store._set_mutation = MagicMock() - store._bigtable_mutate(self.TEST_KEY1, None) - store._cache.set.assert_called_once_with(self.TEST_KEY1, None) + import pdb; pdb.set_trace() + store._bigtable_del(self.TEST_KEY1, no_key_translation=True) + store._set_mutation.assert_called_once_with(self.TEST_KEY1, row_mock, None) def test_bigtable_set(self, store): row_mock = MagicMock() From 39c7094312cc64743a2edd6ff9e382d66e945e73 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 13:26:18 +0200 Subject: [PATCH 477/616] removed pdb --- tests/unit/stores/test_bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 45643eeed..ef7414d25 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -258,7 +258,6 @@ def test_bigtable_delete(self, store): store.bt_table.direct_row = MagicMock(return_value=row_mock) store._set_mutation = MagicMock() - import pdb; pdb.set_trace() store._bigtable_del(self.TEST_KEY1, no_key_translation=True) store._set_mutation.assert_called_once_with(self.TEST_KEY1, row_mock, None) From 4a06e147703f1d52e1cfa2a61814e52a5dc9fc7c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 13:28:39 +0200 Subject: [PATCH 478/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20bigtable=20set?= =?UTF-8?q?=20test=20and=20formating?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 34 ++++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index ef7414d25..c84a7240a 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -114,7 +114,6 @@ def add_test_data(self, keys): self.data[k] = k - class TestBigTableStore: TEST_KEY1 = b"TEST_KEY1" TEST_KEY2 = b"TEST_KEY2" @@ -259,21 +258,26 @@ def test_bigtable_delete(self, store): store._set_mutation = MagicMock() store._bigtable_del(self.TEST_KEY1, no_key_translation=True) - store._set_mutation.assert_called_once_with(self.TEST_KEY1, row_mock, None) + store._set_mutation.assert_called_once_with( + self.TEST_KEY1, row_mock, None + ) def test_bigtable_set(self, store): row_mock = MagicMock() row_mock.set_cell = MagicMock() store.bt_table.direct_row = MagicMock(return_value=row_mock) - store._cache.set = MagicMock(return_value=None) - store._cache.submit_mutation = MagicMock(return_value=None) - store._bigtable_mutate(self.TEST_KEY1, self.TEST_KEY1) - store._bigtable_mutate(self.TEST_KEY1, self.TEST_KEY1) - - store._cache.set.assert_called_with(self.TEST_KEY1, self.TEST_KEY1) - store._cache.submit_mutation.assert_called_with(self.TEST_KEY1, self.TEST_KEY1) + store._set_mutation = MagicMock(return_value=None) + store._bigtable_set( + self.TEST_KEY1, self.TEST_KEY1, no_key_translation=True + ) + store._bigtable_set( + self.TEST_KEY1, self.TEST_KEY1, no_key_translation=True + ) + store._set_mutation.assert_called_with( + self.TEST_KEY1, row_mock, self.TEST_KEY1 + ) def test_maybe_get_partition_from_message(self, store): event_mock = MagicMock() @@ -374,9 +378,7 @@ def test_get_with_unknown_partition(self, store): keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 19)) # Scenario: Found - store._bigtable_get = MagicMock( - return_value=b"a_value" - ) + store._bigtable_get = MagicMock(return_value=b"a_value") res = store._get(self.TEST_KEY1) store._partitions_for_key.assert_called_once_with(self.TEST_KEY1) store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, 19) @@ -486,7 +488,9 @@ def test_get_offset_key(self, store): def test_set_persisted_offset(self, store): tp = TP("a_topic", 19) - store._cache.submit_mutation = MagicMock(wraps=store._cache.submit_mutation) + store._cache.submit_mutation = MagicMock( + wraps=store._cache.submit_mutation + ) store._cache.flush_if_timer_over = MagicMock(return_value=False) expected_offset_key = store.get_offset_key(tp).encode() store.set_persisted_offset(tp, 123) @@ -568,7 +572,9 @@ def assert_offset_persisted(offset): store.bt_table.direct_row = MagicMock(return_value=row_mock) store._bigtable_mutate = MagicMock(wraps=store._bigtable_mutate) - store._cache.submit_mutation = MagicMock(wraps=store._cache.submit_mutation) + store._cache.submit_mutation = MagicMock( + wraps=store._cache.submit_mutation + ) partition = 0 faust.stores.bigtable.get_current_partition = MagicMock( From 5a0edcf44ec3833465a101903e87332a21ad2786 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 13:30:05 +0200 Subject: [PATCH 479/616] =?UTF-8?q?=E2=9C=A8=20formating=20of=20bigtable?= =?UTF-8?q?=20store?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- faust/stores/bigtable.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e0f76c5ff..3ae1f51ea 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -216,7 +216,9 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): keys = [key] else: partitions = self._get_all_possible_partitions() - keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] + keys = [ + self._add_partition_prefix_to_key(key, p) for p in partitions + ] for key in keys: row = None @@ -235,7 +237,11 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): def _bigtable_set( self, key: bytes, value: bytes, no_key_translation=False ): - keys = [key] if no_key_translation else list(self._get_possible_bt_keys(key)) + keys = ( + [key] + if no_key_translation + else list(self._get_possible_bt_keys(key)) + ) assert len(keys) == 1 key = keys[0] row = None @@ -260,7 +266,9 @@ def _get(self, key: bytes) -> Optional[bytes]: try: if self._cache is not None: if key in self._cache: - self.log.info(f"Found value for key in cache {key=} {value=}") + self.log.info( + f"Found value for key in cache {key=} {value=}" + ) return self._cache.get(key) value = self._bigtable_get(key) @@ -331,14 +339,18 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: row.row_key, (None, None) ) if mutation_val is not None: - key = self._remove_partition_prefix_from_bigtable_key(row.row_key) + key = self._remove_partition_prefix_from_bigtable_key( + row.row_key + ) yield key, mutation_val if mutation_row is not None: continue value = self.bigtable_exrtact_row_data(row) - key = self._remove_partition_prefix_from_bigtable_key(row.row_key) + key = self._remove_partition_prefix_from_bigtable_key( + row.row_key + ) if self._cache is not None: self._cache[key] = value yield key, value From 30d51ace5d4b4e32372e8ec472de2443586edaf3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 14:07:04 +0200 Subject: [PATCH 480/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20testcase=20for?= =?UTF-8?q?=20get=20current=20partitions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index c84a7240a..7032fbfee 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -279,7 +279,7 @@ def test_bigtable_set(self, store): self.TEST_KEY1, row_mock, self.TEST_KEY1 ) - def test_maybe_get_partition_from_message(self, store): + def test_get_partition_from_message(self, store): event_mock = MagicMock() event_mock.message = MagicMock() event_mock.message.partition = 69 @@ -288,19 +288,23 @@ def test_maybe_get_partition_from_message(self, store): store.table.is_global = False store.table.use_partitioner = False with patch("faust.stores.bigtable.current_event", current_event_mock): - return_value = store._maybe_get_partition_from_message() - assert return_value == 69 + return_value = store._get_current_partitions() + assert return_value == [69] store.table.is_global = True with patch("faust.stores.bigtable.current_event", current_event_mock): - return_value = store._maybe_get_partition_from_message() - assert return_value is None + return_value = store._get_current_partitions() + assert return_value == [None] store.table.is_global = False current_event_mock = MagicMock(return_value=None) + + topic = store.table.changelog_topic_name + store.app.assignor.assigned_actives = MagicMock(return_value={TP(topic, 420)}) + store.app.conf.topic_partitions = 421 with patch("faust.stores.bigtable.current_event", current_event_mock): - return_value = store._maybe_get_partition_from_message() - assert return_value is None + return_value = store._get_current_partitions() + assert return_value == [420] def test_get_partition_prefix(self, store): partition = 0 From 0667b905e11de81eed74632cbe969af9fb2efaa4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 14:17:22 +0200 Subject: [PATCH 481/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20bug=20with=20wro?= =?UTF-8?q?ng=20removal=20of=20partition?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- faust/stores/bigtable.py | 6 +++--- tests/unit/stores/test_bigtable.py | 20 ++++++-------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3ae1f51ea..d28c96772 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -152,7 +152,7 @@ def _remove_partition_prefix_from_bigtable_key(self, key: bytes) -> bytes: def _get_partition_from_bigtable_key(self, key: bytes) -> int: separator = b"_..._" - _, partition_bytes = key.rsplit(separator, 1) + partition_bytes, _ = key.rsplit(separator, 1) return int(partition_bytes) def _active_partitions(self) -> Iterator[int]: @@ -493,9 +493,9 @@ def apply_changelog_batch( key = msg.key if msg.value is None: - self._bigtable_del(msg.key) + self._bigtable_del(key, no_key_translation=True) else: - self._bigtable_set(msg.key, msg.value) + self._bigtable_set(key, msg.value, no_key_translation=True) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 7032fbfee..2516053da 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -306,25 +306,17 @@ def test_get_partition_from_message(self, store): return_value = store._get_current_partitions() assert return_value == [420] - def test_get_partition_prefix(self, store): - partition = 0 - res = store._get_partition_prefix(partition) - assert res[0] == partition - - partition = 19 - res = store._get_partition_prefix(partition) - assert res[0] == partition - def test_get_faust_key(self, store): - key_with_partition = b"\x13THEACTUALKEY" - res = store._get_faust_key(key_with_partition) + key_with_partition = b"\x13_..._THEACTUALKEY" + res = store._remove_partition_prefix_from_bigtable_key(key_with_partition) assert res == b"THEACTUALKEY" def test_get_key_with_partition(self, store): partition = 19 - res = store._get_bigtable_key(self.TEST_KEY1, partition) - assert res[0] == partition - assert store._get_faust_key(res) == self.TEST_KEY1 + res = store._add_partition_prefix_to_key(self.TEST_KEY1, partition) + extracted_partition = store._get_partition_from_bigtable_key(res) + assert extracted_partition == partition + assert store._remove_partition_prefix_from_bigtable_key(res) == self.TEST_KEY1 def test_partitions_for_key(self, store): store._cache.get_partition = MagicMock(return_value=19) From 7607eac72fe3d3bc16a0fd0a7388fa6db822ac2b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 14:36:50 +0200 Subject: [PATCH 482/616] =?UTF-8?q?=F0=9F=A9=B9=20fix=20test=20get=20parti?= =?UTF-8?q?tions=20for=20current=20key?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 2516053da..4b066c8c7 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -319,16 +319,9 @@ def test_get_key_with_partition(self, store): assert store._remove_partition_prefix_from_bigtable_key(res) == self.TEST_KEY1 def test_partitions_for_key(self, store): - store._cache.get_partition = MagicMock(return_value=19) - res = store._partitions_for_key(self.TEST_KEY1) - store._cache.get_partition.assert_called_once_with(self.TEST_KEY1) - assert res == [19] - - store._cache.get_partition = MagicMock(side_effect=KeyError) - store._active_partitions = MagicMock(return_value=[1, 2, 3]) - res = store._partitions_for_key(self.TEST_KEY2) - store._cache.get_partition.assert_called_once_with(self.TEST_KEY2) - assert res == [1, 2, 3] + store._get_current_partitions = MagicMock(return_value=[19]) + res = list(store._get_possible_bt_keys(self.TEST_KEY1)) + assert res == [store._add_partition_prefix_to_key(self.TEST_KEY1, 19)] def test_get_keyerror(self, store): partition = 19 From 224b3a24e5f96adaef20d942e0239775b0b60541 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 14:47:24 +0200 Subject: [PATCH 483/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20get=20testcases?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 38 ++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 4b066c8c7..88c716b44 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -325,8 +325,8 @@ def test_partitions_for_key(self, store): def test_get_keyerror(self, store): partition = 19 - store._maybe_get_partition_from_message = MagicMock( - return_value=partition + store._get_current_partitions = MagicMock( + return_value=[partition] ) store._bigtable_get = MagicMock(return_value=None) with pytest.raises(KeyError): @@ -334,29 +334,43 @@ def test_get_keyerror(self, store): def test_get_with_known_partition(self, store): partition = 19 - store._maybe_get_partition_from_message = MagicMock( - return_value=partition + store._cache = None + store._get_current_partitions = MagicMock( + return_value=[partition] ) - store._cache.set_partition = MagicMock() # Scenario: Found store._bigtable_get = MagicMock(return_value=b"a_value") res = store._get(self.TEST_KEY1) - key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) - store._bigtable_get.assert_called_once_with(key_with_partition) - store._cache.set_partition.assert_called_once_with( + key_with_partition = store._add_partition_prefix_to_key( self.TEST_KEY1, partition ) + store._bigtable_get.assert_called_once_with(self.TEST_KEY1) assert res == b"a_value" - store._cache.set_partition.reset_mock() # Scenario: Not Found store._bigtable_get = MagicMock(return_value=None) res = store._get(self.TEST_KEY1) - key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) - store._bigtable_get.assert_called_once_with(key_with_partition) - store._cache.set_partition.assert_not_called() + store._bigtable_get.assert_called_once_with(self.TEST_KEY1) assert res is None + # Scenario: Cache hit on value + store._bigtable_get = MagicMock(return_value=None) + store._cache = {self.TEST_KEY1: b"a_value_from_cache"} + res = store._get(self.TEST_KEY1) + store._bigtable_get.assert_not_called() + res2 = store._get(self.TEST_KEY2) + assert res == b"a_value_from_cache" + store._bigtable_get.assert_called_once_with(self.TEST_KEY2) + assert store._cache[self.TEST_KEY2] is None + assert res2 is None + + # Scenario: Cache hit on None value + store._bigtable_get = MagicMock(return_value=None) + res = store._get(self.TEST_KEY2) + store._bigtable_get.assert_not_called() + assert res is None + + def test_get_with_unknown_partition(self, store): store._maybe_get_partition_from_message = MagicMock(return_value=None) store._partitions_for_key = MagicMock(return_value=[3, 19]) From 8b7af4c87be6d9a4772ac3348943b65534617031 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 14:47:43 +0200 Subject: [PATCH 484/616] =?UTF-8?q?=F0=9F=9A=AE=20delete=20partition=20stu?= =?UTF-8?q?ff=20from=20get=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 88c716b44..f74aafe1b 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -371,30 +371,6 @@ def test_get_with_known_partition(self, store): assert res is None - def test_get_with_unknown_partition(self, store): - store._maybe_get_partition_from_message = MagicMock(return_value=None) - store._partitions_for_key = MagicMock(return_value=[3, 19]) - store._cache.set_partition = MagicMock() - keys_searched = set() - keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 1)) - keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 3)) - keys_searched.add(store._get_bigtable_key(self.TEST_KEY1, 19)) - - # Scenario: Found - store._bigtable_get = MagicMock(return_value=b"a_value") - res = store._get(self.TEST_KEY1) - store._partitions_for_key.assert_called_once_with(self.TEST_KEY1) - store._cache.set_partition.assert_called_once_with(self.TEST_KEY1, 19) - assert res == b"a_value" - - store._cache.set_partition.reset_mock() - # Scenario: Not Found - store._bigtable_get = MagicMock(return_value=None) - res = store._get(self.TEST_KEY1) - assert store._bigtable_get.call_count == 2 - store._cache.set_partition.assert_not_called() - assert res is None - def test_set(self, store): partition = 19 faust.stores.bigtable.get_current_partition = MagicMock( From 3ebb53d35070ac1887bf0fed326117285663eb73 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 14:58:02 +0200 Subject: [PATCH 485/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20testcases=20for?= =?UTF-8?q?=20set=20and=20del?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 44 +++++++++++++++--------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index f74aafe1b..d753cdeaf 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -370,35 +370,35 @@ def test_get_with_known_partition(self, store): store._bigtable_get.assert_not_called() assert res is None - def test_set(self, store): - partition = 19 - faust.stores.bigtable.get_current_partition = MagicMock( - return_value=partition - ) - store._bigtable_mutate = MagicMock() - store._cache.set_partition = MagicMock() + # Scenario: No cache + store._cache = None + store._bigtable_set = MagicMock() store._set(self.TEST_KEY1, b"a_value") - key_with_partition = store._get_bigtable_key(self.TEST_KEY1, partition) - store._bigtable_mutate.assert_called_once_with( - key_with_partition, b"a_value" + store._bigtable_set.assert_called_once_with( + self.TEST_KEY1, b"a_value" ) - store._cache.set_partition.assert_called_once_with( - self.TEST_KEY1, partition + + # Scenario: Cache active + store._cache = {} + store._set(self.TEST_KEY1, b"b_value") + assert store._cache[self.TEST_KEY1] == b"b_value" + store._bigtable_set.assert_called_with( + self.TEST_KEY1, b"b_value" ) def test_del(self, store): - store._cache._partition_cache = {self.TEST_KEY1: 19} - store._partitions_for_key = MagicMock(return_value=[1, 3, 19]) - store._bigtable_mutate = MagicMock() + # Scenario: No cache + store._cache = None + store._bigtable_del = MagicMock() store._del(self.TEST_KEY1) - calls = [ - call(store._get_bigtable_key(self.TEST_KEY1, 1), None), - call(store._get_bigtable_key(self.TEST_KEY1, 3), None), - call(store._get_bigtable_key(self.TEST_KEY1, 19), None), - ] - store._bigtable_mutate.assert_has_calls(calls) - assert store._cache._partition_cache == {} + store._bigtable_del.assert_called_once_with(self.TEST_KEY1) + + # Scenario: Cache active + store._cache = {} + store._del(self.TEST_KEY1) + assert store._cache[self.TEST_KEY1] is None + store._bigtable_del.assert_called_with(self.TEST_KEY1) def test_active_partitions(self, store): active_topics = [ From ea1bb1da5161b4f29494ff104962d504f63b331f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 15:25:57 +0200 Subject: [PATCH 486/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20iteritems=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- faust/stores/bigtable.py | 14 +++--- tests/unit/stores/test_bigtable.py | 77 ++++++++++++++++++------------ 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d28c96772..beb5a15ad 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -266,9 +266,6 @@ def _get(self, key: bytes) -> Optional[bytes]: try: if self._cache is not None: if key in self._cache: - self.log.info( - f"Found value for key in cache {key=} {value=}" - ) return self._cache.get(key) value = self._bigtable_get(key) @@ -338,13 +335,18 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: mutation_row, mutation_val = self._mutation_buffer.get( row.row_key, (None, None) ) + key = self._remove_partition_prefix_from_bigtable_key( + row.row_key + ) if mutation_val is not None: - key = self._remove_partition_prefix_from_bigtable_key( - row.row_key - ) + if self._cache is not None: + self._cache[key] = mutation_val yield key, mutation_val + continue if mutation_row is not None: + if self._cache is not None: + self._cache[key] = None continue value = self.bigtable_exrtact_row_data(row) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index d753cdeaf..2d2cfb676 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -300,7 +300,9 @@ def test_get_partition_from_message(self, store): current_event_mock = MagicMock(return_value=None) topic = store.table.changelog_topic_name - store.app.assignor.assigned_actives = MagicMock(return_value={TP(topic, 420)}) + store.app.assignor.assigned_actives = MagicMock( + return_value={TP(topic, 420)} + ) store.app.conf.topic_partitions = 421 with patch("faust.stores.bigtable.current_event", current_event_mock): return_value = store._get_current_partitions() @@ -308,7 +310,9 @@ def test_get_partition_from_message(self, store): def test_get_faust_key(self, store): key_with_partition = b"\x13_..._THEACTUALKEY" - res = store._remove_partition_prefix_from_bigtable_key(key_with_partition) + res = store._remove_partition_prefix_from_bigtable_key( + key_with_partition + ) assert res == b"THEACTUALKEY" def test_get_key_with_partition(self, store): @@ -316,7 +320,10 @@ def test_get_key_with_partition(self, store): res = store._add_partition_prefix_to_key(self.TEST_KEY1, partition) extracted_partition = store._get_partition_from_bigtable_key(res) assert extracted_partition == partition - assert store._remove_partition_prefix_from_bigtable_key(res) == self.TEST_KEY1 + assert ( + store._remove_partition_prefix_from_bigtable_key(res) + == self.TEST_KEY1 + ) def test_partitions_for_key(self, store): store._get_current_partitions = MagicMock(return_value=[19]) @@ -325,9 +332,7 @@ def test_partitions_for_key(self, store): def test_get_keyerror(self, store): partition = 19 - store._get_current_partitions = MagicMock( - return_value=[partition] - ) + store._get_current_partitions = MagicMock(return_value=[partition]) store._bigtable_get = MagicMock(return_value=None) with pytest.raises(KeyError): store[self.TEST_KEY1.decode()] @@ -335,9 +340,7 @@ def test_get_keyerror(self, store): def test_get_with_known_partition(self, store): partition = 19 store._cache = None - store._get_current_partitions = MagicMock( - return_value=[partition] - ) + store._get_current_partitions = MagicMock(return_value=[partition]) # Scenario: Found store._bigtable_get = MagicMock(return_value=b"a_value") res = store._get(self.TEST_KEY1) @@ -375,17 +378,13 @@ def test_set(self, store): store._cache = None store._bigtable_set = MagicMock() store._set(self.TEST_KEY1, b"a_value") - store._bigtable_set.assert_called_once_with( - self.TEST_KEY1, b"a_value" - ) + store._bigtable_set.assert_called_once_with(self.TEST_KEY1, b"a_value") # Scenario: Cache active store._cache = {} store._set(self.TEST_KEY1, b"b_value") assert store._cache[self.TEST_KEY1] == b"b_value" - store._bigtable_set.assert_called_with( - self.TEST_KEY1, b"b_value" - ) + store._bigtable_set.assert_called_with(self.TEST_KEY1, b"b_value") def test_del(self, store): # Scenario: No cache @@ -423,28 +422,44 @@ def test_active_partitions(self, store): all_res = list(res) assert list(range(store.app.conf.topic_partitions)) == all_res - def test_iteritems_with_cache(self, store): + def test_iteritems(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) - store._cache.flush = MagicMock(wraps=store._cache.flush) - store._cache.fill = MagicMock() store.bt_table.read_rows = MagicMock() + store._mutation_buffer = None + store._cache = {} _ = sorted(store._iteritems()) - store._cache.flush.assert_called_once() - store._cache.fill.assert_called_once() - _ = sorted(store._iteritems()) - store.bt_table.read_rows.assert_not_called() + store.bt_table.read_rows.assert_called_once() - def test_iteritems(self, store): + def test_iteritems_with_mutations(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) - store.bt_table.read_rows = MagicMock() - store._cache._value_cache = None - store._cache.flush = MagicMock(wraps=store._cache.flush) - - _ = sorted(store._iteritems()) - store._cache.flush.assert_called_once() - _ = sorted(store._iteritems()) - store.bt_table.read_rows.assert_called() + store._mutation_buffer = { + self.TEST_KEY1: ("doesn't matter", b"a_value"), + self.TEST_KEY2: ("doesn't matter", None), + } + store.bt_table.read_rows = MagicMock( + return_value=[ + MagicMock( + row_key=self.TEST_KEY1, + to_dict=MagicMock( + return_value={"x": [MagicMock(value=b"1")]} + ), + commit=MagicMock(), + ), + MagicMock( + row_key=self.TEST_KEY2, + to_dict=MagicMock( + return_value={"x": [MagicMock(value=b"this is overwritten")]} + ), + commit=MagicMock(), + ), + ] + ) + res = sorted(store._iteritems()) + store.bt_table.read_rows.assert_called_once() + assert res == [(self.TEST_KEY1, b"a_value")] + assert store._cache.get(self.TEST_KEY1) == b"a_value" + assert store._cache.get(self.TEST_KEY2) is None def test_iterkeys(self, store): values = [("K1", "V1"), ("K2", "V2")] From 9dd31f998e1940c9e94b9de3810713f4f873fc58 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 15:58:56 +0200 Subject: [PATCH 487/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20testcase=20for?= =?UTF-8?q?=20inserting=20new=20offset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 37 +++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 2d2cfb676..043b7bbad 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -482,15 +482,40 @@ def test_get_offset_key(self, store): def test_set_persisted_offset(self, store): tp = TP("a_topic", 19) - store._cache.submit_mutation = MagicMock( - wraps=store._cache.submit_mutation - ) - store._cache.flush_if_timer_over = MagicMock(return_value=False) expected_offset_key = store.get_offset_key(tp).encode() + store._bigtable_set = MagicMock() + store.bt_table.mutate_rows = MagicMock() + store._mutation_buffer = None + + store.set_persisted_offset(tp, 123) + store._bigtable_set.called_once_with(expected_offset_key, b"123", no_key_translation=True) + store.bt_table.mutate_rows.assert_not_called() + + store._bigtable_set = MagicMock() + store._mutation_buffer = {} + store._mutation_size = 0 + store.set_persisted_offset(tp, 123) + store._bigtable_set.assert_not_called() + store.bt_table.mutate_rows.assert_not_called() + + store._bigtable_set = MagicMock() + store._mutation_buffer = { + + self.TEST_KEY1: ("doesn't matter", b"a_value"), + self.TEST_KEY2: ("doesn't matter", None), + self.TEST_KEY3: ("doesn't matter", b"c_value"), + } + mutations = [ + r[0] for r in store._mutation_buffer.copy().values() + ] + store._num_mutations = 999999999999999999999999999999999999999999 store.set_persisted_offset(tp, 123) - store._cache.submit_mutation.assert_called_with( - expected_offset_key, str(123).encode() + store._bigtable_set.called_once_with( + expected_offset_key, + b"123", + no_key_translation=True ) + store.bt_table.mutate_rows.assert_called_once_with(mutations) def test_apply_changelog_batch(self, store): row_mock = MagicMock() From 805b86ac2bedc9ba0d6cbc7ccba9267f58783e6c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Sep 2023 15:59:25 +0200 Subject: [PATCH 488/616] =?UTF-8?q?=E2=9C=A8=20refactored=20set=5Fpersited?= =?UTF-8?q?=5Foffset=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- faust/stores/bigtable.py | 58 +++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index beb5a15ad..9fead3f8a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -426,6 +426,34 @@ def persisted_offset(self, tp: TP) -> Optional[int]: offset = self._bigtable_get(offset_key, no_key_translation=True) return int(offset) if offset is not None else None + def _flush_mutation_buffer(self, offset: int, offset_key): + mutations = [ + r[0] for r in self._mutation_buffer.copy().values() + ] + response = self.bt_table.mutate_rows(mutations) + + for i, status in enumerate(response): + if status.code != 0: + raise Exception( + f"Failed to commit mutation number {i}" + ) + + self._mutation_buffer = {} + self.log.info( + f"Committed {self._num_mutations} mutations to BigTableStore for table {self.table.name}" + ) + self._bigtable_set( + offset_key, str(offset).encode(), no_key_translation=True + ) + self._num_mutations = 0 + + + def _should_flush_mutations(self) -> bool: + return ( + self._mutation_buffer is not None + and self._num_mutations > self._mutation_buffer_size + ) + def set_persisted_offset(self, tp: TP, offset: int) -> None: """Set the last persisted offset for this table. @@ -436,31 +464,13 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - self._bigtable_set( - offset_key, str(offset).encode(), no_key_translation=True - ) - - if ( - self._mutation_buffer is not None - and self._num_mutations > self._mutation_buffer_size - ): - mutations = [ - r[0] for r in self._mutation_buffer.copy().values() - ] - response = self.bt_table.mutate_rows(mutations) - - for i, status in enumerate(response): - if status.code != 0: - raise Exception( - f"Failed to commit mutation number {i}" - ) - self._mutation_buffer = {} - self.log.info( - f"Committed {self._num_mutations} mutations to BigTableStore for table {self.table.name}" + if self._should_flush_mutations(): + self._flush_mutation_buffer(offset, offset_key) + elif self._mutation_buffer is None: + self._bigtable_set( + offset_key, str(offset).encode(), no_key_translation=True ) - self._num_mutations = 0 - - except Exception as e: + except Exception: self.log.error( f"Failed to commit offset for {self.table.name}" " -> will cause additional changelogs if restart happens" From f7b43e24fa5b270dffd3552f49f14aeaa1d79bd8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 22 Sep 2023 15:14:43 +0200 Subject: [PATCH 489/616] =?UTF-8?q?=F0=9F=A9=B9=20fixed=20apply=20changelo?= =?UTF-8?q?g=20batch=20in=20BT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/stores/test_bigtable.py | 85 ++---------------------------- 1 file changed, 4 insertions(+), 81 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 043b7bbad..ccd46b31e 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -522,8 +522,8 @@ def test_apply_changelog_batch(self, store): row_mock.delete = MagicMock() row_mock.set_cell = MagicMock() store.bt_table.direct_row = MagicMock(return_value=row_mock) - store.bt_table.mutate_rows = MagicMock() - store._bigtable_mutate = MagicMock() + store._bigtable_del = MagicMock() + store._bigtable_set = MagicMock() store.set_persisted_offset = MagicMock() store._cache.submit_mutation = MagicMock() store._cache.set = MagicMock() @@ -549,83 +549,6 @@ def __init__(self, message): TestEvent(TestMessage("a", self.TEST_KEY1, tp, 2)), ] store.apply_changelog_batch(messages, lambda x: x, lambda x: x) - assert store._bigtable_mutate.call_count == 5 + assert store._bigtable_set.call_count == 4 + assert store._bigtable_del.call_count == 1 assert store.set_persisted_offset.call_count == 2 - - def test_revoke_partitions(self, store): - store._cache.delete_partition = MagicMock() - TP1 = MagicMock() - TP1.partition = 1 - TP2 = MagicMock() - TP2.partition = 2 - - store.revoke_partitions({TP1, TP2}) - store._cache.delete_partition.assert_any_call(1) - store._cache.delete_partition.assert_any_call(2) - - def test_mutation_flush(self, store): - # Mocks - TEST_TP = TP("a", 0) - TEST_OFFSET = 0 - OFFSET_KEY = store.get_offset_key(TEST_TP).encode() - - def real_set_scenario(key, value, offset): - store._set(key, value) - store._bigtable_mutate.reset_mock() - store.set_persisted_offset(TEST_TP, offset) - return offset + 1 - - def real_del_scenario(key, offset): - store._del(key) - store._cache.submit_mutation.reset_mock() - store.set_persisted_offset(TEST_TP, offset) - return offset + 1 - - def assert_offset_persisted(offset): - store._cache.submit_mutation.assert_called_with( - OFFSET_KEY, str(offset).encode() - ) - - row_mock = MagicMock() - row_mock.row_key = b"\x00TEST_KEY1" - - store.bt_table.direct_row = MagicMock(return_value=row_mock) - store._bigtable_mutate = MagicMock(wraps=store._bigtable_mutate) - store._cache.submit_mutation = MagicMock( - wraps=store._cache.submit_mutation - ) - - partition = 0 - faust.stores.bigtable.get_current_partition = MagicMock( - return_value=partition - ) - store._cache.set_partition = MagicMock() - res = store._contains(self.TEST_KEY1) - store._bigtable_mutate.assert_not_called() - assert res is False - - TEST_OFFSET = real_set_scenario( - self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET - ) - res = store._contains(self.TEST_KEY1) - assert_offset_persisted(TEST_OFFSET - 1) - assert res is True - - TEST_OFFSET = real_set_scenario( - self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET - ) - res = store._contains(self.TEST_KEY1) - assert res is True - assert_offset_persisted(TEST_OFFSET - 1) - - TEST_OFFSET = real_del_scenario(self.TEST_KEY1, TEST_OFFSET) - res = store._contains(self.TEST_KEY1) - assert res is False - assert_offset_persisted(TEST_OFFSET - 1) - - TEST_OFFSET = real_set_scenario( - self.TEST_KEY1, self.TEST_KEY1, TEST_OFFSET - ) - res = store._contains(self.TEST_KEY1) - assert_offset_persisted(TEST_OFFSET - 1) - assert res is True From 37287f1a63f122b460d6125efcb332a44ac7095b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 22 Sep 2023 15:17:16 +0200 Subject: [PATCH 490/616] reduced number for max num mutations --- tests/unit/stores/test_bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index ccd46b31e..87958bb65 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -508,7 +508,7 @@ def test_set_persisted_offset(self, store): mutations = [ r[0] for r in store._mutation_buffer.copy().values() ] - store._num_mutations = 999999999999999999999999999999999999999999 + store._num_mutations = 9999999999999999999999999999999 store.set_persisted_offset(tp, 123) store._bigtable_set.called_once_with( expected_offset_key, From b8270ae2f92e9506ced8ea56abb38b92995136c4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 25 Sep 2023 16:47:36 +0200 Subject: [PATCH 491/616] check offset key in read rows --- faust/stores/bigtable.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9fead3f8a..1acf4eb45 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -330,6 +330,10 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): + # abort it key is an offset key + if self.offset_key_prefix in row.row_key: + continue + if self._mutation_buffer is not None: # Yield the mutation first if it exists mutation_row, mutation_val = self._mutation_buffer.get( @@ -338,6 +342,8 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: key = self._remove_partition_prefix_from_bigtable_key( row.row_key ) + + if mutation_val is not None: if self._cache is not None: self._cache[key] = mutation_val From d23e5d6ebee2b3235ee60832c65a98ebe409e8ab Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 26 Sep 2023 08:19:08 +0200 Subject: [PATCH 492/616] decode offset key before iteration --- faust/stores/bigtable.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1acf4eb45..861129b96 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -327,11 +327,12 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: start_key=start_key, end_key=end_key ) + offset_key_prefix = self.offset_key_prefix.encode() for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter ): # abort it key is an offset key - if self.offset_key_prefix in row.row_key: + if offset_key_prefix in row.row_key: continue if self._mutation_buffer is not None: From 88de63b04c314609ae2e4fce4ced44c2cab70c57 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 26 Sep 2023 09:31:02 +0200 Subject: [PATCH 493/616] reduce mutation buffer and increase cache --- faust/stores/bigtable.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 861129b96..fe2286ba4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,6 +1,4 @@ """BigTable storage.""" -import asyncio -import gc import logging import time import traceback @@ -104,8 +102,8 @@ def _set_options(self, options) -> None: ) # TODO - make this a configurable option - self._cache = LRUCache(limit=10_000) - self._mutation_buffer_size = 10_000 + self._cache = LRUCache(limit=100_000) + self._mutation_buffer_size = 1_000 self._mutation_buffer = {} self._num_mutations = 0 From 9a11ef4b9095cf7b502783e0895df7f3af1e64e4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 26 Sep 2023 09:36:10 +0200 Subject: [PATCH 494/616] faster changelog read --- faust/stores/bigtable.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fe2286ba4..34996784e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -498,6 +498,8 @@ def apply_changelog_batch( of a changelog event. """ tp_offsets: Dict[TP, int] = {} + mutation_buffer_size = self._mutation_buffer_size + self._mutation_buffer_size = 50_000 for event in batch: tp, offset = event.message.tp, event.message.offset tp_offsets[tp] = ( @@ -516,6 +518,7 @@ def apply_changelog_batch( for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) + self._mutation_buffer_size = mutation_buffer_size async def backup_partition( self, From 613ff269ab5adeaa98a8267a489f0a8c32475c13 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 26 Sep 2023 10:41:31 +0200 Subject: [PATCH 495/616] switch off mutattion buffer --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 34996784e..138ccb104 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -104,7 +104,7 @@ def _set_options(self, options) -> None: # TODO - make this a configurable option self._cache = LRUCache(limit=100_000) self._mutation_buffer_size = 1_000 - self._mutation_buffer = {} + self._mutation_buffer = None self._num_mutations = 0 def _bigtable_setup(self, table, options: Dict[str, Any]): From 37f8e8cbf8a4977b40a1c6533fe8f3a1884775c1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 26 Sep 2023 11:40:05 +0200 Subject: [PATCH 496/616] always use directrow --- faust/stores/bigtable.py | 41 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 138ccb104..553605107 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -103,8 +103,8 @@ def _set_options(self, options) -> None: # TODO - make this a configurable option self._cache = LRUCache(limit=100_000) - self._mutation_buffer_size = 1_000 - self._mutation_buffer = None + self._mutation_buffer_size = 90_000 + self._mutation_buffer = {} self._num_mutations = 0 def _bigtable_setup(self, table, options: Dict[str, Any]): @@ -219,13 +219,7 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): ] for key in keys: - row = None - if self._mutation_buffer is not None: - row = self._mutation_buffer.get(key, (None, None))[0] - - if row is None: - row = self.bt_table.direct_row(key) - + row = self.bt_table.direct_row(key) row.delete() if self._mutation_buffer is not None: self._set_mutation(key, row, None) @@ -233,7 +227,7 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): row.commit() def _bigtable_set( - self, key: bytes, value: bytes, no_key_translation=False + self, key: bytes, value: bytes, no_key_translation=False, no_mutate=False ): keys = ( [key] @@ -242,12 +236,7 @@ def _bigtable_set( ) assert len(keys) == 1 key = keys[0] - row = None - if self._mutation_buffer is not None: - row = self._mutation_buffer.get(key, (None, None))[0] - - if row is None: - row = self.bt_table.direct_row(key) + row = self.bt_table.direct_row(key) row.set_cell( COLUMN_FAMILY_ID, @@ -255,7 +244,7 @@ def _bigtable_set( value, ) - if self._mutation_buffer is not None: + if self._mutation_buffer is not None and not no_mutate: self._set_mutation(key, row, value) else: row.commit() @@ -388,11 +377,6 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - - # Check cache - if self._cache is not None and key in self._cache: - return self._cache[key] is not None - return self._get(key) is not None except Exception as ex: @@ -432,6 +416,9 @@ def persisted_offset(self, tp: TP) -> Optional[int]: return int(offset) if offset is not None else None def _flush_mutation_buffer(self, offset: int, offset_key): + self._bigtable_set( + offset_key, str(offset).encode(), no_key_translation=True + ) mutations = [ r[0] for r in self._mutation_buffer.copy().values() ] @@ -447,16 +434,18 @@ def _flush_mutation_buffer(self, offset: int, offset_key): self.log.info( f"Committed {self._num_mutations} mutations to BigTableStore for table {self.table.name}" ) - self._bigtable_set( - offset_key, str(offset).encode(), no_key_translation=True - ) self._num_mutations = 0 + self._last_flush_time = time.time() def _should_flush_mutations(self) -> bool: return ( self._mutation_buffer is not None - and self._num_mutations > self._mutation_buffer_size + and ( + self._num_mutations > self._mutation_buffer_size + or self._last_flush_time is None + or self._last_flush_time < time.time() - self._flush_interval + ) ) def set_persisted_offset(self, tp: TP, offset: int) -> None: From 52c3082ebcac2eeeb03a171599a7b7a5941f35b5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 26 Sep 2023 11:42:23 +0200 Subject: [PATCH 497/616] removed no mutate, added timer for flush --- faust/stores/bigtable.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 553605107..fa2df99fa 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -106,6 +106,8 @@ def _set_options(self, options) -> None: self._mutation_buffer_size = 90_000 self._mutation_buffer = {} self._num_mutations = 0 + self._flush_interval = 600 # 10 minutes + self._last_flush_time = None def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) @@ -227,7 +229,7 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): row.commit() def _bigtable_set( - self, key: bytes, value: bytes, no_key_translation=False, no_mutate=False + self, key: bytes, value: bytes, no_key_translation=False ): keys = ( [key] @@ -244,7 +246,7 @@ def _bigtable_set( value, ) - if self._mutation_buffer is not None and not no_mutate: + if self._mutation_buffer is not None: self._set_mutation(key, row, value) else: row.commit() From f9bc4cee5bbacede957cb732d1ec0d773242d19f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 26 Sep 2023 11:45:03 +0200 Subject: [PATCH 498/616] fixed unit tests and removed unused imports --- faust/stores/bigtable.py | 1 - tests/unit/stores/test_bigtable.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fa2df99fa..58710b190 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -9,7 +9,6 @@ Iterable, Iterator, Optional, - Set, Tuple, Union, ) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 87958bb65..dfa1c489e 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1,9 +1,8 @@ -import time from unittest.mock import MagicMock, call, patch +import time import pytest -import faust from faust.stores.bigtable import ( BigTableStore, ) @@ -483,6 +482,7 @@ def test_get_offset_key(self, store): def test_set_persisted_offset(self, store): tp = TP("a_topic", 19) expected_offset_key = store.get_offset_key(tp).encode() + store._last_flush_time = time.time() store._bigtable_set = MagicMock() store.bt_table.mutate_rows = MagicMock() store._mutation_buffer = None From 3ddbd9be1c60b84b277cc4419320839d6d3800c8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 26 Sep 2023 14:17:33 +0200 Subject: [PATCH 499/616] use no caches at all --- faust/stores/bigtable.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 58710b190..7352f86c6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -101,9 +101,9 @@ def _set_options(self, options) -> None: ) # TODO - make this a configurable option - self._cache = LRUCache(limit=100_000) + self._cache = None # LRUCache(limit=100_000) self._mutation_buffer_size = 90_000 - self._mutation_buffer = {} + self._mutation_buffer = None self._num_mutations = 0 self._flush_interval = 600 # 10 minutes self._last_flush_time = None @@ -328,12 +328,12 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: mutation_row, mutation_val = self._mutation_buffer.get( row.row_key, (None, None) ) - key = self._remove_partition_prefix_from_bigtable_key( - row.row_key - ) if mutation_val is not None: + key = self._remove_partition_prefix_from_bigtable_key( + row.row_key + ) if self._cache is not None: self._cache[key] = mutation_val yield key, mutation_val From b5e342c04d16fb9eca7df7c84f1e1e5d337f45e1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 27 Sep 2023 13:54:38 +0200 Subject: [PATCH 500/616] faster iteritems --- faust/stores/bigtable.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7352f86c6..0e107d21a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -305,7 +305,9 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: active_partitions = list(self._active_partitions()) row_set = RowSet() - if not (self.table.is_global or self.table.use_partitioner): + + need_all_keys = (self.table.is_global or self.table.use_partitioner) + if not need_all_keys: for partition in active_partitions: prefix = self._add_partition_prefix_to_key(b"", partition) start_key = prefix + b"\x00" @@ -320,7 +322,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: row_set=row_set, filter_=self.row_filter ): # abort it key is an offset key - if offset_key_prefix in row.row_key: + if not need_all_keys and offset_key_prefix in row.row_key: continue if self._mutation_buffer is not None: From 5eee149904607d67380d994eb0e45251a178f57a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 27 Sep 2023 13:55:13 +0200 Subject: [PATCH 501/616] faster iteritems --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0e107d21a..66d7b4128 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -322,7 +322,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: row_set=row_set, filter_=self.row_filter ): # abort it key is an offset key - if not need_all_keys and offset_key_prefix in row.row_key: + if need_all_keys and offset_key_prefix in row.row_key: continue if self._mutation_buffer is not None: From abd1612b4f1c08a2c6eff771035cabc40e8f67ff Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 27 Sep 2023 14:47:25 +0200 Subject: [PATCH 502/616] add exception for existing table exception --- faust/stores/bigtable.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 66d7b4128..e53c50113 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -21,6 +21,7 @@ from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.table import Table + from google.api_core.exceptions import AlreadyExists # Make one container for all imported functions # This is needed for testing and controlling the imports @@ -119,15 +120,23 @@ def _bigtable_setup(self, table, options: Dict[str, Any]): ) self.bt_table: BT.Table = self.instance.table(self.bt_table_name) if not self.bt_table.exists(): + try: + self.bt_table.create( + column_families={ + COLUMN_FAMILY_ID: BT.column_family.MaxVersionsGCRule(1) + } + ) + except AlreadyExists: + logging.getLogger(__name__).info( + "BigTableStore: Using existing " + f"bigtablestore with {self.bt_table_name=} for {table.name} " + f"with {options=} due to AlreadyExists exception" + ) + return logging.getLogger(__name__).info( f"BigTableStore: Making new bigtablestore with {self.bt_table_name=} " f"for {table.name} with {options=}" ) - self.bt_table.create( - column_families={ - COLUMN_FAMILY_ID: BT.column_family.MaxVersionsGCRule(1) - } - ) else: logging.getLogger(__name__).info( "BigTableStore: Using existing " From 1bdb7b05d154c3a1ea4641604ebe6cd55c2283ad Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 28 Sep 2023 14:43:36 +0200 Subject: [PATCH 503/616] log which partitions were used for iteritems --- faust/stores/bigtable.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e53c50113..294fad82e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -364,7 +364,11 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: yield key, value end = time.time() - self.log.info(f"{self.table_name} _iteritems took {end - start}s") + self.log.info( + f"{self.table_name} _iteritems took {end - start}s " + f"with {need_all_keys=} ", + f"for partitions {active_partitions}" + ) except Exception as ex: self.log.error( f"FaustBigtableException Error " From 4141eca1a5d69c939bfefdae1646e68983af0859 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 28 Sep 2023 16:08:17 +0200 Subject: [PATCH 504/616] removed comma --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 294fad82e..bce77b25e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -366,7 +366,7 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: end = time.time() self.log.info( f"{self.table_name} _iteritems took {end - start}s " - f"with {need_all_keys=} ", + f"with {need_all_keys=} " f"for partitions {active_partitions}" ) except Exception as ex: From f8f972f03b328f6e6795e9b30969e97f46f26a26 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 28 Sep 2023 17:30:22 +0200 Subject: [PATCH 505/616] fill value cache on startup --- faust/stores/bigtable.py | 119 ++++++++++++++++++++++++--------------- 1 file changed, 75 insertions(+), 44 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bce77b25e..4b065e28a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -8,12 +8,14 @@ Dict, Iterable, Iterator, + List, Optional, Tuple, Union, ) try: # pragma: no cover + from google.api_core.exceptions import AlreadyExists from google.cloud.bigtable import column_family from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance @@ -21,7 +23,6 @@ from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.table import Table - from google.api_core.exceptions import AlreadyExists # Make one container for all imported functions # This is needed for testing and controlling the imports @@ -61,7 +62,7 @@ class BigTableStore(base.SerializedStore): client: BT.Client instance: BT.Instance bt_table: BT.Table - _cache: Optional[LRUCache] + _value_cache: Optional[LRUCache] BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_INSTANCE_KEY = "bt_instance_key" @@ -102,7 +103,11 @@ def _set_options(self, options) -> None: ) # TODO - make this a configurable option - self._cache = None # LRUCache(limit=100_000) + self._value_cache_enable = True + if self._value_cache_enable: + self._value_cache: Dict[bytes, bytes] = {} + else: + self._value_cache = None self._mutation_buffer_size = 90_000 self._mutation_buffer = None self._num_mutations = 0 @@ -261,14 +266,14 @@ def _bigtable_set( def _get(self, key: bytes) -> Optional[bytes]: try: - if self._cache is not None: - if key in self._cache: - return self._cache.get(key) + if self._value_cache is not None: + if key in self._value_cache: + return self._value_cache.get(key) value = self._bigtable_get(key) - if self._cache is not None: - self._cache[key] = value + if self._value_cache is not None: + self._value_cache[key] = value if value is not None: self.log.info(f"Found value for key in table {key=} {value=}") @@ -282,8 +287,8 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - if self._cache is not None: - self._cache[key] = value + if self._value_cache is not None: + self._value_cache[key] = value self._bigtable_set(key, value) except Exception as ex: @@ -296,8 +301,8 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: - if self._cache is not None: - self._cache[key] = None + if self._value_cache is not None: + self._value_cache[key] = None self._bigtable_del(key) except Exception as ex: @@ -308,16 +313,16 @@ def _del(self, key: bytes) -> None: ) raise ex - def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: + def _bigtable_iteritems(self, partitions): try: start = time.time() - active_partitions = list(self._active_partitions()) - + if partitions is None: + partitions = list(self._active_partitions()) row_set = RowSet() - need_all_keys = (self.table.is_global or self.table.use_partitioner) + need_all_keys = self.table.is_global or self.table.use_partitioner if not need_all_keys: - for partition in active_partitions: + for partition in partitions: prefix = self._add_partition_prefix_to_key(b"", partition) start_key = prefix + b"\x00" end_key = prefix + b"\xff" @@ -340,34 +345,21 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: row.row_key, (None, None) ) - if mutation_val is not None: key = self._remove_partition_prefix_from_bigtable_key( row.row_key ) - if self._cache is not None: - self._cache[key] = mutation_val yield key, mutation_val continue - if mutation_row is not None: - if self._cache is not None: - self._cache[key] = None - continue - value = self.bigtable_exrtact_row_data(row) key = self._remove_partition_prefix_from_bigtable_key( row.row_key ) - if self._cache is not None: - self._cache[key] = value yield key, value - end = time.time() self.log.info( f"{self.table_name} _iteritems took {end - start}s " - f"with {need_all_keys=} " - f"for partitions {active_partitions}" ) except Exception as ex: self.log.error( @@ -377,6 +369,17 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: ) raise ex + def _iteritems( + self, partitions: Optional[List[int]] = None + ) -> Iterator[Tuple[bytes, bytes]]: + if self._value_cache is not None: + # We always want to return the whole cache + for key, value in self._value_cache.items(): + if value is not None: + yield key, value + else: + yield from self._bigtable_iteritems(partitions) + def _iterkeys(self) -> Iterator[bytes]: for row in self._iteritems(): yield row[0] @@ -435,16 +438,12 @@ def _flush_mutation_buffer(self, offset: int, offset_key): self._bigtable_set( offset_key, str(offset).encode(), no_key_translation=True ) - mutations = [ - r[0] for r in self._mutation_buffer.copy().values() - ] + mutations = [r[0] for r in self._mutation_buffer.copy().values()] response = self.bt_table.mutate_rows(mutations) for i, status in enumerate(response): if status.code != 0: - raise Exception( - f"Failed to commit mutation number {i}" - ) + raise Exception(f"Failed to commit mutation number {i}") self._mutation_buffer = {} self.log.info( @@ -453,15 +452,11 @@ def _flush_mutation_buffer(self, offset: int, offset_key): self._num_mutations = 0 self._last_flush_time = time.time() - def _should_flush_mutations(self) -> bool: - return ( - self._mutation_buffer is not None - and ( - self._num_mutations > self._mutation_buffer_size - or self._last_flush_time is None - or self._last_flush_time < time.time() - self._flush_interval - ) + return self._mutation_buffer is not None and ( + self._num_mutations > self._mutation_buffer_size + or self._last_flush_time is None + or self._last_flush_time < time.time() - self._flush_interval ) def set_persisted_offset(self, tp: TP, offset: int) -> None: @@ -548,3 +543,39 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") + + def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: + # remove all keys for partitions we are no longer from the value cache + if self._value_cache is not None: + partitions = [tp.partition for tp in tps] + for key in self._value_cache.copy().keys(): + if self._get_partition_from_bigtable_key(key) in partitions: + del self._value_cache[key] + + async def assign_partitions( + self, table: CollectionT, tps: Set[TP], generation_id: int = 0 + ) -> None: + # Fill cache with all keys for the partitions we are assigned + partitions = [tp.partition for tp in tps] + if self._value_cache is not None: + for k, v in self._bigtable_iteritems(partitions=partitions): + self._value_cache[k] = v + + async def on_rebalance( + self, + assigned: Set[TP], + revoked: Set[TP], + newly_assigned: Set[TP], + generation_id: int = 0, + ) -> None: + """Rebalance occurred. + + Arguments: + assigned: Set of all assigned topic partitions. + revoked: Set of newly revoked topic partitions. + newly_assigned: Set of newly assigned topic partitions, + for which we were not assigned the last time. + generation_id: the metadata generation identifier for the re-balance + """ + self.revoke_partitions(self.table, revoked) + await self.assign_partitions(self.table, newly_assigned, generation_id) From 2015381e601ed6f780eeb18de264332edbfe4da7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 28 Sep 2023 17:41:41 +0200 Subject: [PATCH 506/616] added set --- faust/stores/bigtable.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4b065e28a..469623db8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -10,6 +10,7 @@ Iterator, List, Optional, + Set, Tuple, Union, ) @@ -319,6 +320,10 @@ def _bigtable_iteritems(self, partitions): if partitions is None: partitions = list(self._active_partitions()) row_set = RowSet() + self.log.info( + f"BigtableStore: Iterating over {len(partitions)} partitions " + f"for table {self.table_name}" + ) need_all_keys = self.table.is_global or self.table.use_partitioner if not need_all_keys: @@ -358,9 +363,7 @@ def _bigtable_iteritems(self, partitions): ) yield key, value end = time.time() - self.log.info( - f"{self.table_name} _iteritems took {end - start}s " - ) + self.log.info(f"{self.table_name} _iteritems took {end - start}s ") except Exception as ex: self.log.error( f"FaustBigtableException Error " From a7260259672805f39b8eec40c31c45900cd7fd3a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 28 Sep 2023 18:10:49 +0200 Subject: [PATCH 507/616] added logging and use proper partitions --- faust/stores/bigtable.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 469623db8..d80454653 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -363,7 +363,10 @@ def _bigtable_iteritems(self, partitions): ) yield key, value end = time.time() - self.log.info(f"{self.table_name} _iteritems took {end - start}s ") + self.log.info( + f"{self.table_name} _bigtable_iteritems took {end - start}s " + f"for partitions {partitions}" + ) except Exception as ex: self.log.error( f"FaustBigtableException Error " @@ -547,19 +550,19 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") - def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: - # remove all keys for partitions we are no longer from the value cache - if self._value_cache is not None: - partitions = [tp.partition for tp in tps] - for key in self._value_cache.copy().keys(): - if self._get_partition_from_bigtable_key(key) in partitions: - del self._value_cache[key] - async def assign_partitions( self, table: CollectionT, tps: Set[TP], generation_id: int = 0 ) -> None: # Fill cache with all keys for the partitions we are assigned - partitions = [tp.partition for tp in tps] + partitions = set() + + standby_tps = self.app.assignor.assigned_standbys() + my_topics = table.changelog_topic.topics + for tp in tps: + if tp.topic in my_topics and tp not in standby_tps: + partitions.add(tp.partition) + if len(partitions) == 0: + return if self._value_cache is not None: for k, v in self._bigtable_iteritems(partitions=partitions): self._value_cache[k] = v @@ -580,5 +583,4 @@ async def on_rebalance( for which we were not assigned the last time. generation_id: the metadata generation identifier for the re-balance """ - self.revoke_partitions(self.table, revoked) await self.assign_partitions(self.table, newly_assigned, generation_id) From dabd88c801f0d1b5a58710806fab407ccd1915af Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 09:17:30 +0200 Subject: [PATCH 508/616] added keycache --- faust/stores/bigtable.py | 91 ++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d80454653..5299e76ee 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -39,7 +39,6 @@ class BT: except ImportError: # pragma: no cover BT = None # noqa -from mode.utils.collections import LRUCache from yarl import URL from faust.stores import base @@ -63,7 +62,8 @@ class BigTableStore(base.SerializedStore): client: BT.Client instance: BT.Instance bt_table: BT.Table - _value_cache: Optional[LRUCache] + _startup_cache: Optional[Dict[bytes, bytes]] + _key_cache: Optional[Set[bytes]] BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_INSTANCE_KEY = "bt_instance_key" @@ -104,11 +104,21 @@ def _set_options(self, options) -> None: ) # TODO - make this a configurable option - self._value_cache_enable = True - if self._value_cache_enable: - self._value_cache: Dict[bytes, bytes] = {} + self._startup_cache_enable = True + if self._startup_cache_enable: + self._startup_cache: Dict[bytes, bytes] = {} else: - self._value_cache = None + self._startup_cache = None + + # TODO - make this a configurable option + self._key_cache_enable = True + if self._key_cache_enable: + self._key_cache: Set[bytes] = set() + else: + self._key_cache = None + + # TODO - make this a configurable option + # and use the MutationBatcher class of bt self._mutation_buffer_size = 90_000 self._mutation_buffer = None self._num_mutations = 0 @@ -265,21 +275,15 @@ def _bigtable_set( else: row.commit() - def _get(self, key: bytes) -> Optional[bytes]: + def _get(self, key: bytes, invalidate_cache=True) -> Optional[bytes]: try: - if self._value_cache is not None: - if key in self._value_cache: - return self._value_cache.get(key) - - value = self._bigtable_get(key) - - if self._value_cache is not None: - self._value_cache[key] = value - - if value is not None: - self.log.info(f"Found value for key in table {key=} {value=}") - return value - return None + if self._startup_cache is not None: + if key in self._startup_cache: + if invalidate_cache: + return self._startup_cache.pop(key) + else: + return self._startup_cache[key] + return self._bigtable_get(key) except Exception as ex: self.log.error( f"Error in get for table {self.table_name} exception {ex} key {key}" @@ -288,9 +292,8 @@ def _get(self, key: bytes) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - if self._value_cache is not None: - self._value_cache[key] = value - + if self._key_cache is not None: + self._key_cache.add(key) self._bigtable_set(key, value) except Exception as ex: self.log.error( @@ -302,9 +305,10 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: - if self._value_cache is not None: - self._value_cache[key] = None - + if self._startup_cache is not None: + self._startup_cache.pop(key, None) + if self._key_cache is not None: + self._key_cache.discard(key) self._bigtable_del(key) except Exception as ex: self.log.error( @@ -378,17 +382,15 @@ def _bigtable_iteritems(self, partitions): def _iteritems( self, partitions: Optional[List[int]] = None ) -> Iterator[Tuple[bytes, bytes]]: - if self._value_cache is not None: - # We always want to return the whole cache - for key, value in self._value_cache.items(): - if value is not None: - yield key, value - else: - yield from self._bigtable_iteritems(partitions) + yield from self._bigtable_iteritems(partitions) def _iterkeys(self) -> Iterator[bytes]: - for row in self._iteritems(): - yield row[0] + if self._key_cache is not None: + for key in self._key_cache: + yield key + else: + for row in self._iteritems(): + yield row[0] def _itervalues(self) -> Iterator[bytes]: for row in self._iteritems(): @@ -402,7 +404,10 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - return self._get(key) is not None + if self._key_cache is not None: + return key in self._key_cache + else: + return self._get(key, invalidate_cache=False) is not None except Exception as ex: self.log.error( @@ -554,18 +559,24 @@ async def assign_partitions( self, table: CollectionT, tps: Set[TP], generation_id: int = 0 ) -> None: # Fill cache with all keys for the partitions we are assigned - partitions = set() + if self._startup_cache is None and self._key_cache is None: + return + partitions = set() standby_tps = self.app.assignor.assigned_standbys() my_topics = table.changelog_topic.topics for tp in tps: if tp.topic in my_topics and tp not in standby_tps: partitions.add(tp.partition) + if len(partitions) == 0: return - if self._value_cache is not None: - for k, v in self._bigtable_iteritems(partitions=partitions): - self._value_cache[k] = v + + for k, v in self._bigtable_iteritems(partitions=partitions): + if self._startup_cache is not None: + self._startup_cache[k] = v + if self._key_cache is not None: + self._key_cache.add(k) async def on_rebalance( self, From 2f0da12181efdc04c45100306b3e0e118b6f1947 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 09:36:13 +0200 Subject: [PATCH 509/616] removed key cache from contains --- faust/stores/bigtable.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5299e76ee..441ee3306 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -404,10 +404,7 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - if self._key_cache is not None: - return key in self._key_cache - else: - return self._get(key, invalidate_cache=False) is not None + return self._get(key, invalidate_cache=False) is not None except Exception as ex: self.log.error( From 458ed9e234c3497cbe6eaf1ec6444c9dfa82cce1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 09:37:24 +0200 Subject: [PATCH 510/616] reduce contains --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 441ee3306..81524cc47 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -404,6 +404,8 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True + # We don't want to invalidate the cache here + # because it is very likely that we will need the value soon return self._get(key, invalidate_cache=False) is not None except Exception as ex: From b51806b8fb418f287426d91da0c693c27ffe204c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 11:02:04 +0200 Subject: [PATCH 511/616] added revoke partitons and utility function --- faust/stores/bigtable.py | 44 +++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 81524cc47..15c608d1b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -554,12 +554,27 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") - async def assign_partitions( - self, table: CollectionT, tps: Set[TP], generation_id: int = 0 - ) -> None: - # Fill cache with all keys for the partitions we are assigned + def _mutate_caches(self, partitions, operation="add"): + if operation == "add": + for k, v in self._bigtable_iteritems(partitions=partitions): + if self._startup_cache is not None: + self._startup_cache[k] = v + if self._key_cache is not None: + self._key_cache.add(k) + elif operation == "remove": + for k, v in self._bigtable_iteritems(partitions=partitions): + if self._startup_cache is not None: + self._startup_cache.pop(k, None) + if self._key_cache is not None: + self._key_cache.discard(k) + else: + raise ValueError(f"Invalid operation {operation}") + + def _get_active_changelogtopic_partitions( + self, table: CollectionT, tps: Set[TP] + ) -> Set[int]: if self._startup_cache is None and self._key_cache is None: - return + return set() partitions = set() standby_tps = self.app.assignor.assigned_standbys() @@ -567,15 +582,23 @@ async def assign_partitions( for tp in tps: if tp.topic in my_topics and tp not in standby_tps: partitions.add(tp.partition) + return partitions + def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: + # Fill cache with all keys for the partitions we are assigned + partitions = self._get_active_changelogtopic_partitions(table, tps) if len(partitions) == 0: return + self._mutate_caches(partitions, "remove") - for k, v in self._bigtable_iteritems(partitions=partitions): - if self._startup_cache is not None: - self._startup_cache[k] = v - if self._key_cache is not None: - self._key_cache.add(k) + async def assign_partitions( + self, table: CollectionT, tps: Set[TP], generation_id: int = 0 + ) -> None: + # Fill cache with all keys for the partitions we are assigned + partitions = self._get_active_changelogtopic_partitions(table, tps) + if len(partitions) == 0: + return + self._mutate_caches(partitions, "add") async def on_rebalance( self, @@ -593,4 +616,5 @@ async def on_rebalance( for which we were not assigned the last time. generation_id: the metadata generation identifier for the re-balance """ + self.revoke_partitions(self.table, revoked) await self.assign_partitions(self.table, newly_assigned, generation_id) From aa7650a2b9d6722440d5c5cee2f4cfeb2fb15d57 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 11:11:17 +0200 Subject: [PATCH 512/616] addde logging --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 15c608d1b..4f5799aa5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -589,6 +589,7 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: partitions = self._get_active_changelogtopic_partitions(table, tps) if len(partitions) == 0: return + self.log.info(f"Revoking partitions {partitions} for {table.name}") self._mutate_caches(partitions, "remove") async def assign_partitions( @@ -598,6 +599,7 @@ async def assign_partitions( partitions = self._get_active_changelogtopic_partitions(table, tps) if len(partitions) == 0: return + self.log.info(f"Assigning partitions {partitions} for {table.name}") self._mutate_caches(partitions, "add") async def on_rebalance( From 7b866257ace597fe4624ca919247dd7864735a93 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 11:34:27 +0200 Subject: [PATCH 513/616] remove revoke partition --- faust/stores/bigtable.py | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4f5799aa5..58ff77578 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -121,9 +121,7 @@ def _set_options(self, options) -> None: # and use the MutationBatcher class of bt self._mutation_buffer_size = 90_000 self._mutation_buffer = None - self._num_mutations = 0 self._flush_interval = 600 # 10 minutes - self._last_flush_time = None def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) @@ -554,21 +552,12 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") - def _mutate_caches(self, partitions, operation="add"): - if operation == "add": - for k, v in self._bigtable_iteritems(partitions=partitions): - if self._startup_cache is not None: - self._startup_cache[k] = v - if self._key_cache is not None: - self._key_cache.add(k) - elif operation == "remove": - for k, v in self._bigtable_iteritems(partitions=partitions): - if self._startup_cache is not None: - self._startup_cache.pop(k, None) - if self._key_cache is not None: - self._key_cache.discard(k) - else: - raise ValueError(f"Invalid operation {operation}") + def _fill_caches(self, partitions): + for k, v in self._bigtable_iteritems(partitions=partitions): + if self._startup_cache is not None: + self._startup_cache[k] = v + if self._key_cache is not None: + self._key_cache.add(k) def _get_active_changelogtopic_partitions( self, table: CollectionT, tps: Set[TP] @@ -584,14 +573,6 @@ def _get_active_changelogtopic_partitions( partitions.add(tp.partition) return partitions - def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: - # Fill cache with all keys for the partitions we are assigned - partitions = self._get_active_changelogtopic_partitions(table, tps) - if len(partitions) == 0: - return - self.log.info(f"Revoking partitions {partitions} for {table.name}") - self._mutate_caches(partitions, "remove") - async def assign_partitions( self, table: CollectionT, tps: Set[TP], generation_id: int = 0 ) -> None: @@ -600,7 +581,7 @@ async def assign_partitions( if len(partitions) == 0: return self.log.info(f"Assigning partitions {partitions} for {table.name}") - self._mutate_caches(partitions, "add") + self._fill_caches(partitions) async def on_rebalance( self, @@ -618,5 +599,4 @@ async def on_rebalance( for which we were not assigned the last time. generation_id: the metadata generation identifier for the re-balance """ - self.revoke_partitions(self.table, revoked) await self.assign_partitions(self.table, newly_assigned, generation_id) From cac2ddca76c74245df72009e9297598db6ca5a71 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 12:43:44 +0200 Subject: [PATCH 514/616] add first version of mutation buffer again --- faust/stores/bigtable.py | 78 ++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 47 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 58ff77578..9fb6afaff 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -24,6 +24,7 @@ from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.table import Table + from google.cloud.bigtable.batcher import MutationsBatcher # Make one container for all imported functions # This is needed for testing and controlling the imports @@ -119,9 +120,20 @@ def _set_options(self, options) -> None: # TODO - make this a configurable option # and use the MutationBatcher class of bt - self._mutation_buffer_size = 90_000 - self._mutation_buffer = None - self._flush_interval = 600 # 10 minutes + self._mutation_buffer_enable = True + if self._mutation_buffer_enable: + self._mutation_buffer_size = 20_000 + self._mutation_buffer = {} + self._flush_interval = 300 # 5 minutes + self._mutation_batcher = MutationsBatcher( + self.bt_table, + flush_count=self._mutation_buffer_size, + flush_interval=self._flush_interval, + batch_completed_callback=lambda x: self._mutation_buffer.clear(), + ) + + else: + self._mutation_buffer = None def _bigtable_setup(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) @@ -215,23 +227,26 @@ def _bigtable_get( ) -> Optional[bytes]: keys = [key] if no_key_translation else self._get_possible_bt_keys(key) for bt_key in keys: - if self._mutation_buffer is not None: - mutation_row, mutation_val = self._mutation_buffer.get( - bt_key, (None, None) - ) - if mutation_row is not None: - return mutation_val + if ( + self._mutation_buffer is not None + and bt_key in self._mutation_buffer + ): + return self._mutation_buffer[key] + for bt_key in keys: res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is None: return None return self.bigtable_exrtact_row_data(res) def _set_mutation( - self, key: bytes, row: DirectRow, value: Optional[bytes] + self, + key: bytes, + row: DirectRow, + value: Optional[bytes] ): - self._mutation_buffer[key] = (row, value) - self._num_mutations += 1 + self._mutation_buffer[key] = value + self._mutation_batcher.mutate(row) def _bigtable_del(self, key: bytes, no_key_translation=False): if no_key_translation: @@ -348,10 +363,7 @@ def _bigtable_iteritems(self, partitions): if self._mutation_buffer is not None: # Yield the mutation first if it exists - mutation_row, mutation_val = self._mutation_buffer.get( - row.row_key, (None, None) - ) - + mutation_val = self._mutation_buffer.get(row.row_key, None) if mutation_val is not None: key = self._remove_partition_prefix_from_bigtable_key( row.row_key @@ -442,31 +454,6 @@ def persisted_offset(self, tp: TP) -> Optional[int]: offset = self._bigtable_get(offset_key, no_key_translation=True) return int(offset) if offset is not None else None - def _flush_mutation_buffer(self, offset: int, offset_key): - self._bigtable_set( - offset_key, str(offset).encode(), no_key_translation=True - ) - mutations = [r[0] for r in self._mutation_buffer.copy().values()] - response = self.bt_table.mutate_rows(mutations) - - for i, status in enumerate(response): - if status.code != 0: - raise Exception(f"Failed to commit mutation number {i}") - - self._mutation_buffer = {} - self.log.info( - f"Committed {self._num_mutations} mutations to BigTableStore for table {self.table.name}" - ) - self._num_mutations = 0 - self._last_flush_time = time.time() - - def _should_flush_mutations(self) -> bool: - return self._mutation_buffer is not None and ( - self._num_mutations > self._mutation_buffer_size - or self._last_flush_time is None - or self._last_flush_time < time.time() - self._flush_interval - ) - def set_persisted_offset(self, tp: TP, offset: int) -> None: """Set the last persisted offset for this table. @@ -477,12 +464,9 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - if self._should_flush_mutations(): - self._flush_mutation_buffer(offset, offset_key) - elif self._mutation_buffer is None: - self._bigtable_set( - offset_key, str(offset).encode(), no_key_translation=True - ) + self._bigtable_set( + offset_key, str(offset).encode(), no_key_translation=True + ) except Exception: self.log.error( f"Failed to commit offset for {self.table.name}" From e6e503027eaa3212fe637893d6cb7060cf935810 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 12:54:08 +0200 Subject: [PATCH 515/616] moved setup of mutation buffer --- faust/stores/bigtable.py | 71 ++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9fb6afaff..7d81b4c21 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -18,13 +18,13 @@ try: # pragma: no cover from google.api_core.exceptions import AlreadyExists from google.cloud.bigtable import column_family + from google.cloud.bigtable.batcher import MutationsBatcher from google.cloud.bigtable.client import Client from google.cloud.bigtable.instance import Instance from google.cloud.bigtable.row import DirectRow from google.cloud.bigtable.row_filters import CellsColumnLimitFilter from google.cloud.bigtable.row_set import RowSet from google.cloud.bigtable.table import Table - from google.cloud.bigtable.batcher import MutationsBatcher # Make one container for all imported functions # This is needed for testing and controlling the imports @@ -84,7 +84,9 @@ def __init__( ) -> None: self._set_options(options) try: - self._bigtable_setup(table, options) + self._setup_bigtable(table, options) + self._setup_caches() + self._setup_mutation_batcher() except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -94,30 +96,7 @@ def __init__( def default_translator(user_key): return user_key - def _set_options(self, options) -> None: - self._all_options = options - self.table_name_generator = options.get( - BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name - ) - self.row_filter = BT.CellsColumnLimitFilter(1) - self.offset_key_prefix = options.get( - BigTableStore.BT_OFFSET_KEY_PREFIX, "==>offset_for_partition_" - ) - - # TODO - make this a configurable option - self._startup_cache_enable = True - if self._startup_cache_enable: - self._startup_cache: Dict[bytes, bytes] = {} - else: - self._startup_cache = None - - # TODO - make this a configurable option - self._key_cache_enable = True - if self._key_cache_enable: - self._key_cache: Set[bytes] = set() - else: - self._key_cache = None - + def _setup_mutation_batcher(self): # TODO - make this a configurable option # and use the MutationBatcher class of bt self._mutation_buffer_enable = True @@ -131,11 +110,42 @@ def _set_options(self, options) -> None: flush_interval=self._flush_interval, batch_completed_callback=lambda x: self._mutation_buffer.clear(), ) - else: self._mutation_buffer = None - def _bigtable_setup(self, table, options: Dict[str, Any]): + + + def _setup_caches(self, ): + + # TODO - make this a configurable option + self._startup_cache_enable = True + if self._startup_cache_enable: + self._startup_cache: Dict[bytes, bytes] = {} + else: + self._startup_cache = None + + # TODO - make this a configurable option + self._key_cache_enable = True + if self._key_cache_enable: + self._key_cache: Set[bytes] = set() + else: + self._key_cache = None + + + + def _set_options(self, options) -> None: + self._all_options = options + self.table_name_generator = options.get( + BigTableStore.BT_TABLE_NAME_GENERATOR_KEY, lambda t: t.name + ) + self.row_filter = BT.CellsColumnLimitFilter(1) + self.offset_key_prefix = options.get( + BigTableStore.BT_OFFSET_KEY_PREFIX, "==>offset_for_partition_" + ) + + + + def _setup_bigtable(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) self.client: BT.Client = BT.Client( options.get(BigTableStore.BT_PROJECT_KEY), @@ -303,7 +313,7 @@ def _get(self, key: bytes, invalidate_cache=True) -> Optional[bytes]: ) raise ex - def _set(self, key: bytes, value: Optional[bytes]) -> None: + def _set(self, key: bytes, value: Optional[bytes]) -> None:big try: if self._key_cache is not None: self._key_cache.add(key) @@ -490,8 +500,6 @@ def apply_changelog_batch( of a changelog event. """ tp_offsets: Dict[TP, int] = {} - mutation_buffer_size = self._mutation_buffer_size - self._mutation_buffer_size = 50_000 for event in batch: tp, offset = event.message.tp, event.message.offset tp_offsets[tp] = ( @@ -510,7 +518,6 @@ def apply_changelog_batch( for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) - self._mutation_buffer_size = mutation_buffer_size async def backup_partition( self, From bf9da1aa2bbc81c4a7cc01c31c0d3976d8ce3434 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 13:01:28 +0200 Subject: [PATCH 516/616] fixed intendation --- faust/stores/bigtable.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7d81b4c21..ebff9f03d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -113,23 +113,20 @@ def _setup_mutation_batcher(self): else: self._mutation_buffer = None - - def _setup_caches(self, ): + # TODO - make this a configurable option + self._startup_cache_enable = True + if self._startup_cache_enable: + self._startup_cache: Dict[bytes, bytes] = {} + else: + self._startup_cache = None - # TODO - make this a configurable option - self._startup_cache_enable = True - if self._startup_cache_enable: - self._startup_cache: Dict[bytes, bytes] = {} - else: - self._startup_cache = None - - # TODO - make this a configurable option - self._key_cache_enable = True - if self._key_cache_enable: - self._key_cache: Set[bytes] = set() - else: - self._key_cache = None + # TODO - make this a configurable option + self._key_cache_enable = True + if self._key_cache_enable: + self._key_cache: Set[bytes] = set() + else: + self._key_cache = None From e43c692e483d7f68acea48d7a700b3c2f1a21f78 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 13:01:44 +0200 Subject: [PATCH 517/616] fixed stuff --- faust/stores/bigtable.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ebff9f03d..3f0a21b52 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -128,8 +128,6 @@ def _setup_caches(self, ): else: self._key_cache = None - - def _set_options(self, options) -> None: self._all_options = options self.table_name_generator = options.get( From 6fc8b0eaa3297f3b6ec20acbb99505f1afb15624 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 13:02:30 +0200 Subject: [PATCH 518/616] fixed bug --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3f0a21b52..04d8f982f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -308,7 +308,7 @@ def _get(self, key: bytes, invalidate_cache=True) -> Optional[bytes]: ) raise ex - def _set(self, key: bytes, value: Optional[bytes]) -> None:big + def _set(self, key: bytes, value: Optional[bytes]) -> None: try: if self._key_cache is not None: self._key_cache.add(key) From b9907ebc49dd0812cd5c90deb0c2670dae63febf Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 13:02:47 +0200 Subject: [PATCH 519/616] formatting --- faust/stores/bigtable.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 04d8f982f..614ffa4a4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -113,7 +113,9 @@ def _setup_mutation_batcher(self): else: self._mutation_buffer = None - def _setup_caches(self, ): + def _setup_caches( + self, + ): # TODO - make this a configurable option self._startup_cache_enable = True if self._startup_cache_enable: @@ -138,8 +140,6 @@ def _set_options(self, options) -> None: BigTableStore.BT_OFFSET_KEY_PREFIX, "==>offset_for_partition_" ) - - def _setup_bigtable(self, table, options: Dict[str, Any]): self.bt_table_name = self.table_name_generator(table) self.client: BT.Client = BT.Client( @@ -245,10 +245,7 @@ def _bigtable_get( return self.bigtable_exrtact_row_data(res) def _set_mutation( - self, - key: bytes, - row: DirectRow, - value: Optional[bytes] + self, key: bytes, row: DirectRow, value: Optional[bytes] ): self._mutation_buffer[key] = value self._mutation_batcher.mutate(row) From 616fae5fc7f2be082595e24f85f38c3abb72cda4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 14:00:16 +0200 Subject: [PATCH 520/616] logging for flush and get only from table directly --- faust/stores/bigtable.py | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 614ffa4a4..f4f61c038 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -96,19 +96,26 @@ def __init__( def default_translator(user_key): return user_key + def _flush_mutation_buffer(self): + self.log.info( + f"Flushing mutation buffer with size {len(self._mutation_buffer)}" + f" for table {self.table_name}" + ) + self._mutation_buffer.clear() + def _setup_mutation_batcher(self): # TODO - make this a configurable option # and use the MutationBatcher class of bt self._mutation_buffer_enable = True if self._mutation_buffer_enable: - self._mutation_buffer_size = 20_000 + self._mutation_buffer_size = 10_000 self._mutation_buffer = {} self._flush_interval = 300 # 5 minutes self._mutation_batcher = MutationsBatcher( self.bt_table, flush_count=self._mutation_buffer_size, flush_interval=self._flush_interval, - batch_completed_callback=lambda x: self._mutation_buffer.clear(), + batch_completed_callback=lambda x: self._flush_mutation_buffer() ) else: self._mutation_buffer = None @@ -231,18 +238,13 @@ def _bigtable_get( self, key: bytes, no_key_translation=False ) -> Optional[bytes]: keys = [key] if no_key_translation else self._get_possible_bt_keys(key) - for bt_key in keys: - if ( - self._mutation_buffer is not None - and bt_key in self._mutation_buffer - ): - return self._mutation_buffer[key] - + if self._mutation_buffer is not None: + self._mutation_batcher.flush() for bt_key in keys: res = self.bt_table.read_row(bt_key, filter_=self.row_filter) - if res is None: - return None - return self.bigtable_exrtact_row_data(res) + if res is not None: + return self.bigtable_exrtact_row_data(res) + return None def _set_mutation( self, key: bytes, row: DirectRow, value: Optional[bytes] @@ -355,6 +357,9 @@ def _bigtable_iteritems(self, partitions): start_key=start_key, end_key=end_key ) + if self._mutation_buffer is not None: + self._mutation_batcher.flush() + offset_key_prefix = self.offset_key_prefix.encode() for row in self.bt_table.read_rows( row_set=row_set, filter_=self.row_filter @@ -363,16 +368,6 @@ def _bigtable_iteritems(self, partitions): if need_all_keys and offset_key_prefix in row.row_key: continue - if self._mutation_buffer is not None: - # Yield the mutation first if it exists - mutation_val = self._mutation_buffer.get(row.row_key, None) - if mutation_val is not None: - key = self._remove_partition_prefix_from_bigtable_key( - row.row_key - ) - yield key, mutation_val - continue - value = self.bigtable_exrtact_row_data(row) key = self._remove_partition_prefix_from_bigtable_key( row.row_key From dca46b4f3dcef13ae719dc8c880c2a6bdad9f39b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 14:14:22 +0200 Subject: [PATCH 521/616] updated startup cache also in apply changelogtopics --- faust/stores/bigtable.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f4f61c038..8c22d2dec 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -493,6 +493,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message + if not (self.table.is_global or self.table.use_partitioner): key = self._add_partition_prefix_to_key(msg.key, tp.partition) else: @@ -502,6 +503,8 @@ def apply_changelog_batch( self._bigtable_del(key, no_key_translation=True) else: self._bigtable_set(key, msg.value, no_key_translation=True) + if self._startup_cache is not None: + self._startup_cache[msg.key] = msg.value for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From 924137ff89fc7d5c501b117328e9fef233962d0a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 29 Sep 2023 14:21:08 +0200 Subject: [PATCH 522/616] =?UTF-8?q?removed=20mutation=20buffer=20=E2=9C=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- faust/stores/bigtable.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8c22d2dec..e82d92dd0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -96,29 +96,22 @@ def __init__( def default_translator(user_key): return user_key - def _flush_mutation_buffer(self): + def _on_mutation_batcher_flushed(self): self.log.info( - f"Flushing mutation buffer with size {len(self._mutation_buffer)}" - f" for table {self.table_name}" + f"Flushed mutation buffer for {self.table_name}" ) - self._mutation_buffer.clear() def _setup_mutation_batcher(self): # TODO - make this a configurable option # and use the MutationBatcher class of bt - self._mutation_buffer_enable = True - if self._mutation_buffer_enable: - self._mutation_buffer_size = 10_000 - self._mutation_buffer = {} - self._flush_interval = 300 # 5 minutes + self._mutation_batcher_enable = True + if self._mutation_batcher_enable: self._mutation_batcher = MutationsBatcher( self.bt_table, - flush_count=self._mutation_buffer_size, - flush_interval=self._flush_interval, - batch_completed_callback=lambda x: self._flush_mutation_buffer() + flush_count=10_000, + flush_interval=300, + batch_completed_callback=lambda x: self._on_mutation_batcher_flushed(), ) - else: - self._mutation_buffer = None def _setup_caches( self, @@ -238,7 +231,7 @@ def _bigtable_get( self, key: bytes, no_key_translation=False ) -> Optional[bytes]: keys = [key] if no_key_translation else self._get_possible_bt_keys(key) - if self._mutation_buffer is not None: + if self._mutation_batcher_enable: self._mutation_batcher.flush() for bt_key in keys: res = self.bt_table.read_row(bt_key, filter_=self.row_filter) @@ -249,7 +242,6 @@ def _bigtable_get( def _set_mutation( self, key: bytes, row: DirectRow, value: Optional[bytes] ): - self._mutation_buffer[key] = value self._mutation_batcher.mutate(row) def _bigtable_del(self, key: bytes, no_key_translation=False): @@ -264,7 +256,7 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): for key in keys: row = self.bt_table.direct_row(key) row.delete() - if self._mutation_buffer is not None: + if self._mutation_batcher_enable: self._set_mutation(key, row, None) else: row.commit() @@ -287,7 +279,7 @@ def _bigtable_set( value, ) - if self._mutation_buffer is not None: + if self._mutation_batcher_enable: self._set_mutation(key, row, value) else: row.commit() @@ -357,7 +349,7 @@ def _bigtable_iteritems(self, partitions): start_key=start_key, end_key=end_key ) - if self._mutation_buffer is not None: + if self._mutation_batcher_enable: self._mutation_batcher.flush() offset_key_prefix = self.offset_key_prefix.encode() From c5c629312c9f8ae8d38eb22e146cdaea61638a8f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 9 Oct 2023 15:15:44 +0200 Subject: [PATCH 523/616] made fields configurable --- faust/stores/bigtable.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e82d92dd0..0e7698dad 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -63,16 +63,19 @@ class BigTableStore(base.SerializedStore): client: BT.Client instance: BT.Instance bt_table: BT.Table - _startup_cache: Optional[Dict[bytes, bytes]] - _key_cache: Optional[Set[bytes]] BT_COLUMN_NAME_KEY = "bt_column_name_key" BT_INSTANCE_KEY = "bt_instance_key" BT_OFFSET_KEY_PREFIX = "bt_offset_key_prefix" BT_PROJECT_KEY = "bt_project_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" - BT_VALUE_CACHE_ENABLE_KEY = "bt_value_cache_enable_key" - BT_MAX_MUTATIONS_PER_FLUSH_KEY = "bt_max_mutations_per_flush_key" + BT_STARTUP_CACHE_ENABLE_KEY = "bt_startup_cache_enable_key" + BT_KEY_CACHE_ENABLE_KEY = "bt_key_cache_enable_key" + BT_MUTATION_BATCHER_ENABLE_KEY = "bt_mutation_batcher_enable_key" + BT_MUTATION_BATCHER_FLUSH_COUNT_KEY = "bt_mutation_batcher_flush_count_key" + BT_MUTATION_BATCHER_FLUSH_INTERVAL_KEY = ( + "bt_mutation_batcher_flush_interval_key" + ) def __init__( self, @@ -101,30 +104,39 @@ def _on_mutation_batcher_flushed(self): f"Flushed mutation buffer for {self.table_name}" ) - def _setup_mutation_batcher(self): - # TODO - make this a configurable option - # and use the MutationBatcher class of bt - self._mutation_batcher_enable = True + def _setup_mutation_batcher(self, options): + self._mutation_batcher_enable = options.get( + BigTableStore.BT_MUTATION_BATCHER_ENABLE_KEY, False + ) if self._mutation_batcher_enable: + flush_count = options.get( + BigTableStore.BT_MUTATION_BATCHER_FLUSH_COUNT_KEY, 10_000 + ) + flush_interval = options.get( + BigTableStore.BT_MUTATION_BATCHER_FLUSH_INTERVAL_KEY, 300 + ) self._mutation_batcher = MutationsBatcher( self.bt_table, - flush_count=10_000, - flush_interval=300, + flush_count=flush_count, + flush_interval=flush_interval, batch_completed_callback=lambda x: self._on_mutation_batcher_flushed(), ) def _setup_caches( self, + options: Dict[str, Any] = None, ): - # TODO - make this a configurable option - self._startup_cache_enable = True + self._startup_cache_enable = options.get( + BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY, False + ) if self._startup_cache_enable: self._startup_cache: Dict[bytes, bytes] = {} else: self._startup_cache = None - # TODO - make this a configurable option - self._key_cache_enable = True + self._key_cache_enable = options.get( + BigTableStore.BT_KEY_CACHE_ENABLE_KEY, False + ) if self._key_cache_enable: self._key_cache: Set[bytes] = set() else: From e0a1fc4309cb0dbae1d7826ef7496bf7a5e25181 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 9 Oct 2023 15:19:54 +0200 Subject: [PATCH 524/616] fixed some tests for configurations --- tests/unit/stores/test_bigtable.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index dfa1c489e..ebff4835d 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -171,7 +171,7 @@ def from_bt_key(key): assert self_mock.table_name_generator == name_lambda @pytest.mark.asyncio - async def test_bigtable_setup(self, bt_imports): + async def test_setup_bigtable(self, bt_imports): self_mock = MagicMock() faust_table_mock = MagicMock() @@ -199,7 +199,7 @@ def table_name_gen(table): options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" options[BigTableStore.BT_PROJECT_KEY] = "bt_project" - return_value = BigTableStore._bigtable_setup( + return_value = BigTableStore._setup_bigtable( self_mock, faust_table_mock, options ) bt_imports.Client.assert_called_once_with( @@ -220,7 +220,7 @@ def table_name_gen(table): faust_table_mock ) table_mock.exists = MagicMock(return_value=False) - return_value = BigTableStore._bigtable_setup( + return_value = BigTableStore._setup_bigtable( self_mock, faust_table_mock, options ) instance_mock.table.assert_called_once_with(self_mock.bt_table_name) @@ -235,7 +235,7 @@ def store(self, bt_imports): options = {} options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" options[BigTableStore.BT_PROJECT_KEY] = "bt_project" - options[BigTableStore.BT_VALUE_CACHE_ENABLE_KEY] = True + options[BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY] = True store = BigTableStore( "bigtable://", MagicMock(), MagicMock(), options=options ) From b3dd33d1560aed6c7d88f0fbc39aa050ac199786 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 9 Oct 2023 15:21:31 +0200 Subject: [PATCH 525/616] changed flush log --- faust/stores/bigtable.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0e7698dad..71be5ed70 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -88,8 +88,8 @@ def __init__( self._set_options(options) try: self._setup_bigtable(table, options) - self._setup_caches() - self._setup_mutation_batcher() + self._setup_caches(options) + self._setup_mutation_batcher(options) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -99,9 +99,9 @@ def __init__( def default_translator(user_key): return user_key - def _on_mutation_batcher_flushed(self): + def _on_mutation_batcher_flushed(self, status): self.log.info( - f"Flushed mutation buffer for {self.table_name}" + f"Flushed {len(status)} mutations for {self.table_name}" ) def _setup_mutation_batcher(self, options): @@ -119,7 +119,7 @@ def _setup_mutation_batcher(self, options): self.bt_table, flush_count=flush_count, flush_interval=flush_interval, - batch_completed_callback=lambda x: self._on_mutation_batcher_flushed(), + batch_completed_callback=lambda x: self._on_mutation_batcher_flushed(x), ) def _setup_caches( From ec7550c33a04352d7899cbd656a373f00ec22df8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 10 Oct 2023 14:15:36 +0200 Subject: [PATCH 526/616] updated key and startup cache correct on startup --- faust/stores/bigtable.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 71be5ed70..9ea05c881 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -505,10 +505,16 @@ def apply_changelog_batch( if msg.value is None: self._bigtable_del(key, no_key_translation=True) + if self._startup_cache is not None: + self._startup_cache.pop(msg.key, None) + if self._key_cache is not None: + self._key_cache.discard(msg.key) else: self._bigtable_set(key, msg.value, no_key_translation=True) if self._startup_cache is not None: self._startup_cache[msg.key] = msg.value + if self._key_cache is not None: + self._key_cache.add(msg.key) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From ed32cc985ebfee2ea667d991070cc5a05353776a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 10 Oct 2023 14:46:41 +0200 Subject: [PATCH 527/616] check caches in _contains --- faust/stores/bigtable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9ea05c881..6763ed4a4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -415,6 +415,10 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True + if self._key_cache is not None and key in self._key_cache: + return True + if self._startup_cache is not None and key in self._startup_cache: + return True # We don't want to invalidate the cache here # because it is very likely that we will need the value soon return self._get(key, invalidate_cache=False) is not None From 383594a5cc38207c3fcc10959f79b4cb1e4fca36 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 10 Oct 2023 15:34:11 +0200 Subject: [PATCH 528/616] added mutation cache --- faust/stores/bigtable.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6763ed4a4..9f7ba355f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -103,6 +103,7 @@ def _on_mutation_batcher_flushed(self, status): self.log.info( f"Flushed {len(status)} mutations for {self.table_name}" ) + self._mutation_cache.clear() def _setup_mutation_batcher(self, options): self._mutation_batcher_enable = options.get( @@ -121,6 +122,7 @@ def _setup_mutation_batcher(self, options): flush_interval=flush_interval, batch_completed_callback=lambda x: self._on_mutation_batcher_flushed(x), ) + self._mutation_cache: Dict[bytes, Union[bytes, None]] = {} def _setup_caches( self, @@ -244,6 +246,10 @@ def _bigtable_get( ) -> Optional[bytes]: keys = [key] if no_key_translation else self._get_possible_bt_keys(key) if self._mutation_batcher_enable: + for bt_key in keys: + value = self._get_mutation(bt_key) + if value is not None: + return value self._mutation_batcher.flush() for bt_key in keys: res = self.bt_table.read_row(bt_key, filter_=self.row_filter) @@ -255,6 +261,10 @@ def _set_mutation( self, key: bytes, row: DirectRow, value: Optional[bytes] ): self._mutation_batcher.mutate(row) + self._mutation_cache[key] = value + + def _get_mutation(self, key: bytes) -> Optional[bytes]: + return self._mutation_cache.get(key, None) def _bigtable_del(self, key: bytes, no_key_translation=False): if no_key_translation: From 0f370bbb034731da4e5ea36149e2e992bd4d7448 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 10 Oct 2023 15:51:20 +0200 Subject: [PATCH 529/616] Revert "added mutation cache" This reverts commit 383594a5cc38207c3fcc10959f79b4cb1e4fca36. --- faust/stores/bigtable.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9f7ba355f..6763ed4a4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -103,7 +103,6 @@ def _on_mutation_batcher_flushed(self, status): self.log.info( f"Flushed {len(status)} mutations for {self.table_name}" ) - self._mutation_cache.clear() def _setup_mutation_batcher(self, options): self._mutation_batcher_enable = options.get( @@ -122,7 +121,6 @@ def _setup_mutation_batcher(self, options): flush_interval=flush_interval, batch_completed_callback=lambda x: self._on_mutation_batcher_flushed(x), ) - self._mutation_cache: Dict[bytes, Union[bytes, None]] = {} def _setup_caches( self, @@ -246,10 +244,6 @@ def _bigtable_get( ) -> Optional[bytes]: keys = [key] if no_key_translation else self._get_possible_bt_keys(key) if self._mutation_batcher_enable: - for bt_key in keys: - value = self._get_mutation(bt_key) - if value is not None: - return value self._mutation_batcher.flush() for bt_key in keys: res = self.bt_table.read_row(bt_key, filter_=self.row_filter) @@ -261,10 +255,6 @@ def _set_mutation( self, key: bytes, row: DirectRow, value: Optional[bytes] ): self._mutation_batcher.mutate(row) - self._mutation_cache[key] = value - - def _get_mutation(self, key: bytes) -> Optional[bytes]: - return self._mutation_cache.get(key, None) def _bigtable_del(self, key: bytes, no_key_translation=False): if no_key_translation: From 6affe29ba64dc9ddb33a5dc94c0db77ac7c76134 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 11 Oct 2023 10:51:14 +0200 Subject: [PATCH 530/616] invalidate startup cache on set --- faust/stores/bigtable.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6763ed4a4..9ac5966a7 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -313,6 +313,9 @@ def _get(self, key: bytes, invalidate_cache=True) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: + if self._startup_cache is not None: + # We want to invalidate the cache here + self._startup_cache.pop(key, None) if self._key_cache is not None: self._key_cache.add(key) self._bigtable_set(key, value) From 412004e97f3746d813efafa61705d104e7ba4eb0 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 11 Oct 2023 12:28:37 +0200 Subject: [PATCH 531/616] removed checks in _contains --- faust/stores/bigtable.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9ac5966a7..6f33a7f18 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -418,10 +418,6 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - if self._key_cache is not None and key in self._key_cache: - return True - if self._startup_cache is not None and key in self._startup_cache: - return True # We don't want to invalidate the cache here # because it is very likely that we will need the value soon return self._get(key, invalidate_cache=False) is not None From 1189e9bd487839cf2403bfb9f999746f0ddfcb14 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 11 Oct 2023 13:53:00 +0200 Subject: [PATCH 532/616] implemented new caching logic --- faust/stores/bigtable.py | 77 +++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6f33a7f18..57127a473 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,6 +1,7 @@ """BigTable storage.""" import logging import time +import threading import traceback from typing import ( Any, @@ -131,6 +132,10 @@ def _setup_caches( ) if self._startup_cache_enable: self._startup_cache: Dict[bytes, bytes] = {} + # Invalidate startup cache after 30 minutes + self._invalidation_timer = threading.Timer( + 30*60, self._invalidate_startup_cache + ) else: self._startup_cache = None @@ -296,14 +301,39 @@ def _bigtable_set( else: row.commit() - def _get(self, key: bytes, invalidate_cache=True) -> Optional[bytes]: + def _del_cache(self, key: bytes): + if self._startup_cache is not None: + self._startup_cache.pop(key, None) + if self._key_cache is not None: + self._key_cache.discard(key) + + def _set_cache(self, key: bytes, value): + if self._startup_cache is not None: + self._startup_cache[key] = value + if self._key_cache is not None: + self._key_cache.add(key) + + def _get_cache(self, key: bytes): + if self._startup_cache is not None: + if key in self._startup_cache: + return self._startup_cache[key], True + if self._key_cache is not None: + if key not in self._key_cache: + return None, True + return None, False + + def _invalidate_startup_cache(self): + if self._startup_cache is not None: + self._startup_cache.clear() + self._startup_cache = None + gc.collect() + self._invalidation_timer.cancel() + + def _get(self, key: bytes) -> Optional[bytes]: try: - if self._startup_cache is not None: - if key in self._startup_cache: - if invalidate_cache: - return self._startup_cache.pop(key) - else: - return self._startup_cache[key] + value, found = self._get_cache(key) + if found: + return value return self._bigtable_get(key) except Exception as ex: self.log.error( @@ -313,11 +343,7 @@ def _get(self, key: bytes, invalidate_cache=True) -> Optional[bytes]: def _set(self, key: bytes, value: Optional[bytes]) -> None: try: - if self._startup_cache is not None: - # We want to invalidate the cache here - self._startup_cache.pop(key, None) - if self._key_cache is not None: - self._key_cache.add(key) + self._set_cache(key, value) self._bigtable_set(key, value) except Exception as ex: self.log.error( @@ -329,10 +355,7 @@ def _set(self, key: bytes, value: Optional[bytes]) -> None: def _del(self, key: bytes) -> None: try: - if self._startup_cache is not None: - self._startup_cache.pop(key, None) - if self._key_cache is not None: - self._key_cache.discard(key) + self._del_cache(key) self._bigtable_del(key) except Exception as ex: self.log.error( @@ -418,10 +441,7 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - # We don't want to invalidate the cache here - # because it is very likely that we will need the value soon - return self._get(key, invalidate_cache=False) is not None - + return self._get(key) is not None except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " @@ -508,16 +528,10 @@ def apply_changelog_batch( if msg.value is None: self._bigtable_del(key, no_key_translation=True) - if self._startup_cache is not None: - self._startup_cache.pop(msg.key, None) - if self._key_cache is not None: - self._key_cache.discard(msg.key) + self._del_cache(key) else: self._bigtable_set(key, msg.value, no_key_translation=True) - if self._startup_cache is not None: - self._startup_cache[msg.key] = msg.value - if self._key_cache is not None: - self._key_cache.add(msg.key) + self._set_cache(key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) @@ -548,10 +562,9 @@ def restore_backup( def _fill_caches(self, partitions): for k, v in self._bigtable_iteritems(partitions=partitions): - if self._startup_cache is not None: - self._startup_cache[k] = v - if self._key_cache is not None: - self._key_cache.add(k) + self._set_cache(k, v) + if self._statup_cache is not None: + self._invalidation_timer.start() def _get_active_changelogtopic_partitions( self, table: CollectionT, tps: Set[TP] From 35e58e58f46f65566bdb6cb38f3fc8f2308bbc16 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 11 Oct 2023 14:16:59 +0200 Subject: [PATCH 533/616] deleting the startup cache is only done by the invalidation timer --- faust/stores/bigtable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 57127a473..600dfc16d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -248,8 +248,10 @@ def _bigtable_get( self, key: bytes, no_key_translation=False ) -> Optional[bytes]: keys = [key] if no_key_translation else self._get_possible_bt_keys(key) + if self._mutation_batcher_enable: self._mutation_batcher.flush() + for bt_key in keys: res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is not None: @@ -303,7 +305,7 @@ def _bigtable_set( def _del_cache(self, key: bytes): if self._startup_cache is not None: - self._startup_cache.pop(key, None) + self._startup_cache[key] = None if self._key_cache is not None: self._key_cache.discard(key) From 2b772b99ee1b6fb14206530a214934085d348325 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 11 Oct 2023 15:40:47 +0200 Subject: [PATCH 534/616] added logging for invalidating cache --- faust/stores/bigtable.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 600dfc16d..5def509bc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,4 +1,5 @@ """BigTable storage.""" +import gc import logging import time import threading @@ -100,11 +101,6 @@ def __init__( def default_translator(user_key): return user_key - def _on_mutation_batcher_flushed(self, status): - self.log.info( - f"Flushed {len(status)} mutations for {self.table_name}" - ) - def _setup_mutation_batcher(self, options): self._mutation_batcher_enable = options.get( BigTableStore.BT_MUTATION_BATCHER_ENABLE_KEY, False @@ -120,7 +116,9 @@ def _setup_mutation_batcher(self, options): self.bt_table, flush_count=flush_count, flush_interval=flush_interval, - batch_completed_callback=lambda x: self._on_mutation_batcher_flushed(x), + batch_completed_callback=lambda x: self._on_mutation_batcher_flushed( + x + ), ) def _setup_caches( @@ -134,7 +132,7 @@ def _setup_caches( self._startup_cache: Dict[bytes, bytes] = {} # Invalidate startup cache after 30 minutes self._invalidation_timer = threading.Timer( - 30*60, self._invalidate_startup_cache + 30 * 60, self._invalidate_startup_cache ) else: self._startup_cache = None @@ -258,11 +256,6 @@ def _bigtable_get( return self.bigtable_exrtact_row_data(res) return None - def _set_mutation( - self, key: bytes, row: DirectRow, value: Optional[bytes] - ): - self._mutation_batcher.mutate(row) - def _bigtable_del(self, key: bytes, no_key_translation=False): if no_key_translation: keys = [key] @@ -276,7 +269,7 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): row = self.bt_table.direct_row(key) row.delete() if self._mutation_batcher_enable: - self._set_mutation(key, row, None) + self._set_mutation(row) else: row.commit() @@ -299,7 +292,7 @@ def _bigtable_set( ) if self._mutation_batcher_enable: - self._set_mutation(key, row, value) + self._set_mutation(row) else: row.commit() @@ -329,13 +322,22 @@ def _invalidate_startup_cache(self): self._startup_cache.clear() self._startup_cache = None gc.collect() + self.log.info( + f"Invalidated startup cache for table {self.table_name}" + ) self._invalidation_timer.cancel() + def _on_mutation_batcher_flushed(self, status): + self.log.info(f"Flushed {len(status)} mutations for {self.table_name}") + + def _set_mutation(self, mutated_row: DirectRow): + self._mutation_batcher.mutate(mutated_row) + def _get(self, key: bytes) -> Optional[bytes]: try: - value, found = self._get_cache(key) - if found: - return value + # value, found = self._get_cache(key) + # if found: + # return value return self._bigtable_get(key) except Exception as ex: self.log.error( From a9a7d8996478185b443804638a5eb310713065e9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 12:06:42 +0200 Subject: [PATCH 535/616] add logging --- faust/stores/bigtable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5def509bc..f4dbc3e3a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -231,6 +231,10 @@ def _get_current_partitions(self) -> Iterable[Optional[int]]: if event is not None: partition = event.message.partition return [partition] + self.log.info( + f"BigtableStore: _get_current_partitions: " + f"current_event is None for {self.table_name}" + ) return list(self._active_partitions()) def _get_possible_bt_keys(self, key: bytes) -> Iterable[bytes]: From f3b78837985d294ced332e5a2706de8efb4e5ab5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 13:20:43 +0200 Subject: [PATCH 536/616] added more logging --- faust/stores/bigtable.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f4dbc3e3a..668ff3b45 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -230,12 +230,15 @@ def _get_current_partitions(self) -> Iterable[Optional[int]]: event = current_event() if event is not None: partition = event.message.partition + return [partition] + partitions = list(self._active_partitions()) self.log.info( f"BigtableStore: _get_current_partitions: " - f"current_event is None for {self.table_name}" + f"current_event is None for {self.table_name} " + f"with {partitions=}" ) - return list(self._active_partitions()) + return partitions def _get_possible_bt_keys(self, key: bytes) -> Iterable[bytes]: partitions = self._get_current_partitions() @@ -258,6 +261,10 @@ def _bigtable_get( res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is not None: return self.bigtable_exrtact_row_data(res) + self.log.info( + f"BigtableStore: _bigtable_get: " + f"no row found for {self.table_name} " + f"for key {key} with {keys=}" return None def _bigtable_del(self, key: bytes, no_key_translation=False): From a6290e5f03db02e4f8146993501e260b28b55cea Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 13:29:25 +0200 Subject: [PATCH 537/616] fixed syntax error --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 668ff3b45..6add7ae68 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -265,6 +265,7 @@ def _bigtable_get( f"BigtableStore: _bigtable_get: " f"no row found for {self.table_name} " f"for key {key} with {keys=}" + ) return None def _bigtable_del(self, key: bytes, no_key_translation=False): From 8f1c84a1c2e492b73b340ec17104b59c4efe5903 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 13:51:53 +0200 Subject: [PATCH 538/616] fixed logging --- faust/stores/bigtable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6add7ae68..1bec5c186 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -242,8 +242,10 @@ def _get_current_partitions(self) -> Iterable[Optional[int]]: def _get_possible_bt_keys(self, key: bytes) -> Iterable[bytes]: partitions = self._get_current_partitions() + bt_keys = [] for partition in partitions: - yield self._add_partition_prefix_to_key(key, partition) + bt_keys.append(self._add_partition_prefix_to_key(key, partition)) + return bt_keys @staticmethod def bigtable_exrtact_row_data(row_data): From 50a52ed2be2fe567720877c6d574eec455f25ef8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 14:53:35 +0200 Subject: [PATCH 539/616] if table is globa or uses partitioner, try all partitions --- faust/stores/bigtable.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1bec5c186..b45bf80d5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -225,19 +225,16 @@ def _get_all_possible_partitions(self) -> Iterable[Optional[int]]: return list(self._active_partitions()) def _get_current_partitions(self) -> Iterable[Optional[int]]: - if self.table.is_global or self.table.use_partitioner: - return [None] event = current_event() - if event is not None: + if ( + event is not None + and not self.table.is_global + and not self.table.use_partitioner + ): partition = event.message.partition return [partition] partitions = list(self._active_partitions()) - self.log.info( - f"BigtableStore: _get_current_partitions: " - f"current_event is None for {self.table_name} " - f"with {partitions=}" - ) return partitions def _get_possible_bt_keys(self, key: bytes) -> Iterable[bytes]: From 0c956fa11a05760dd7b84b17e8849709bb6c90b9 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 15:51:17 +0200 Subject: [PATCH 540/616] try differetn approach on _get_current_partitions --- faust/stores/bigtable.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b45bf80d5..4eb77620a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -209,20 +209,17 @@ def _get_partition_from_bigtable_key(self, key: bytes) -> int: partition_bytes, _ = key.rsplit(separator, 1) return int(partition_bytes) - def _active_partitions(self) -> Iterator[int]: + def _active_partitions(self) -> List[int]: actives = self.app.assignor.assigned_actives() topic = self.table.changelog_topic_name + partitions = [] for partition in range(self.app.conf.topic_partitions): tp = TP(topic=topic, partition=partition) # for global tables, keys from all # partitions are available. if tp in actives or self.table.is_global: - yield partition - - def _get_all_possible_partitions(self) -> Iterable[Optional[int]]: - if self.table.is_global or self.table.use_partitioner: - return [None] - return list(self._active_partitions()) + partitions.append(partition) + return partitions def _get_current_partitions(self) -> Iterable[Optional[int]]: event = current_event() @@ -232,10 +229,8 @@ def _get_current_partitions(self) -> Iterable[Optional[int]]: and not self.table.use_partitioner ): partition = event.message.partition - return [partition] - partitions = list(self._active_partitions()) - return partitions + return self._active_partitions() def _get_possible_bt_keys(self, key: bytes) -> Iterable[bytes]: partitions = self._get_current_partitions() @@ -271,7 +266,7 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): if no_key_translation: keys = [key] else: - partitions = self._get_all_possible_partitions() + partitions = self._active_partitions() keys = [ self._add_partition_prefix_to_key(key, p) for p in partitions ] @@ -384,7 +379,7 @@ def _bigtable_iteritems(self, partitions): try: start = time.time() if partitions is None: - partitions = list(self._active_partitions()) + partitions = self._active_partitions() row_set = RowSet() self.log.info( f"BigtableStore: Iterating over {len(partitions)} partitions " From 43c4cba54817cd1a661dfb0d2106a35dd76db9c6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 16:43:01 +0200 Subject: [PATCH 541/616] fixed wrong apply changelog batch --- faust/stores/bigtable.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4eb77620a..814ac190f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -530,18 +530,12 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - - if not (self.table.is_global or self.table.use_partitioner): - key = self._add_partition_prefix_to_key(msg.key, tp.partition) - else: - key = msg.key + key = msg.key if msg.value is None: - self._bigtable_del(key, no_key_translation=True) - self._del_cache(key) + self._del(key) else: - self._bigtable_set(key, msg.value, no_key_translation=True) - self._set_cache(key, msg.value) + self._set(key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From 10eea2058ffe4ec8cc18a129a053bd97e28217d7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 16:57:42 +0200 Subject: [PATCH 542/616] fixed apply changelog topics again --- faust/stores/bigtable.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 814ac190f..21702649b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -530,12 +530,14 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - key = msg.key + key = self._add_partition_prefix_to_key(msg.key, tp.partition) if msg.value is None: - self._del(key) + self._bigtable_del(key, no_key_translation=True) + self._del_cache(msg.key) else: - self._set(key, msg.value) + self._bigtable_set(key, msg.value, no_key_translation=True) + self._set_cache(msg.key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From 27f82debea9c93b15e5f1e7f0bcbefdc8c65de94 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 17:54:50 +0200 Subject: [PATCH 543/616] fixed error in set --- faust/stores/bigtable.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 21702649b..65a580511 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -282,13 +282,10 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): def _bigtable_set( self, key: bytes, value: bytes, no_key_translation=False ): - keys = ( - [key] - if no_key_translation - else list(self._get_possible_bt_keys(key)) - ) - assert len(keys) == 1 - key = keys[0] + event = current_event() + assert event is not None + partition = event.message.partition + key = self._add_partition_prefix_to_key(key, partition) row = self.bt_table.direct_row(key) row.set_cell( @@ -351,7 +348,7 @@ def _get(self, key: bytes) -> Optional[bytes]: ) raise ex - def _set(self, key: bytes, value: Optional[bytes]) -> None: + def _set(self, key: bytes, value: bytes) -> None: try: self._set_cache(key, value) self._bigtable_set(key, value) From 549a3b66d6f7cd7116874ad8fb6922d7abf17d40 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 17:57:13 +0200 Subject: [PATCH 544/616] use msg partition for adding data to table --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 65a580511..4a7947d8c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -527,7 +527,7 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - key = self._add_partition_prefix_to_key(msg.key, tp.partition) + key = self._add_partition_prefix_to_key(msg.key, msg.partition) if msg.value is None: self._bigtable_del(key, no_key_translation=True) From 67cfe9e1b844934909db9765261fb994d3429ffc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 18:01:39 +0200 Subject: [PATCH 545/616] fix set for no key translation --- faust/stores/bigtable.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4a7947d8c..4aa70a223 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -282,12 +282,13 @@ def _bigtable_del(self, key: bytes, no_key_translation=False): def _bigtable_set( self, key: bytes, value: bytes, no_key_translation=False ): - event = current_event() - assert event is not None - partition = event.message.partition - key = self._add_partition_prefix_to_key(key, partition) - row = self.bt_table.direct_row(key) + if not no_key_translation: + event = current_event() + assert event is not None + partition = event.message.partition + key = self._add_partition_prefix_to_key(key, partition) + row = self.bt_table.direct_row(key) row.set_cell( COLUMN_FAMILY_ID, COLUMN_NAME, From af84131c30c8b22fdfc24bdab5aeb1001a8680eb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 18:04:05 +0200 Subject: [PATCH 546/616] fix partitioning --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 4aa70a223..1afd76c2d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -531,11 +531,11 @@ def apply_changelog_batch( key = self._add_partition_prefix_to_key(msg.key, msg.partition) if msg.value is None: - self._bigtable_del(key, no_key_translation=True) self._del_cache(msg.key) + self._bigtable_del(key, no_key_translation=True) else: - self._bigtable_set(key, msg.value, no_key_translation=True) self._set_cache(msg.key, msg.value) + self._bigtable_set(key, msg.value, no_key_translation=True) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From 0f5f324da9f3b667411a8fc883446b0dcae4b63a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 12 Oct 2023 18:13:03 +0200 Subject: [PATCH 547/616] removed no row found log --- faust/stores/bigtable.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1afd76c2d..674c04564 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -255,11 +255,6 @@ def _bigtable_get( res = self.bt_table.read_row(bt_key, filter_=self.row_filter) if res is not None: return self.bigtable_exrtact_row_data(res) - self.log.info( - f"BigtableStore: _bigtable_get: " - f"no row found for {self.table_name} " - f"for key {key} with {keys=}" - ) return None def _bigtable_del(self, key: bytes, no_key_translation=False): From a63c754ed43d3658599c113a303f807d555cb5a2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 17 Oct 2023 13:21:46 +0200 Subject: [PATCH 548/616] changed set get delete and contains method --- faust/stores/bigtable.py | 140 +++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 79 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 674c04564..ef7deb5aa 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -206,8 +206,8 @@ def _remove_partition_prefix_from_bigtable_key(self, key: bytes) -> bytes: def _get_partition_from_bigtable_key(self, key: bytes) -> int: separator = b"_..._" - partition_bytes, _ = key.rsplit(separator, 1) - return int(partition_bytes) + partition_str, _ = key.rsplit(separator, 1) + return int(partition_str) def _active_partitions(self) -> List[int]: actives = self.app.assignor.assigned_actives() @@ -215,13 +215,12 @@ def _active_partitions(self) -> List[int]: partitions = [] for partition in range(self.app.conf.topic_partitions): tp = TP(topic=topic, partition=partition) - # for global tables, keys from all - # partitions are available. if tp in actives or self.table.is_global: partitions.append(partition) return partitions - def _get_current_partitions(self) -> Iterable[Optional[int]]: + + def _get_current_partitions(self) -> List[int]: event = current_event() if ( event is not None @@ -232,7 +231,8 @@ def _get_current_partitions(self) -> Iterable[Optional[int]]: return [partition] return self._active_partitions() - def _get_possible_bt_keys(self, key: bytes) -> Iterable[bytes]: + def _get_bigtable_keys(self, key: bytes) -> List[bytes]: + # TODO - Add key index here if needed partitions = self._get_current_partitions() bt_keys = [] for partition in partitions: @@ -243,58 +243,6 @@ def _get_possible_bt_keys(self, key: bytes) -> Iterable[bytes]: def bigtable_exrtact_row_data(row_data): return list(row_data.to_dict().values())[0][0].value - def _bigtable_get( - self, key: bytes, no_key_translation=False - ) -> Optional[bytes]: - keys = [key] if no_key_translation else self._get_possible_bt_keys(key) - - if self._mutation_batcher_enable: - self._mutation_batcher.flush() - - for bt_key in keys: - res = self.bt_table.read_row(bt_key, filter_=self.row_filter) - if res is not None: - return self.bigtable_exrtact_row_data(res) - return None - - def _bigtable_del(self, key: bytes, no_key_translation=False): - if no_key_translation: - keys = [key] - else: - partitions = self._active_partitions() - keys = [ - self._add_partition_prefix_to_key(key, p) for p in partitions - ] - - for key in keys: - row = self.bt_table.direct_row(key) - row.delete() - if self._mutation_batcher_enable: - self._set_mutation(row) - else: - row.commit() - - def _bigtable_set( - self, key: bytes, value: bytes, no_key_translation=False - ): - if not no_key_translation: - event = current_event() - assert event is not None - partition = event.message.partition - key = self._add_partition_prefix_to_key(key, partition) - - row = self.bt_table.direct_row(key) - row.set_cell( - COLUMN_FAMILY_ID, - COLUMN_NAME, - value, - ) - - if self._mutation_batcher_enable: - self._set_mutation(row) - else: - row.commit() - def _del_cache(self, key: bytes): if self._startup_cache is not None: self._startup_cache[key] = None @@ -332,21 +280,53 @@ def _on_mutation_batcher_flushed(self, status): def _set_mutation(self, mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) + def _bigtable_get(self, keys: List[bytes]) -> Optional[bytes]: + rowset = RowSet() + for key in keys: + value, found = self._get_cache(key) + if found: + return value + rowset.add_row_key(key) + + if self._mutation_batcher_enable: + self._mutation_batcher.flush() + + rows = self.bt_table.read_rows(row_set=rowset, filter_=self.row_filter) + for row in rows: + if row is not None: + return self.bigtable_exrtact_row_data(row) + return None + def _get(self, key: bytes) -> Optional[bytes]: try: - # value, found = self._get_cache(key) - # if found: - # return value - return self._bigtable_get(key) + keys = self._get_bigtable_keys(key) + return self._bigtable_get(keys) except Exception as ex: self.log.error( f"Error in get for table {self.table_name} exception {ex} key {key}" ) raise ex + def _bigtable_set(self, key: bytes, value: bytes): + self._set_cache(key, value) + row = self.bt_table.direct_row(key) + row.set_cell( + COLUMN_FAMILY_ID, + COLUMN_NAME, + value, + ) + + if self._mutation_batcher_enable: + self._set_mutation(row) + else: + row.commit() + def _set(self, key: bytes, value: bytes) -> None: try: - self._set_cache(key, value) + event = current_event() + assert event is not None + partition = event.message.partition + key = self._add_partition_prefix_to_key(key, partition) self._bigtable_set(key, value) except Exception as ex: self.log.error( @@ -356,10 +336,21 @@ def _set(self, key: bytes, value: bytes) -> None: ) raise ex + def _bigtable_del(self, key: bytes): + + row = self.bt_table.direct_row(key) + row.delete() + self._del_cache(key) + if self._mutation_batcher_enable: + self._set_mutation(row) + else: + row.commit() + def _del(self, key: bytes) -> None: try: - self._del_cache(key) - self._bigtable_del(key) + keys = self._get_bigtable_keys(key) + for key in keys: + self._bigtable_del(key) except Exception as ex: self.log.error( f"FaustBigtableException Error in del for " @@ -383,12 +374,7 @@ def _bigtable_iteritems(self, partitions): if not need_all_keys: for partition in partitions: prefix = self._add_partition_prefix_to_key(b"", partition) - start_key = prefix + b"\x00" - end_key = prefix + b"\xff" - - row_set.add_row_range_from_keys( - start_key=start_key, end_key=end_key - ) + row_set.add_row_range_with_prefix(prefix) if self._mutation_batcher_enable: self._mutation_batcher.flush() @@ -478,7 +464,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() - offset = self._bigtable_get(offset_key, no_key_translation=True) + offset = self._bigtable_get([offset_key]) return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: @@ -491,9 +477,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: """ try: offset_key = self.get_offset_key(tp).encode() - self._bigtable_set( - offset_key, str(offset).encode(), no_key_translation=True - ) + self._bigtable_set(offset_key, str(offset).encode()) except Exception: self.log.error( f"Failed to commit offset for {self.table.name}" @@ -526,11 +510,9 @@ def apply_changelog_batch( key = self._add_partition_prefix_to_key(msg.key, msg.partition) if msg.value is None: - self._del_cache(msg.key) - self._bigtable_del(key, no_key_translation=True) + self._bigtable_del(key) else: - self._set_cache(msg.key, msg.value) - self._bigtable_set(key, msg.value, no_key_translation=True) + self._bigtable_set(key, msg.value) for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From feb29373b4821471b30cb3f8f3e997a3001e1e72 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 17 Oct 2023 14:01:28 +0200 Subject: [PATCH 549/616] fixed error --- faust/stores/bigtable.py | 45 ++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ef7deb5aa..0a3400c8b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -16,6 +16,7 @@ Tuple, Union, ) +from mode.utils.collections import LRUCache try: # pragma: no cover from google.api_core.exceptions import AlreadyExists @@ -92,6 +93,9 @@ def __init__( self._setup_bigtable(table, options) self._setup_caches(options) self._setup_mutation_batcher(options) + key_index_size = app.conf.table_key_index_size + self.key_index_size = key_index_size + self._key_index = LRUCache(limit=self.key_index_size) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex @@ -145,6 +149,7 @@ def _setup_caches( else: self._key_cache = None + def _set_options(self, options) -> None: self._all_options = options self.table_name_generator = options.get( @@ -231,13 +236,10 @@ def _get_current_partitions(self) -> List[int]: return [partition] return self._active_partitions() - def _get_bigtable_keys(self, key: bytes) -> List[bytes]: - # TODO - Add key index here if needed - partitions = self._get_current_partitions() - bt_keys = [] - for partition in partitions: - bt_keys.append(self._add_partition_prefix_to_key(key, partition)) - return bt_keys + def _get_partitions_for_key(self, key: bytes) -> List[int]: + if key in self._key_index: + return [self._key_index[key]] + return self._get_current_partitions() @staticmethod def bigtable_exrtact_row_data(row_data): @@ -280,12 +282,13 @@ def _on_mutation_batcher_flushed(self, status): def _set_mutation(self, mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) - def _bigtable_get(self, keys: List[bytes]) -> Optional[bytes]: + def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[int]]: rowset = RowSet() for key in keys: value, found = self._get_cache(key) if found: - return value + partition = self._get_partition_from_bigtable_key(key) + return value, partition rowset.add_row_key(key) if self._mutation_batcher_enable: @@ -294,13 +297,18 @@ def _bigtable_get(self, keys: List[bytes]) -> Optional[bytes]: rows = self.bt_table.read_rows(row_set=rowset, filter_=self.row_filter) for row in rows: if row is not None: - return self.bigtable_exrtact_row_data(row) - return None + partition = self._get_partition_from_bigtable_key(row.row_key) + return self.bigtable_exrtact_row_data(row), partition + return None, None def _get(self, key: bytes) -> Optional[bytes]: try: - keys = self._get_bigtable_keys(key) - return self._bigtable_get(keys) + partitions = self._get_partitions_for_key(key) + keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] + value, partition = self._bigtable_get(keys) + if value is not None: + self._key_index[key] = partition + return value except Exception as ex: self.log.error( f"Error in get for table {self.table_name} exception {ex} key {key}" @@ -328,6 +336,7 @@ def _set(self, key: bytes, value: bytes) -> None: partition = event.message.partition key = self._add_partition_prefix_to_key(key, partition) self._bigtable_set(key, value) + self._key_index[key] = partition except Exception as ex: self.log.error( f"FaustBigtableException Error in set for " @@ -337,7 +346,6 @@ def _set(self, key: bytes, value: bytes) -> None: raise ex def _bigtable_del(self, key: bytes): - row = self.bt_table.direct_row(key) row.delete() self._del_cache(key) @@ -348,8 +356,9 @@ def _bigtable_del(self, key: bytes): def _del(self, key: bytes) -> None: try: - keys = self._get_bigtable_keys(key) - for key in keys: + partitions = self._get_partitions_for_key(key) + for partition in partitions: + key = self._add_partition_prefix_to_key(key, partition) self._bigtable_del(key) except Exception as ex: self.log.error( @@ -373,7 +382,7 @@ def _bigtable_iteritems(self, partitions): need_all_keys = self.table.is_global or self.table.use_partitioner if not need_all_keys: for partition in partitions: - prefix = self._add_partition_prefix_to_key(b"", partition) + prefix = self._add_partition_prefix_to_key(b"", partition).decode() row_set.add_row_range_with_prefix(prefix) if self._mutation_batcher_enable: @@ -464,7 +473,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() - offset = self._bigtable_get([offset_key]) + offset, _ = self._bigtable_get([offset_key]) return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: From 33c30451260704be732c6f05372723910ff2ca10 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 17 Oct 2023 14:17:05 +0200 Subject: [PATCH 550/616] fixed wrong offset key get --- faust/stores/bigtable.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0a3400c8b..c1ac9c378 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -282,12 +282,15 @@ def _on_mutation_batcher_flushed(self, status): def _set_mutation(self, mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) - def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[int]]: + def _bigtable_get(self, keys: List[bytes], is_offset_key=False) -> Tuple[Optional[bytes], Optional[int]]: rowset = RowSet() for key in keys: value, found = self._get_cache(key) if found: - partition = self._get_partition_from_bigtable_key(key) + if is_offset_key: + partition = None + else: + partition = self._get_partition_from_bigtable_key(key) return value, partition rowset.add_row_key(key) @@ -297,7 +300,10 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in rows = self.bt_table.read_rows(row_set=rowset, filter_=self.row_filter) for row in rows: if row is not None: - partition = self._get_partition_from_bigtable_key(row.row_key) + if is_offset_key: + partition = None + else: + partition = self._get_partition_from_bigtable_key(row.row_key) return self.bigtable_exrtact_row_data(row), partition return None, None @@ -473,7 +479,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() - offset, _ = self._bigtable_get([offset_key]) + offset, _ = self._bigtable_get([offset_key], is_offset_key=True) return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: From b4445464f03a04b186900eae95b6145ab3cef695 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 17 Oct 2023 14:20:11 +0200 Subject: [PATCH 551/616] get offset from bt always --- faust/stores/bigtable.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c1ac9c378..b1112c088 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -287,10 +287,7 @@ def _bigtable_get(self, keys: List[bytes], is_offset_key=False) -> Tuple[Optiona for key in keys: value, found = self._get_cache(key) if found: - if is_offset_key: - partition = None - else: - partition = self._get_partition_from_bigtable_key(key) + partition = self._get_partition_from_bigtable_key(key) return value, partition rowset.add_row_key(key) @@ -300,10 +297,7 @@ def _bigtable_get(self, keys: List[bytes], is_offset_key=False) -> Tuple[Optiona rows = self.bt_table.read_rows(row_set=rowset, filter_=self.row_filter) for row in rows: if row is not None: - if is_offset_key: - partition = None - else: - partition = self._get_partition_from_bigtable_key(row.row_key) + partition = self._get_partition_from_bigtable_key(row.row_key) return self.bigtable_exrtact_row_data(row), partition return None, None @@ -479,7 +473,10 @@ def persisted_offset(self, tp: TP) -> Optional[int]: See :meth:`set_persisted_offset`. """ offset_key = self.get_offset_key(tp).encode() - offset, _ = self._bigtable_get([offset_key], is_offset_key=True) + if self._mutation_batcher_enable: + self._mutation_batcher.flush() + row = self.bt_table.read_row(offset_key, filter_=self.row_filter) + offset = self.bigtable_exrtact_row_data(row) if row is not None else None return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: From 692fd1b85e75bec9c0d7166b9b4f02a1bee8bcdc Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 17 Oct 2023 15:13:34 +0200 Subject: [PATCH 552/616] added logging msg for missed key --- faust/stores/bigtable.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b1112c088..b24d186f0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -282,7 +282,7 @@ def _on_mutation_batcher_flushed(self, status): def _set_mutation(self, mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) - def _bigtable_get(self, keys: List[bytes], is_offset_key=False) -> Tuple[Optional[bytes], Optional[int]]: + def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[int]]: rowset = RowSet() for key in keys: value, found = self._get_cache(key) @@ -299,6 +299,10 @@ def _bigtable_get(self, keys: List[bytes], is_offset_key=False) -> Tuple[Optiona if row is not None: partition = self._get_partition_from_bigtable_key(row.row_key) return self.bigtable_exrtact_row_data(row), partition + self.log.info( + "BigTableStore: No data found for keys " + f"{keys} in table {self.table_name}" + ) return None, None def _get(self, key: bytes) -> Optional[bytes]: From ccc5226f9390b9ebb8994968744620eed039a9bb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 08:56:03 +0200 Subject: [PATCH 553/616] removed unused function --- faust/stores/bigtable.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b24d186f0..a55d1ffdf 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -50,12 +50,6 @@ class BT: from faust.types import TP, AppT, CollectionT, EventT -def get_current_partition(): - event = current_event() - assert event is not None - return event.message.partition - - COLUMN_FAMILY_ID = "FaustColumnFamily" COLUMN_NAME = "DATA" From d59cf705ef7e4dd36094e73442d7924d5398eb33 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 09:05:57 +0200 Subject: [PATCH 554/616] moved log --- faust/stores/bigtable.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a55d1ffdf..90bbd821e 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -293,10 +293,6 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in if row is not None: partition = self._get_partition_from_bigtable_key(row.row_key) return self.bigtable_exrtact_row_data(row), partition - self.log.info( - "BigTableStore: No data found for keys " - f"{keys} in table {self.table_name}" - ) return None, None def _get(self, key: bytes) -> Optional[bytes]: @@ -306,6 +302,11 @@ def _get(self, key: bytes) -> Optional[bytes]: value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition + else: + self.log.info( + "BigTableStore: No data found for keys " + f"{keys} in table {self.table_name}" + ) return value except Exception as ex: self.log.error( @@ -437,7 +438,13 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - return self._get(key) is not None + partitions = self._get_partitions_for_key(key) + keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] + value, partition = self._bigtable_get(keys) + found = value is not None + if found: + self._key_index[key] = partition + return found except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " From 4ebaafe695c1134b8bc47c06c2b126802458b00f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 09:16:53 +0200 Subject: [PATCH 555/616] call get in contains --- faust/stores/bigtable.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 90bbd821e..729cdc55c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -438,13 +438,7 @@ def _contains(self, key: bytes) -> bool: try: if not self.app.conf.store_check_exists: return True - partitions = self._get_partitions_for_key(key) - keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] - value, partition = self._bigtable_get(keys) - found = value is not None - if found: - self._key_index[key] = partition - return found + return self._get(key) is not None except Exception as ex: self.log.error( f"FaustBigtableException Error in _contains for table " From 1818c442ad4bac8e210a42060ef745f36b7347e3 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 09:57:48 +0200 Subject: [PATCH 556/616] try different logging REVERT ME --- faust/stores/bigtable.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 729cdc55c..f5a42c7f9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -297,16 +297,39 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in def _get(self, key: bytes) -> Optional[bytes]: try: - partitions = self._get_partitions_for_key(key) + event = current_event() + if key in self._key_index: + partitions = [self._key_index[key]] + else: + if ( + event is not None + and not self.table.is_global + and not self.table.use_partitioner + ): + partition = event.message.partition + partitions = [partition] + else: + actives = self.app.assignor.assigned_actives() + topic = self.table.changelog_topic_name + partitions = [] + for partition in range(self.app.conf.topic_partitions): + tp = TP(topic=topic, partition=partition) + if tp in actives or self.table.is_global: + partitions.append(partition) keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition else: - self.log.info( - "BigTableStore: No data found for keys " - f"{keys} in table {self.table_name}" - ) + if event is not None: + self.log.info( + "BigTableStore: No data found for " + f"key {key} " + f"event.partition {event.message.partition} " + f"event.key {event.key}" + f"bt_keys {keys} " + f"in table {self.table_name} " + ) return value except Exception as ex: self.log.error( From b8ef8a203bd8226451fad8ef7ce999e973cfc41c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 10:11:06 +0200 Subject: [PATCH 557/616] Revert "try different logging REVERT ME" This reverts commit 1818c442ad4bac8e210a42060ef745f36b7347e3. --- faust/stores/bigtable.py | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f5a42c7f9..729cdc55c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -297,39 +297,16 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in def _get(self, key: bytes) -> Optional[bytes]: try: - event = current_event() - if key in self._key_index: - partitions = [self._key_index[key]] - else: - if ( - event is not None - and not self.table.is_global - and not self.table.use_partitioner - ): - partition = event.message.partition - partitions = [partition] - else: - actives = self.app.assignor.assigned_actives() - topic = self.table.changelog_topic_name - partitions = [] - for partition in range(self.app.conf.topic_partitions): - tp = TP(topic=topic, partition=partition) - if tp in actives or self.table.is_global: - partitions.append(partition) + partitions = self._get_partitions_for_key(key) keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition else: - if event is not None: - self.log.info( - "BigTableStore: No data found for " - f"key {key} " - f"event.partition {event.message.partition} " - f"event.key {event.key}" - f"bt_keys {keys} " - f"in table {self.table_name} " - ) + self.log.info( + "BigTableStore: No data found for keys " + f"{keys} in table {self.table_name}" + ) return value except Exception as ex: self.log.error( From 5405ec79cf04a27d9e250793160a38ab57da371b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 10:23:31 +0200 Subject: [PATCH 558/616] Revert "Revert "try different logging REVERT ME"" This reverts commit b8ef8a203bd8226451fad8ef7ce999e973cfc41c. --- faust/stores/bigtable.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 729cdc55c..f5a42c7f9 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -297,16 +297,39 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in def _get(self, key: bytes) -> Optional[bytes]: try: - partitions = self._get_partitions_for_key(key) + event = current_event() + if key in self._key_index: + partitions = [self._key_index[key]] + else: + if ( + event is not None + and not self.table.is_global + and not self.table.use_partitioner + ): + partition = event.message.partition + partitions = [partition] + else: + actives = self.app.assignor.assigned_actives() + topic = self.table.changelog_topic_name + partitions = [] + for partition in range(self.app.conf.topic_partitions): + tp = TP(topic=topic, partition=partition) + if tp in actives or self.table.is_global: + partitions.append(partition) keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition else: - self.log.info( - "BigTableStore: No data found for keys " - f"{keys} in table {self.table_name}" - ) + if event is not None: + self.log.info( + "BigTableStore: No data found for " + f"key {key} " + f"event.partition {event.message.partition} " + f"event.key {event.key}" + f"bt_keys {keys} " + f"in table {self.table_name} " + ) return value except Exception as ex: self.log.error( From 6590c689e6b78868e5f0c8221021032d4ab4fe01 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 10:25:45 +0200 Subject: [PATCH 559/616] more logging --- faust/stores/bigtable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index f5a42c7f9..d216ec28b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -326,7 +326,9 @@ def _get(self, key: bytes) -> Optional[bytes]: "BigTableStore: No data found for " f"key {key} " f"event.partition {event.message.partition} " - f"event.key {event.key}" + f"event.message.tp.partition {event.message.tp.partition} "} + f"event.message.topic {event.message.topic} "} + f"event.key {event.key} " f"bt_keys {keys} " f"in table {self.table_name} " ) From cf25c2a561731d10d9113fac6ea00551be752dd8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 10:32:05 +0200 Subject: [PATCH 560/616] fixed syntax --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d216ec28b..733d19ca5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -326,8 +326,8 @@ def _get(self, key: bytes) -> Optional[bytes]: "BigTableStore: No data found for " f"key {key} " f"event.partition {event.message.partition} " - f"event.message.tp.partition {event.message.tp.partition} "} - f"event.message.topic {event.message.topic} "} + f"event.message.tp.partition {event.message.tp.partition} " + f"event.message.topic {event.message.topic} " f"event.key {event.key} " f"bt_keys {keys} " f"in table {self.table_name} " From 898a06127eb2acb574016598519e62078bc550c2 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 11:00:53 +0200 Subject: [PATCH 561/616] more logging --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 733d19ca5..139caed79 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -327,6 +327,7 @@ def _get(self, key: bytes) -> Optional[bytes]: f"key {key} " f"event.partition {event.message.partition} " f"event.message.tp.partition {event.message.tp.partition} " + f"event.message.tp.topic {event.message.tp.topic} " f"event.message.topic {event.message.topic} " f"event.key {event.key} " f"bt_keys {keys} " From b84144d169cc883b3dc8c4eb946444aaa04df99f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 11:12:02 +0200 Subject: [PATCH 562/616] assert no topic in set --- faust/stores/bigtable.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 139caed79..e05722b91 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -303,6 +303,7 @@ def _get(self, key: bytes) -> Optional[bytes]: else: if ( event is not None + and event.message.topic is not None and not self.table.is_global and not self.table.use_partitioner ): @@ -358,6 +359,7 @@ def _set(self, key: bytes, value: bytes) -> None: try: event = current_event() assert event is not None + assert event.message.topic is not None partition = event.message.partition key = self._add_partition_prefix_to_key(key, partition) self._bigtable_set(key, value) From 9d3acd9994a1a94d4bd59c50c46f1174760f7bbb Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 11:12:32 +0200 Subject: [PATCH 563/616] add check for topc in event when searching for current event --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e05722b91..e72114e23 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -223,6 +223,7 @@ def _get_current_partitions(self) -> List[int]: event = current_event() if ( event is not None + and event.message.topic is not None and not self.table.is_global and not self.table.use_partitioner ): From 0f3022cb24e399de6c33fc2ec9bf1295dbde9a66 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 11:34:32 +0200 Subject: [PATCH 564/616] removed assert --- faust/stores/bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index e72114e23..471e2ea16 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -360,7 +360,6 @@ def _set(self, key: bytes, value: bytes) -> None: try: event = current_event() assert event is not None - assert event.message.topic is not None partition = event.message.partition key = self._add_partition_prefix_to_key(key, partition) self._bigtable_set(key, value) From 2ce0a328934c3706a2000edb667e563fac276b9d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 11:48:29 +0200 Subject: [PATCH 565/616] removed flush log --- faust/stores/bigtable.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 471e2ea16..45e90702a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -114,9 +114,6 @@ def _setup_mutation_batcher(self, options): self.bt_table, flush_count=flush_count, flush_interval=flush_interval, - batch_completed_callback=lambda x: self._on_mutation_batcher_flushed( - x - ), ) def _setup_caches( @@ -271,9 +268,6 @@ def _invalidate_startup_cache(self): ) self._invalidation_timer.cancel() - def _on_mutation_batcher_flushed(self, status): - self.log.info(f"Flushed {len(status)} mutations for {self.table_name}") - def _set_mutation(self, mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) From 14836fc46d95153ae3ef9b7f8b6b875586145d97 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 18 Oct 2023 14:11:54 +0200 Subject: [PATCH 566/616] removed log and temporary debugging --- faust/stores/bigtable.py | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 45e90702a..86ffa1ab5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -292,43 +292,11 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in def _get(self, key: bytes) -> Optional[bytes]: try: - event = current_event() - if key in self._key_index: - partitions = [self._key_index[key]] - else: - if ( - event is not None - and event.message.topic is not None - and not self.table.is_global - and not self.table.use_partitioner - ): - partition = event.message.partition - partitions = [partition] - else: - actives = self.app.assignor.assigned_actives() - topic = self.table.changelog_topic_name - partitions = [] - for partition in range(self.app.conf.topic_partitions): - tp = TP(topic=topic, partition=partition) - if tp in actives or self.table.is_global: - partitions.append(partition) + partitions = self._get_partitions_for_key(key) keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition - else: - if event is not None: - self.log.info( - "BigTableStore: No data found for " - f"key {key} " - f"event.partition {event.message.partition} " - f"event.message.tp.partition {event.message.tp.partition} " - f"event.message.tp.topic {event.message.tp.topic} " - f"event.message.topic {event.message.topic} " - f"event.key {event.key} " - f"bt_keys {keys} " - f"in table {self.table_name} " - ) return value except Exception as ex: self.log.error( From 4a76decc16d8f0aa83880c984303fe9bf1ad1757 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 19 Oct 2023 14:56:40 +0200 Subject: [PATCH 567/616] removed keycache and refactored value cache --- faust/stores/bigtable.py | 64 ++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 86ffa1ab5..8a723ff38 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -125,21 +125,10 @@ def _setup_caches( ) if self._startup_cache_enable: self._startup_cache: Dict[bytes, bytes] = {} - # Invalidate startup cache after 30 minutes - self._invalidation_timer = threading.Timer( - 30 * 60, self._invalidate_startup_cache - ) + self._invalidation_timer: Optional[threading.Timer] = None else: self._startup_cache = None - self._key_cache_enable = options.get( - BigTableStore.BT_KEY_CACHE_ENABLE_KEY, False - ) - if self._key_cache_enable: - self._key_cache: Set[bytes] = set() - else: - self._key_cache = None - def _set_options(self, options) -> None: self._all_options = options @@ -240,22 +229,15 @@ def bigtable_exrtact_row_data(row_data): def _del_cache(self, key: bytes): if self._startup_cache is not None: self._startup_cache[key] = None - if self._key_cache is not None: - self._key_cache.discard(key) def _set_cache(self, key: bytes, value): if self._startup_cache is not None: self._startup_cache[key] = value - if self._key_cache is not None: - self._key_cache.add(key) def _get_cache(self, key: bytes): if self._startup_cache is not None: if key in self._startup_cache: return self._startup_cache[key], True - if self._key_cache is not None: - if key not in self._key_cache: - return None, True return None, False def _invalidate_startup_cache(self): @@ -266,7 +248,9 @@ def _invalidate_startup_cache(self): self.log.info( f"Invalidated startup cache for table {self.table_name}" ) - self._invalidation_timer.cancel() + self._invalidation_timer.start() + del self._invalidation_timer + self._invalidation_timer = None def _set_mutation(self, mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) @@ -274,12 +258,7 @@ def _set_mutation(self, mutated_row: DirectRow): def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[int]]: rowset = RowSet() for key in keys: - value, found = self._get_cache(key) - if found: - partition = self._get_partition_from_bigtable_key(key) - return value, partition rowset.add_row_key(key) - if self._mutation_batcher_enable: self._mutation_batcher.flush() @@ -292,7 +271,12 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in def _get(self, key: bytes) -> Optional[bytes]: try: + value, found = self._get_cache(key) + if found: + return value + partitions = self._get_partitions_for_key(key) + keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] value, partition = self._bigtable_get(keys) if value is not None: @@ -305,7 +289,6 @@ def _get(self, key: bytes) -> Optional[bytes]: raise ex def _bigtable_set(self, key: bytes, value: bytes): - self._set_cache(key, value) row = self.bt_table.direct_row(key) row.set_cell( COLUMN_FAMILY_ID, @@ -320,10 +303,13 @@ def _bigtable_set(self, key: bytes, value: bytes): def _set(self, key: bytes, value: bytes) -> None: try: + self._set_cache(key, value) + event = current_event() assert event is not None partition = event.message.partition key = self._add_partition_prefix_to_key(key, partition) + self._bigtable_set(key, value) self._key_index[key] = partition except Exception as ex: @@ -337,7 +323,6 @@ def _set(self, key: bytes, value: bytes) -> None: def _bigtable_del(self, key: bytes): row = self.bt_table.direct_row(key) row.delete() - self._del_cache(key) if self._mutation_batcher_enable: self._set_mutation(row) else: @@ -345,6 +330,7 @@ def _bigtable_del(self, key: bytes): def _del(self, key: bytes) -> None: try: + self._del_cache(key) partitions = self._get_partitions_for_key(key) for partition in partitions: key = self._add_partition_prefix_to_key(key, partition) @@ -409,12 +395,8 @@ def _iteritems( yield from self._bigtable_iteritems(partitions) def _iterkeys(self) -> Iterator[bytes]: - if self._key_cache is not None: - for key in self._key_cache: - yield key - else: - for row in self._iteritems(): - yield row[0] + for row in self._iteritems(): + yield row[0] def _itervalues(self) -> Iterator[bytes]: for row in self._iteritems(): @@ -545,13 +527,23 @@ def restore_backup( def _fill_caches(self, partitions): for k, v in self._bigtable_iteritems(partitions=partitions): self._set_cache(k, v) - if self._statup_cache is not None: - self._invalidation_timer.start() + + # Invalidate startup cache after 30 minutes + # or reset the timer if already running + if self._startup_cache is not None: + if self._invalidation_timer is not None: + self._invalidation_timer.cancel() + self._invalidation_timer.start() + else: + self._invalidation_timer = threading.Timer( + 30 * 60, self._invalidate_startup_cache + ) + self._invalidation_timer.start() def _get_active_changelogtopic_partitions( self, table: CollectionT, tps: Set[TP] ) -> Set[int]: - if self._startup_cache is None and self._key_cache is None: + if self._startup_cache is None: return set() partitions = set() From a181095c5ccb1f84de5d88dfa990e38b067d019a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 19 Oct 2023 14:59:17 +0200 Subject: [PATCH 568/616] formatted --- faust/stores/bigtable.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8a723ff38..c2f485676 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -129,7 +129,6 @@ def _setup_caches( else: self._startup_cache = None - def _set_options(self, options) -> None: self._all_options = options self.table_name_generator = options.get( @@ -204,7 +203,6 @@ def _active_partitions(self) -> List[int]: partitions.append(partition) return partitions - def _get_current_partitions(self) -> List[int]: event = current_event() if ( @@ -255,7 +253,9 @@ def _invalidate_startup_cache(self): def _set_mutation(self, mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) - def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[int]]: + def _bigtable_get( + self, keys: List[bytes] + ) -> Tuple[Optional[bytes], Optional[int]]: rowset = RowSet() for key in keys: rowset.add_row_key(key) @@ -277,7 +277,9 @@ def _get(self, key: bytes) -> Optional[bytes]: partitions = self._get_partitions_for_key(key) - keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] + keys = [ + self._add_partition_prefix_to_key(key, p) for p in partitions + ] value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition @@ -357,7 +359,9 @@ def _bigtable_iteritems(self, partitions): need_all_keys = self.table.is_global or self.table.use_partitioner if not need_all_keys: for partition in partitions: - prefix = self._add_partition_prefix_to_key(b"", partition).decode() + prefix = self._add_partition_prefix_to_key( + b"", partition + ).decode() row_set.add_row_range_with_prefix(prefix) if self._mutation_batcher_enable: @@ -447,7 +451,9 @@ def persisted_offset(self, tp: TP) -> Optional[int]: if self._mutation_batcher_enable: self._mutation_batcher.flush() row = self.bt_table.read_row(offset_key, filter_=self.row_filter) - offset = self.bigtable_exrtact_row_data(row) if row is not None else None + offset = ( + self.bigtable_exrtact_row_data(row) if row is not None else None + ) return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: From 0a2b58ed9636c109d4d3220ef11625cd29e3c3f1 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 19 Oct 2023 15:30:16 +0200 Subject: [PATCH 569/616] flush mutation buffer on stop --- faust/stores/bigtable.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c2f485676..7e733e383 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -587,3 +587,8 @@ async def on_rebalance( generation_id: the metadata generation identifier for the re-balance """ await self.assign_partitions(self.table, newly_assigned, generation_id) + + async def stop(self) -> None: + if self._mutation_batcher_enable: + self.log.info("Flushing to bigtable on stop") + self._mutation_batcher.flush() From 9e234bcc992069874aa7029b63610bd6d3b2fd41 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 19 Oct 2023 15:41:07 +0200 Subject: [PATCH 570/616] always start new thread --- faust/stores/bigtable.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7e733e383..ffd9924c4 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -539,12 +539,10 @@ def _fill_caches(self, partitions): if self._startup_cache is not None: if self._invalidation_timer is not None: self._invalidation_timer.cancel() - self._invalidation_timer.start() - else: - self._invalidation_timer = threading.Timer( - 30 * 60, self._invalidate_startup_cache - ) - self._invalidation_timer.start() + self._invalidation_timer = threading.Timer( + 30 * 60, self._invalidate_startup_cache + ) + self._invalidation_timer.start() def _get_active_changelogtopic_partitions( self, table: CollectionT, tps: Set[TP] From 696fb9fc12b1118e97b602d8a3c6f4834b55ea3e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 19 Oct 2023 15:52:08 +0200 Subject: [PATCH 571/616] don't start in invalidate startup cache --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ffd9924c4..c3450de6c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -246,7 +246,7 @@ def _invalidate_startup_cache(self): self.log.info( f"Invalidated startup cache for table {self.table_name}" ) - self._invalidation_timer.start() + self._invalidation_timer.cancel() del self._invalidation_timer self._invalidation_timer = None From b8cf623662cb770ed25812de5251b059d74485e4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 23 Oct 2023 07:35:10 +0200 Subject: [PATCH 572/616] delete invalidation timer --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index c3450de6c..3da00cba2 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -539,6 +539,7 @@ def _fill_caches(self, partitions): if self._startup_cache is not None: if self._invalidation_timer is not None: self._invalidation_timer.cancel() + del self._invalidation_timer self._invalidation_timer = threading.Timer( 30 * 60, self._invalidate_startup_cache ) From c0b6392d8059402cec62c93683d9f8f1dc84587a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 23 Oct 2023 07:36:34 +0200 Subject: [PATCH 573/616] set invalidation timer to None --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 3da00cba2..92efa3768 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -540,6 +540,7 @@ def _fill_caches(self, partitions): if self._invalidation_timer is not None: self._invalidation_timer.cancel() del self._invalidation_timer + self._invalidation_timer = None self._invalidation_timer = threading.Timer( 30 * 60, self._invalidate_startup_cache ) From 7c01ebf4a0a33c11ae966a428305ffb8935585ab Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 23 Oct 2023 18:16:34 +0200 Subject: [PATCH 574/616] fill caches after reading changelog --- faust/stores/bigtable.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 92efa3768..8d8a22760 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -496,12 +496,15 @@ def apply_changelog_batch( offset if tp not in tp_offsets else max(offset, tp_offsets[tp]) ) msg = event.message - key = self._add_partition_prefix_to_key(msg.key, msg.partition) + bt_key = self._add_partition_prefix_to_key(msg.key, msg.partition) if msg.value is None: - self._bigtable_del(key) + self._del_cache(msg.key) + self._bigtable_del(bt_key) else: - self._bigtable_set(key, msg.value) + self._set_cache(msg.key, msg.value) + self._bigtable_set(bt_key, msg.value) + for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From d45808dd1dcb0a31bc0e66fc35d43e876349ef97 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 2 Nov 2023 15:00:12 +0100 Subject: [PATCH 575/616] add startupcache partitions for faster iteritems --- faust/stores/bigtable.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 8d8a22760..21e93567d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -40,7 +40,8 @@ class BT: RowSet = RowSet Table = Table -except ImportError: # pragma: no cover +except ImportError as e: # pragma: no cover + logger = logging.getLogger(__name__).error(e) BT = None # noqa from yarl import URL @@ -125,8 +126,10 @@ def _setup_caches( ) if self._startup_cache_enable: self._startup_cache: Dict[bytes, bytes] = {} + self._startup_cache_partitions = set() self._invalidation_timer: Optional[threading.Timer] = None else: + self._startup_cache_partitions = None self._startup_cache = None def _set_options(self, options) -> None: @@ -242,6 +245,7 @@ def _invalidate_startup_cache(self): if self._startup_cache is not None: self._startup_cache.clear() self._startup_cache = None + self._startup_cache_partitions = None gc.collect() self.log.info( f"Invalidated startup cache for table {self.table_name}" @@ -396,7 +400,18 @@ def _bigtable_iteritems(self, partitions): def _iteritems( self, partitions: Optional[List[int]] = None ) -> Iterator[Tuple[bytes, bytes]]: - yield from self._bigtable_iteritems(partitions) + if self._startup_cache is not None: + if partitions is None: + partitions: List[int] = self._active_partitions() + for k, v in self._startup_cache.items(): + if v is not None: + yield k, v + partitions = set(partitions) + partitions = partitions.difference(self._startup_cache_partitions) + + for key, val in self._bigtable_iteritems(partitions): + self._set_cache(key, val) + yield key, val def _iterkeys(self) -> Iterator[bytes]: for row in self._iteritems(): @@ -537,6 +552,10 @@ def _fill_caches(self, partitions): for k, v in self._bigtable_iteritems(partitions=partitions): self._set_cache(k, v) + if self._startup_cache_partitions is not None: + self._startup_cache_partitions = self._startup_cache_partitions.union( + partitions + ) # Invalidate startup cache after 30 minutes # or reset the timer if already running if self._startup_cache is not None: From eacf2cf48918cbb9d81170ef25b9d0f436eab2d5 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 2 Nov 2023 15:01:32 +0100 Subject: [PATCH 576/616] only yield from bigtable if partitions are left over --- faust/stores/bigtable.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 21e93567d..5ba20493a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -409,9 +409,10 @@ def _iteritems( partitions = set(partitions) partitions = partitions.difference(self._startup_cache_partitions) - for key, val in self._bigtable_iteritems(partitions): - self._set_cache(key, val) - yield key, val + if partitions is None or len(partitions) > 0: + for key, val in self._bigtable_iteritems(partitions): + self._set_cache(key, val) + yield key, val def _iterkeys(self) -> Iterator[bytes]: for row in self._iteritems(): From 8034689205dd9a4fb3e511a85d002b8653713f5a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 2 Nov 2023 15:03:36 +0100 Subject: [PATCH 577/616] fixed typing --- faust/stores/bigtable.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 5ba20493a..595da2254 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -402,7 +402,7 @@ def _iteritems( ) -> Iterator[Tuple[bytes, bytes]]: if self._startup_cache is not None: if partitions is None: - partitions: List[int] = self._active_partitions() + partitions: Iterable[int] = self._active_partitions() for k, v in self._startup_cache.items(): if v is not None: yield k, v @@ -410,9 +410,7 @@ def _iteritems( partitions = partitions.difference(self._startup_cache_partitions) if partitions is None or len(partitions) > 0: - for key, val in self._bigtable_iteritems(partitions): - self._set_cache(key, val) - yield key, val + yield from self._bigtable_iteritems(partitions): def _iterkeys(self) -> Iterator[bytes]: for row in self._iteritems(): From 73510af879a4c1c40e0ca7d787355f57dbe7a441 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 2 Nov 2023 15:21:52 +0100 Subject: [PATCH 578/616] fix syntax error --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 595da2254..b022971d0 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -410,7 +410,7 @@ def _iteritems( partitions = partitions.difference(self._startup_cache_partitions) if partitions is None or len(partitions) > 0: - yield from self._bigtable_iteritems(partitions): + yield from self._bigtable_iteritems(partitions) def _iterkeys(self) -> Iterator[bytes]: for row in self._iteritems(): From d0854394f9aa902d9ad823decc1084a0e4bf7dec Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 2 Nov 2023 15:27:09 +0100 Subject: [PATCH 579/616] removed type declaration --- faust/stores/bigtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b022971d0..40280f8cb 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -402,7 +402,7 @@ def _iteritems( ) -> Iterator[Tuple[bytes, bytes]]: if self._startup_cache is not None: if partitions is None: - partitions: Iterable[int] = self._active_partitions() + partitions = self._active_partitions() for k, v in self._startup_cache.items(): if v is not None: yield k, v From 75c0ee7054be29862b9e27187bdfd7ed50819354 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 6 Nov 2023 10:01:55 +0100 Subject: [PATCH 580/616] added ttl setting --- faust/stores/bigtable.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 40280f8cb..bd9798574 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -68,7 +68,7 @@ class BigTableStore(base.SerializedStore): BT_PROJECT_KEY = "bt_project_key" BT_TABLE_NAME_GENERATOR_KEY = "bt_table_name_generator_key" BT_STARTUP_CACHE_ENABLE_KEY = "bt_startup_cache_enable_key" - BT_KEY_CACHE_ENABLE_KEY = "bt_key_cache_enable_key" + BT_STARTUP_CACHE_TTL_KEY = "bt_startup_cache_ttl_key" BT_MUTATION_BATCHER_ENABLE_KEY = "bt_mutation_batcher_enable_key" BT_MUTATION_BATCHER_FLUSH_COUNT_KEY = "bt_mutation_batcher_flush_count_key" BT_MUTATION_BATCHER_FLUSH_INTERVAL_KEY = ( @@ -125,6 +125,12 @@ def _setup_caches( BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY, False ) if self._startup_cache_enable: + self._startup_cache_ttl = options.get( + BigTableStore.BT_STARTUP_CACHE_TTL_KEY, 30 * 60 + ) + if self._startup_cache_ttl == 0: + raise ValueError(f"Invalid {self._startup_cache_ttl=}") + self._startup_cache: Dict[bytes, bytes] = {} self._startup_cache_partitions = set() self._invalidation_timer: Optional[threading.Timer] = None @@ -563,7 +569,7 @@ def _fill_caches(self, partitions): del self._invalidation_timer self._invalidation_timer = None self._invalidation_timer = threading.Timer( - 30 * 60, self._invalidate_startup_cache + self._startup_cache_ttl, self._invalidate_startup_cache ) self._invalidation_timer.start() From de30378d73594f1e2a5c04ded905e2b4b8ad5420 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 8 Nov 2023 14:35:36 +0100 Subject: [PATCH 581/616] removed all unit tests --- tests/unit/stores/test_bigtable.py | 432 ----------------------------- 1 file changed, 432 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index ebff4835d..e7cbf39ab 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -120,435 +120,3 @@ class TestBigTableStore: TEST_KEY4 = b"\x00\x00\x00\x00\x01\x0eNoGroup\x00063d76e3ebd7e634de234c67d" TEST_KEY5 = b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x00062a99788df917508d1891ed2" TEST_KEY6 = b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x02" - - @pytest.fixture() - def bt_imports(self): - with patch("faust.stores.bigtable.BT") as bt: - bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - bt.column_family.MaxVersionsGCRule = MagicMock( - return_value="a_rule" - ) - bt.RowSet = MagicMock(return_value=RowSetMock()) - yield bt - - @pytest.mark.asyncio - async def test_bigtable_set_options_default(self, bt_imports): - self_mock = MagicMock() - bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - - BigTableStore._set_options(self_mock, options={}) - assert self_mock.offset_key_prefix == "==>offset_for_partition_" - assert self_mock.row_filter == "a_filter" - - @pytest.mark.asyncio - async def test_bigtable_set_options(self, bt_imports): - self_mock = MagicMock() - bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - bt_imports.column_family = MagicMock(return_value=MagicMock()) - name_lambda = lambda x: print(x) # noqa - - def to_bt_key(key): - len_total = len(key) - len_prefix = 4 - len_num_bytes_len = key[len_prefix] // 2 - len_first_id = key[len_prefix + len_num_bytes_len] // 2 - len_second_id = ( - key[len_prefix + 1 + len_num_bytes_len + len_first_id + 1] // 2 - ) - key_prefix = key[len_total - len_second_id :] - return key_prefix + key - - def from_bt_key(key): - return key[key.find(b"\x00\x00\x00") :] - - options = { - BigTableStore.BT_TABLE_NAME_GENERATOR_KEY: name_lambda, - BigTableStore.BT_OFFSET_KEY_PREFIX: "offset_test", - } - BigTableStore._set_options(self_mock, options) - assert self_mock.offset_key_prefix == "offset_test" - assert self_mock.row_filter == "a_filter" - assert self_mock.table_name_generator == name_lambda - - @pytest.mark.asyncio - async def test_setup_bigtable(self, bt_imports): - self_mock = MagicMock() - - faust_table_mock = MagicMock() - faust_table_mock.name = MagicMock(return_value="ABC") - - def table_name_gen(table): - return table.name[::-1] - - self_mock.table_name_generator = table_name_gen - self_mock.bt_table_name = self_mock.table_name_generator( - faust_table_mock - ) - - client_mock = MagicMock() - instance_mock = MagicMock() - table_mock = MagicMock() - - client_mock.instance = MagicMock(return_value=instance_mock) - instance_mock.table = MagicMock(return_value=table_mock) - table_mock.exists = MagicMock(return_value=True) - table_mock.create = MagicMock() - - bt_imports.Client = MagicMock(return_value=client_mock) - options = {} - options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" - options[BigTableStore.BT_PROJECT_KEY] = "bt_project" - - return_value = BigTableStore._setup_bigtable( - self_mock, faust_table_mock, options - ) - bt_imports.Client.assert_called_once_with( - options[BigTableStore.BT_PROJECT_KEY], admin=True - ) - client_mock.instance.assert_called_once_with( - options[BigTableStore.BT_INSTANCE_KEY] - ) - - instance_mock.table.assert_called_once_with(self_mock.bt_table_name) - table_mock.create.assert_not_called() - assert return_value is None - - # Test with no existing table - self_mock.reset_mock() - self_mock.table_name_generator = table_name_gen - self_mock.bt_table_name = self_mock.table_name_generator( - faust_table_mock - ) - table_mock.exists = MagicMock(return_value=False) - return_value = BigTableStore._setup_bigtable( - self_mock, faust_table_mock, options - ) - instance_mock.table.assert_called_once_with(self_mock.bt_table_name) - table_mock.create.assert_called_once_with( - column_families={"FaustColumnFamily": "a_rule"} - ) - assert return_value is None - - @pytest.fixture() - def store(self, bt_imports): - with patch("faust.stores.bigtable.BT", bt_imports): - options = {} - options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" - options[BigTableStore.BT_PROJECT_KEY] = "bt_project" - options[BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY] = True - store = BigTableStore( - "bigtable://", MagicMock(), MagicMock(), options=options - ) - store.bt_table = BigTableMock() - return store - - def test_bigtable_bigtable_get_on_empty(self, store): - return_value = store._bigtable_get(self.TEST_KEY1) - store.bt_table.read_row.assert_called_once_with( - self.TEST_KEY1, filter_="a_filter" - ) - assert return_value is None - - def test_bigtable_delete(self, store): - row_mock = MagicMock() - row_mock.commit = MagicMock() - row_mock.delete = MagicMock() - store.bt_table.direct_row = MagicMock(return_value=row_mock) - store._set_mutation = MagicMock() - - store._bigtable_del(self.TEST_KEY1, no_key_translation=True) - store._set_mutation.assert_called_once_with( - self.TEST_KEY1, row_mock, None - ) - - def test_bigtable_set(self, store): - row_mock = MagicMock() - row_mock.set_cell = MagicMock() - - store.bt_table.direct_row = MagicMock(return_value=row_mock) - store._set_mutation = MagicMock(return_value=None) - store._bigtable_set( - self.TEST_KEY1, self.TEST_KEY1, no_key_translation=True - ) - store._bigtable_set( - self.TEST_KEY1, self.TEST_KEY1, no_key_translation=True - ) - - store._set_mutation.assert_called_with( - self.TEST_KEY1, row_mock, self.TEST_KEY1 - ) - - def test_get_partition_from_message(self, store): - event_mock = MagicMock() - event_mock.message = MagicMock() - event_mock.message.partition = 69 - current_event_mock = MagicMock(return_value=event_mock) - - store.table.is_global = False - store.table.use_partitioner = False - with patch("faust.stores.bigtable.current_event", current_event_mock): - return_value = store._get_current_partitions() - assert return_value == [69] - - store.table.is_global = True - with patch("faust.stores.bigtable.current_event", current_event_mock): - return_value = store._get_current_partitions() - assert return_value == [None] - - store.table.is_global = False - current_event_mock = MagicMock(return_value=None) - - topic = store.table.changelog_topic_name - store.app.assignor.assigned_actives = MagicMock( - return_value={TP(topic, 420)} - ) - store.app.conf.topic_partitions = 421 - with patch("faust.stores.bigtable.current_event", current_event_mock): - return_value = store._get_current_partitions() - assert return_value == [420] - - def test_get_faust_key(self, store): - key_with_partition = b"\x13_..._THEACTUALKEY" - res = store._remove_partition_prefix_from_bigtable_key( - key_with_partition - ) - assert res == b"THEACTUALKEY" - - def test_get_key_with_partition(self, store): - partition = 19 - res = store._add_partition_prefix_to_key(self.TEST_KEY1, partition) - extracted_partition = store._get_partition_from_bigtable_key(res) - assert extracted_partition == partition - assert ( - store._remove_partition_prefix_from_bigtable_key(res) - == self.TEST_KEY1 - ) - - def test_partitions_for_key(self, store): - store._get_current_partitions = MagicMock(return_value=[19]) - res = list(store._get_possible_bt_keys(self.TEST_KEY1)) - assert res == [store._add_partition_prefix_to_key(self.TEST_KEY1, 19)] - - def test_get_keyerror(self, store): - partition = 19 - store._get_current_partitions = MagicMock(return_value=[partition]) - store._bigtable_get = MagicMock(return_value=None) - with pytest.raises(KeyError): - store[self.TEST_KEY1.decode()] - - def test_get_with_known_partition(self, store): - partition = 19 - store._cache = None - store._get_current_partitions = MagicMock(return_value=[partition]) - # Scenario: Found - store._bigtable_get = MagicMock(return_value=b"a_value") - res = store._get(self.TEST_KEY1) - key_with_partition = store._add_partition_prefix_to_key( - self.TEST_KEY1, partition - ) - store._bigtable_get.assert_called_once_with(self.TEST_KEY1) - assert res == b"a_value" - - # Scenario: Not Found - store._bigtable_get = MagicMock(return_value=None) - res = store._get(self.TEST_KEY1) - store._bigtable_get.assert_called_once_with(self.TEST_KEY1) - assert res is None - - # Scenario: Cache hit on value - store._bigtable_get = MagicMock(return_value=None) - store._cache = {self.TEST_KEY1: b"a_value_from_cache"} - res = store._get(self.TEST_KEY1) - store._bigtable_get.assert_not_called() - res2 = store._get(self.TEST_KEY2) - assert res == b"a_value_from_cache" - store._bigtable_get.assert_called_once_with(self.TEST_KEY2) - assert store._cache[self.TEST_KEY2] is None - assert res2 is None - - # Scenario: Cache hit on None value - store._bigtable_get = MagicMock(return_value=None) - res = store._get(self.TEST_KEY2) - store._bigtable_get.assert_not_called() - assert res is None - - def test_set(self, store): - # Scenario: No cache - store._cache = None - store._bigtable_set = MagicMock() - store._set(self.TEST_KEY1, b"a_value") - store._bigtable_set.assert_called_once_with(self.TEST_KEY1, b"a_value") - - # Scenario: Cache active - store._cache = {} - store._set(self.TEST_KEY1, b"b_value") - assert store._cache[self.TEST_KEY1] == b"b_value" - store._bigtable_set.assert_called_with(self.TEST_KEY1, b"b_value") - - def test_del(self, store): - # Scenario: No cache - store._cache = None - store._bigtable_del = MagicMock() - store._del(self.TEST_KEY1) - store._bigtable_del.assert_called_once_with(self.TEST_KEY1) - - # Scenario: Cache active - store._cache = {} - store._del(self.TEST_KEY1) - assert store._cache[self.TEST_KEY1] is None - store._bigtable_del.assert_called_with(self.TEST_KEY1) - - def test_active_partitions(self, store): - active_topics = [ - TP("a_changelogtopic", 19), - TP("a_different_chaneglogtopic", 19), - ] - store.app.assignor.assigned_actives = MagicMock( - return_value=active_topics - ) - store.app.conf.topic_partitions = 20 - store.table.changelog_topic_name = "a_changelogtopic" - store.table.is_global = False - - # Scenario: No global table - res = store._active_partitions() - all_res = list(res) - assert all_res == [19] - - # Scenario: Global table - store.table.is_global = True - res = store._active_partitions() - all_res = list(res) - assert list(range(store.app.conf.topic_partitions)) == all_res - - def test_iteritems(self, store): - store._active_partitions = MagicMock(return_value=[1, 3]) - store.bt_table.read_rows = MagicMock() - store._mutation_buffer = None - store._cache = {} - - _ = sorted(store._iteritems()) - store.bt_table.read_rows.assert_called_once() - - def test_iteritems_with_mutations(self, store): - store._active_partitions = MagicMock(return_value=[1, 3]) - store._mutation_buffer = { - self.TEST_KEY1: ("doesn't matter", b"a_value"), - self.TEST_KEY2: ("doesn't matter", None), - } - store.bt_table.read_rows = MagicMock( - return_value=[ - MagicMock( - row_key=self.TEST_KEY1, - to_dict=MagicMock( - return_value={"x": [MagicMock(value=b"1")]} - ), - commit=MagicMock(), - ), - MagicMock( - row_key=self.TEST_KEY2, - to_dict=MagicMock( - return_value={"x": [MagicMock(value=b"this is overwritten")]} - ), - commit=MagicMock(), - ), - ] - ) - res = sorted(store._iteritems()) - store.bt_table.read_rows.assert_called_once() - assert res == [(self.TEST_KEY1, b"a_value")] - assert store._cache.get(self.TEST_KEY1) == b"a_value" - assert store._cache.get(self.TEST_KEY2) is None - - def test_iterkeys(self, store): - values = [("K1", "V1"), ("K2", "V2")] - store._iteritems = MagicMock(return_value=values) - all_res = sorted(store._iterkeys()) - assert all_res == ["K1", "K2"] - - def test_itervalues(self, store): - values = [("K1", "V1"), ("K2", "V2")] - store._iteritems = MagicMock(return_value=values) - all_res = sorted(store._itervalues()) - assert all_res == ["V1", "V2"] - - def test_size(self, store): - assert 0 == store._size() - - def test_get_offset_key(self, store): - tp = TP("AAAA", 19) - assert store.get_offset_key(tp)[-2:] == "19" - - def test_set_persisted_offset(self, store): - tp = TP("a_topic", 19) - expected_offset_key = store.get_offset_key(tp).encode() - store._last_flush_time = time.time() - store._bigtable_set = MagicMock() - store.bt_table.mutate_rows = MagicMock() - store._mutation_buffer = None - - store.set_persisted_offset(tp, 123) - store._bigtable_set.called_once_with(expected_offset_key, b"123", no_key_translation=True) - store.bt_table.mutate_rows.assert_not_called() - - store._bigtable_set = MagicMock() - store._mutation_buffer = {} - store._mutation_size = 0 - store.set_persisted_offset(tp, 123) - store._bigtable_set.assert_not_called() - store.bt_table.mutate_rows.assert_not_called() - - store._bigtable_set = MagicMock() - store._mutation_buffer = { - - self.TEST_KEY1: ("doesn't matter", b"a_value"), - self.TEST_KEY2: ("doesn't matter", None), - self.TEST_KEY3: ("doesn't matter", b"c_value"), - } - mutations = [ - r[0] for r in store._mutation_buffer.copy().values() - ] - store._num_mutations = 9999999999999999999999999999999 - store.set_persisted_offset(tp, 123) - store._bigtable_set.called_once_with( - expected_offset_key, - b"123", - no_key_translation=True - ) - store.bt_table.mutate_rows.assert_called_once_with(mutations) - - def test_apply_changelog_batch(self, store): - row_mock = MagicMock() - row_mock.delete = MagicMock() - row_mock.set_cell = MagicMock() - store.bt_table.direct_row = MagicMock(return_value=row_mock) - store._bigtable_del = MagicMock() - store._bigtable_set = MagicMock() - store.set_persisted_offset = MagicMock() - store._cache.submit_mutation = MagicMock() - store._cache.set = MagicMock() - - class TestMessage: - def __init__(self, value, key, tp, offset): - self.value = value - self.key = key - self.tp = tp - self.offset = offset - - class TestEvent: - def __init__(self, message): - self.message = message - - tp = TP("a", 19) - tp2 = TP("b", 19) - messages = [ - TestEvent(TestMessage("a", self.TEST_KEY1, tp, 0)), - TestEvent(TestMessage(None, self.TEST_KEY1, tp, 1)), # Delete - TestEvent(TestMessage("a", self.TEST_KEY1, tp, 3)), # Out of order - TestEvent(TestMessage("b", self.TEST_KEY2, tp2, 4)), - TestEvent(TestMessage("a", self.TEST_KEY1, tp, 2)), - ] - store.apply_changelog_batch(messages, lambda x: x, lambda x: x) - assert store._bigtable_set.call_count == 4 - assert store._bigtable_del.call_count == 1 - assert store.set_persisted_offset.call_count == 2 From 94cbbd18e501b399adf34976153e6966d6daad73 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 8 Nov 2023 14:36:09 +0100 Subject: [PATCH 582/616] removed all tests --- tests/unit/stores/test_bigtable.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index e7cbf39ab..8da07fef4 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -120,3 +120,14 @@ class TestBigTableStore: TEST_KEY4 = b"\x00\x00\x00\x00\x01\x0eNoGroup\x00063d76e3ebd7e634de234c67d" TEST_KEY5 = b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x00062a99788df917508d1891ed2" TEST_KEY6 = b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x02" + + @pytest.fixture() + def bt_imports(self): + with patch("faust.stores.bigtable.BT") as bt: + bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") + bt.column_family.MaxVersionsGCRule = MagicMock( + return_value="a_rule" + ) + bt.RowSet = MagicMock(return_value=RowSetMock()) + yield bt + From 7c72248cf7610325e35662220ce831ebd720f1b7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 10 Nov 2023 07:54:10 +0100 Subject: [PATCH 583/616] further progress with tests --- faust/stores/bigtable.py | 11 +- tests/unit/stores/test_bigtable.py | 418 +++++++++++++++++++++++++++++ 2 files changed, 424 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index bd9798574..a8452a704 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -88,8 +88,7 @@ def __init__( self._setup_bigtable(table, options) self._setup_caches(options) self._setup_mutation_batcher(options) - key_index_size = app.conf.table_key_index_size - self.key_index_size = key_index_size + self.key_index_size = app.conf.table_key_index_size self._key_index = LRUCache(limit=self.key_index_size) except Exception as ex: logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") @@ -128,8 +127,10 @@ def _setup_caches( self._startup_cache_ttl = options.get( BigTableStore.BT_STARTUP_CACHE_TTL_KEY, 30 * 60 ) - if self._startup_cache_ttl == 0: - raise ValueError(f"Invalid {self._startup_cache_ttl=}") + if self._startup_cache_ttl <= 0: + self._startup_cache = None + self._startup_cache_partitions = None + return self._startup_cache: Dict[bytes, bytes] = {} self._startup_cache_partitions = set() @@ -266,7 +267,7 @@ def _set_mutation(self, mutated_row: DirectRow): def _bigtable_get( self, keys: List[bytes] ) -> Tuple[Optional[bytes], Optional[int]]: - rowset = RowSet() + rowset = BT.RowSet() for key in keys: rowset.add_row_key(key) if self._mutation_batcher_enable: diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 8da07fef4..7c191d4b5 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -131,3 +131,421 @@ def bt_imports(self): bt.RowSet = MagicMock(return_value=RowSetMock()) yield bt + @pytest.mark.asyncio + async def test_bigtable_set_options_default(self, bt_imports): + self_mock = MagicMock() + bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") + + BigTableStore._set_options(self_mock, options={}) + assert self_mock.offset_key_prefix == "==>offset_for_partition_" + assert self_mock.row_filter == "a_filter" + + @pytest.mark.asyncio + async def test_bigtable_set_options(self, bt_imports): + self_mock = MagicMock() + bt_imports.CellsColumnLimitFilter = MagicMock(return_value="a_filter") + bt_imports.column_family = MagicMock(return_value=MagicMock()) + name_lambda = lambda x: print(x) # noqa + + def to_bt_key(key): + len_total = len(key) + len_prefix = 4 + len_num_bytes_len = key[len_prefix] // 2 + len_first_id = key[len_prefix + len_num_bytes_len] // 2 + len_second_id = ( + key[len_prefix + 1 + len_num_bytes_len + len_first_id + 1] // 2 + ) + key_prefix = key[len_total - len_second_id :] + return key_prefix + key + + def from_bt_key(key): + return key[key.find(b"\x00\x00\x00") :] + + options = { + BigTableStore.BT_TABLE_NAME_GENERATOR_KEY: name_lambda, + BigTableStore.BT_OFFSET_KEY_PREFIX: "offset_test", + } + BigTableStore._set_options(self_mock, options) + assert self_mock.offset_key_prefix == "offset_test" + assert self_mock.row_filter == "a_filter" + assert self_mock.table_name_generator == name_lambda + + @pytest.mark.asyncio + async def test_bigtable_setup(self, bt_imports): + self_mock = MagicMock() + + faust_table_mock = MagicMock() + faust_table_mock.name = MagicMock(return_value="ABC") + + def table_name_gen(table): + return table.name[::-1] + + self_mock.table_name_generator = table_name_gen + self_mock.bt_table_name = self_mock.table_name_generator( + faust_table_mock + ) + + client_mock = MagicMock() + instance_mock = MagicMock() + table_mock = MagicMock() + + client_mock.instance = MagicMock(return_value=instance_mock) + instance_mock.table = MagicMock(return_value=table_mock) + table_mock.exists = MagicMock(return_value=True) + table_mock.create = MagicMock() + + bt_imports.Client = MagicMock(return_value=client_mock) + options = {} + options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" + options[BigTableStore.BT_PROJECT_KEY] = "bt_project" + + return_value = BigTableStore._setup_bigtable( + self_mock, faust_table_mock, options + ) + bt_imports.Client.assert_called_once_with( + options[BigTableStore.BT_PROJECT_KEY], admin=True + ) + client_mock.instance.assert_called_once_with( + options[BigTableStore.BT_INSTANCE_KEY] + ) + + instance_mock.table.assert_called_once_with(self_mock.bt_table_name) + table_mock.create.assert_not_called() + assert return_value is None + + # Test with no existing table + self_mock.reset_mock() + self_mock.table_name_generator = table_name_gen + self_mock.bt_table_name = self_mock.table_name_generator( + faust_table_mock + ) + table_mock.exists = MagicMock(return_value=False) + return_value = BigTableStore._setup_bigtable( + self_mock, faust_table_mock, options + ) + instance_mock.table.assert_called_once_with(self_mock.bt_table_name) + table_mock.create.assert_called_once_with( + column_families={"FaustColumnFamily": "a_rule"} + ) + assert return_value is None + + @pytest.fixture() + def store(self, bt_imports): + with patch("faust.stores.bigtable.BT", bt_imports): + options = {} + options[BigTableStore.BT_INSTANCE_KEY] = "bt_instance" + options[BigTableStore.BT_PROJECT_KEY] = "bt_project" + store = BigTableStore( + "bigtable://", MagicMock(), MagicMock(), options=options + ) + store.bt_table = BigTableMock() + return store + + def test_bigtable_bigtable_get_on_empty(self, store, bt_imports): + return_value = store._bigtable_get([self.TEST_KEY1]) + store.bt_table.read_rows.assert_called_once() + assert return_value == (None, None) + + def test_bigtable_delete(self, store): + row_mock = MagicMock() + row_mock.commit = MagicMock() + row_mock.delete = MagicMock() + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store._set_mutation = MagicMock() + + store._bigtable_del(self.TEST_KEY1, no_key_translation=True) + store._set_mutation.assert_called_once_with( + self.TEST_KEY1, row_mock, None + ) + + def test_bigtable_set(self, store): + row_mock = MagicMock() + row_mock.set_cell = MagicMock() + + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store._set_mutation = MagicMock(return_value=None) + store._bigtable_set( + self.TEST_KEY1, self.TEST_KEY1, no_key_translation=True + ) + store._bigtable_set( + self.TEST_KEY1, self.TEST_KEY1, no_key_translation=True + ) + + store._set_mutation.assert_called_with( + self.TEST_KEY1, row_mock, self.TEST_KEY1 + ) + + def test_get_partition_from_message(self, store): + event_mock = MagicMock() + event_mock.message = MagicMock() + event_mock.message.partition = 69 + current_event_mock = MagicMock(return_value=event_mock) + + store.table.is_global = False + store.table.use_partitioner = False + with patch("faust.stores.bigtable.current_event", current_event_mock): + return_value = store._get_current_partitions() + assert return_value == [69] + + store.table.is_global = True + with patch("faust.stores.bigtable.current_event", current_event_mock): + return_value = store._get_current_partitions() + assert return_value == [None] + + store.table.is_global = False + current_event_mock = MagicMock(return_value=None) + + topic = store.table.changelog_topic_name + store.app.assignor.assigned_actives = MagicMock( + return_value={TP(topic, 420)} + ) + store.app.conf.topic_partitions = 421 + with patch("faust.stores.bigtable.current_event", current_event_mock): + return_value = store._get_current_partitions() + assert return_value == [420] + + def test_get_faust_key(self, store): + key_with_partition = b"\x13_..._THEACTUALKEY" + res = store._remove_partition_prefix_from_bigtable_key( + key_with_partition + ) + assert res == b"THEACTUALKEY" + + def test_get_key_with_partition(self, store): + partition = 19 + res = store._add_partition_prefix_to_key(self.TEST_KEY1, partition) + extracted_partition = store._get_partition_from_bigtable_key(res) + assert extracted_partition == partition + assert ( + store._remove_partition_prefix_from_bigtable_key(res) + == self.TEST_KEY1 + ) + + def test_partitions_for_key(self, store): + store._get_current_partitions = MagicMock(return_value=[19]) + res = list(store._get_possible_bt_keys(self.TEST_KEY1)) + assert res == [store._add_partition_prefix_to_key(self.TEST_KEY1, 19)] + + def test_get_keyerror(self, store): + partition = 19 + store._get_current_partitions = MagicMock(return_value=[partition]) + store._bigtable_get = MagicMock(return_value=None) + with pytest.raises(KeyError): + store[self.TEST_KEY1.decode()] + + def test_get_with_known_partition(self, store): + partition = 19 + store._cache = None + store._get_current_partitions = MagicMock(return_value=[partition]) + # Scenario: Found + store._bigtable_get = MagicMock(return_value=b"a_value") + res = store._get(self.TEST_KEY1) + key_with_partition = store._add_partition_prefix_to_key( + self.TEST_KEY1, partition + ) + store._bigtable_get.assert_called_once_with(self.TEST_KEY1) + assert res == b"a_value" + + # Scenario: Not Found + store._bigtable_get = MagicMock(return_value=None) + res = store._get(self.TEST_KEY1) + store._bigtable_get.assert_called_once_with(self.TEST_KEY1) + assert res is None + + # Scenario: Cache hit on value + store._bigtable_get = MagicMock(return_value=None) + store._cache = {self.TEST_KEY1: b"a_value_from_cache"} + res = store._get(self.TEST_KEY1) + store._bigtable_get.assert_not_called() + res2 = store._get(self.TEST_KEY2) + assert res == b"a_value_from_cache" + store._bigtable_get.assert_called_once_with(self.TEST_KEY2) + assert store._cache[self.TEST_KEY2] is None + assert res2 is None + + # Scenario: Cache hit on None value + store._bigtable_get = MagicMock(return_value=None) + res = store._get(self.TEST_KEY2) + store._bigtable_get.assert_not_called() + assert res is None + + def test_set(self, store): + # Scenario: No cache + store._cache = None + store._bigtable_set = MagicMock() + store._set(self.TEST_KEY1, b"a_value") + store._bigtable_set.assert_called_once_with(self.TEST_KEY1, b"a_value") + + # Scenario: Cache active + store._cache = {} + store._set(self.TEST_KEY1, b"b_value") + assert store._cache[self.TEST_KEY1] == b"b_value" + store._bigtable_set.assert_called_with(self.TEST_KEY1, b"b_value") + + def test_del(self, store): + # Scenario: No cache + store._cache = None + store._bigtable_del = MagicMock() + store._del(self.TEST_KEY1) + store._bigtable_del.assert_called_once_with(self.TEST_KEY1) + + # Scenario: Cache active + store._cache = {} + store._del(self.TEST_KEY1) + assert store._cache[self.TEST_KEY1] is None + store._bigtable_del.assert_called_with(self.TEST_KEY1) + + def test_active_partitions(self, store): + active_topics = [ + TP("a_changelogtopic", 19), + TP("a_different_chaneglogtopic", 19), + ] + store.app.assignor.assigned_actives = MagicMock( + return_value=active_topics + ) + store.app.conf.topic_partitions = 20 + store.table.changelog_topic_name = "a_changelogtopic" + store.table.is_global = False + + # Scenario: No global table + res = store._active_partitions() + all_res = list(res) + assert all_res == [19] + + # Scenario: Global table + store.table.is_global = True + res = store._active_partitions() + all_res = list(res) + assert list(range(store.app.conf.topic_partitions)) == all_res + + def test_iteritems(self, store): + store._active_partitions = MagicMock(return_value=[1, 3]) + store.bt_table.read_rows = MagicMock() + store._mutation_buffer = None + store._cache = {} + + _ = sorted(store._iteritems()) + store.bt_table.read_rows.assert_called_once() + + def test_iteritems_with_mutations(self, store): + store._active_partitions = MagicMock(return_value=[1, 3]) + store._mutation_buffer = { + self.TEST_KEY1: ("doesn't matter", b"a_value"), + self.TEST_KEY2: ("doesn't matter", None), + } + store.bt_table.read_rows = MagicMock( + return_value=[ + MagicMock( + row_key=self.TEST_KEY1, + to_dict=MagicMock( + return_value={"x": [MagicMock(value=b"1")]} + ), + commit=MagicMock(), + ), + MagicMock( + row_key=self.TEST_KEY2, + to_dict=MagicMock( + return_value={"x": [MagicMock(value=b"this is overwritten")]} + ), + commit=MagicMock(), + ), + ] + ) + res = sorted(store._iteritems()) + store.bt_table.read_rows.assert_called_once() + assert res == [(self.TEST_KEY1, b"a_value")] + assert store._cache.get(self.TEST_KEY1) == b"a_value" + assert store._cache.get(self.TEST_KEY2) is None + + def test_iterkeys(self, store): + values = [("K1", "V1"), ("K2", "V2")] + store._iteritems = MagicMock(return_value=values) + all_res = sorted(store._iterkeys()) + assert all_res == ["K1", "K2"] + + def test_itervalues(self, store): + values = [("K1", "V1"), ("K2", "V2")] + store._iteritems = MagicMock(return_value=values) + all_res = sorted(store._itervalues()) + assert all_res == ["V1", "V2"] + + def test_size(self, store): + assert 0 == store._size() + + def test_get_offset_key(self, store): + tp = TP("AAAA", 19) + assert store.get_offset_key(tp)[-2:] == "19" + + def test_set_persisted_offset(self, store): + tp = TP("a_topic", 19) + expected_offset_key = store.get_offset_key(tp).encode() + store._last_flush_time = time.time() + store._bigtable_set = MagicMock() + store.bt_table.mutate_rows = MagicMock() + store._mutation_buffer = None + + store.set_persisted_offset(tp, 123) + store._bigtable_set.called_once_with(expected_offset_key, b"123", no_key_translation=True) + store.bt_table.mutate_rows.assert_not_called() + + store._bigtable_set = MagicMock() + store._mutation_buffer = {} + store._mutation_size = 0 + store.set_persisted_offset(tp, 123) + store._bigtable_set.assert_not_called() + store.bt_table.mutate_rows.assert_not_called() + + store._bigtable_set = MagicMock() + store._mutation_buffer = { + + self.TEST_KEY1: ("doesn't matter", b"a_value"), + self.TEST_KEY2: ("doesn't matter", None), + self.TEST_KEY3: ("doesn't matter", b"c_value"), + } + mutations = [ + r[0] for r in store._mutation_buffer.copy().values() + ] + store._num_mutations = 9999999999999999999999999999999 + store.set_persisted_offset(tp, 123) + store._bigtable_set.called_once_with( + expected_offset_key, + b"123", + no_key_translation=True + ) + store.bt_table.mutate_rows.assert_called_once_with(mutations) + + def test_apply_changelog_batch(self, store): + row_mock = MagicMock() + row_mock.delete = MagicMock() + row_mock.set_cell = MagicMock() + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store._bigtable_del = MagicMock() + store._bigtable_set = MagicMock() + store.set_persisted_offset = MagicMock() + store._cache.submit_mutation = MagicMock() + store._cache.set = MagicMock() + + class TestMessage: + def __init__(self, value, key, tp, offset): + self.value = value + self.key = key + self.tp = tp + self.offset = offset + + class TestEvent: + def __init__(self, message): + self.message = message + + tp = TP("a", 19) + tp2 = TP("b", 19) + messages = [ + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 0)), + TestEvent(TestMessage(None, self.TEST_KEY1, tp, 1)), # Delete + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 3)), # Out of order + TestEvent(TestMessage("b", self.TEST_KEY2, tp2, 4)), + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 2)), + ] + store.apply_changelog_batch(messages, lambda x: x, lambda x: x) + assert store._bigtable_set.call_count == 4 + assert store._bigtable_del.call_count == 1 + assert store.set_persisted_offset.call_count == 2 From 9b241bdcb7f59dd0ffe97509b1e73b14c02332a7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 10 Nov 2023 09:28:37 +0100 Subject: [PATCH 584/616] fixed bug in delete where delete was called with a wrong key --- faust/stores/bigtable.py | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index a8452a704..b2d51aa7b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -71,9 +71,7 @@ class BigTableStore(base.SerializedStore): BT_STARTUP_CACHE_TTL_KEY = "bt_startup_cache_ttl_key" BT_MUTATION_BATCHER_ENABLE_KEY = "bt_mutation_batcher_enable_key" BT_MUTATION_BATCHER_FLUSH_COUNT_KEY = "bt_mutation_batcher_flush_count_key" - BT_MUTATION_BATCHER_FLUSH_INTERVAL_KEY = ( - "bt_mutation_batcher_flush_interval_key" - ) + BT_MUTATION_BATCHER_FLUSH_INTERVAL_KEY = "bt_mutation_batcher_flush_interval_key" def __init__( self, @@ -254,9 +252,7 @@ def _invalidate_startup_cache(self): self._startup_cache = None self._startup_cache_partitions = None gc.collect() - self.log.info( - f"Invalidated startup cache for table {self.table_name}" - ) + self.log.info(f"Invalidated startup cache for table {self.table_name}") self._invalidation_timer.cancel() del self._invalidation_timer self._invalidation_timer = None @@ -264,9 +260,7 @@ def _invalidate_startup_cache(self): def _set_mutation(self, mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) - def _bigtable_get( - self, keys: List[bytes] - ) -> Tuple[Optional[bytes], Optional[int]]: + def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[int]]: rowset = BT.RowSet() for key in keys: rowset.add_row_key(key) @@ -288,9 +282,7 @@ def _get(self, key: bytes) -> Optional[bytes]: partitions = self._get_partitions_for_key(key) - keys = [ - self._add_partition_prefix_to_key(key, p) for p in partitions - ] + keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition @@ -346,8 +338,8 @@ def _del(self, key: bytes) -> None: self._del_cache(key) partitions = self._get_partitions_for_key(key) for partition in partitions: - key = self._add_partition_prefix_to_key(key, partition) - self._bigtable_del(key) + key_with_partition = self._add_partition_prefix_to_key(key, partition) + self._bigtable_del(key_with_partition) except Exception as ex: self.log.error( f"FaustBigtableException Error in del for " @@ -370,9 +362,7 @@ def _bigtable_iteritems(self, partitions): need_all_keys = self.table.is_global or self.table.use_partitioner if not need_all_keys: for partition in partitions: - prefix = self._add_partition_prefix_to_key( - b"", partition - ).decode() + prefix = self._add_partition_prefix_to_key(b"", partition).decode() row_set.add_row_range_with_prefix(prefix) if self._mutation_batcher_enable: @@ -387,9 +377,7 @@ def _bigtable_iteritems(self, partitions): continue value = self.bigtable_exrtact_row_data(row) - key = self._remove_partition_prefix_from_bigtable_key( - row.row_key - ) + key = self._remove_partition_prefix_from_bigtable_key(row.row_key) yield key, value end = time.time() self.log.info( @@ -472,9 +460,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: if self._mutation_batcher_enable: self._mutation_batcher.flush() row = self.bt_table.read_row(offset_key, filter_=self.row_filter) - offset = ( - self.bigtable_exrtact_row_data(row) if row is not None else None - ) + offset = self.bigtable_exrtact_row_data(row) if row is not None else None return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: @@ -526,7 +512,6 @@ def apply_changelog_batch( self._set_cache(msg.key, msg.value) self._bigtable_set(bt_key, msg.value) - for tp, offset in tp_offsets.items(): self.set_persisted_offset(tp, offset) From d7a6b9d9628868a5cc8940fefa63da6953eb816e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 10 Nov 2023 10:19:17 +0100 Subject: [PATCH 585/616] fixed unit tests for bigtable --- tests/unit/stores/test_bigtable.py | 273 ++++++++++++++--------------- 1 file changed, 132 insertions(+), 141 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 7c191d4b5..2e0769d0b 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -47,9 +47,7 @@ class RowSetMock: def __init__(self) -> None: self.keys = set() self.add_row_key = MagicMock(wraps=self._add_row_key) - self.add_row_range_from_keys = MagicMock( - wraps=self._add_row_range_from_keys - ) + self.add_row_range_from_keys = MagicMock(wraps=self._add_row_range_from_keys) def _add_row_key(self, key): self.keys.add(key) @@ -118,16 +116,16 @@ class TestBigTableStore: TEST_KEY2 = b"TEST_KEY2" TEST_KEY3 = b"TEST_KEY3" TEST_KEY4 = b"\x00\x00\x00\x00\x01\x0eNoGroup\x00063d76e3ebd7e634de234c67d" - TEST_KEY5 = b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x00062a99788df917508d1891ed2" + TEST_KEY5 = ( + b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x00062a99788df917508d1891ed2" + ) TEST_KEY6 = b"\x00\x00\x00\x00\x02062a99788df917508d1891ed2\x02" @pytest.fixture() def bt_imports(self): with patch("faust.stores.bigtable.BT") as bt: bt.CellsColumnLimitFilter = MagicMock(return_value="a_filter") - bt.column_family.MaxVersionsGCRule = MagicMock( - return_value="a_rule" - ) + bt.column_family.MaxVersionsGCRule = MagicMock(return_value="a_rule") bt.RowSet = MagicMock(return_value=RowSetMock()) yield bt @@ -181,9 +179,7 @@ def table_name_gen(table): return table.name[::-1] self_mock.table_name_generator = table_name_gen - self_mock.bt_table_name = self_mock.table_name_generator( - faust_table_mock - ) + self_mock.bt_table_name = self_mock.table_name_generator(faust_table_mock) client_mock = MagicMock() instance_mock = MagicMock() @@ -216,9 +212,7 @@ def table_name_gen(table): # Test with no existing table self_mock.reset_mock() self_mock.table_name_generator = table_name_gen - self_mock.bt_table_name = self_mock.table_name_generator( - faust_table_mock - ) + self_mock.bt_table_name = self_mock.table_name_generator(faust_table_mock) table_mock.exists = MagicMock(return_value=False) return_value = BigTableStore._setup_bigtable( self_mock, faust_table_mock, options @@ -244,7 +238,7 @@ def store(self, bt_imports): def test_bigtable_bigtable_get_on_empty(self, store, bt_imports): return_value = store._bigtable_get([self.TEST_KEY1]) store.bt_table.read_rows.assert_called_once() - assert return_value == (None, None) + assert return_value == (None, None) def test_bigtable_delete(self, store): row_mock = MagicMock() @@ -253,27 +247,36 @@ def test_bigtable_delete(self, store): store.bt_table.direct_row = MagicMock(return_value=row_mock) store._set_mutation = MagicMock() - store._bigtable_del(self.TEST_KEY1, no_key_translation=True) - store._set_mutation.assert_called_once_with( - self.TEST_KEY1, row_mock, None - ) + store._bigtable_del(self.TEST_KEY1) + store._set_mutation.assert_not_called() + row_mock.delete.assert_called_once() + row_mock.commit.assert_called_once() + + # Test with mutation buffer + store._mutation_batcher_enable = True + store._bigtable_del(self.TEST_KEY1) + store._set_mutation.assert_called_once_with(row_mock) + assert row_mock.delete.call_count == 2 + assert row_mock.commit.call_count == 1 def test_bigtable_set(self, store): row_mock = MagicMock() row_mock.set_cell = MagicMock() - + row_mock.commit = MagicMock() store.bt_table.direct_row = MagicMock(return_value=row_mock) - store._set_mutation = MagicMock(return_value=None) - store._bigtable_set( - self.TEST_KEY1, self.TEST_KEY1, no_key_translation=True - ) - store._bigtable_set( - self.TEST_KEY1, self.TEST_KEY1, no_key_translation=True - ) + store._set_mutation = MagicMock() - store._set_mutation.assert_called_with( - self.TEST_KEY1, row_mock, self.TEST_KEY1 - ) + store._bigtable_set(self.TEST_KEY1, b"a_value") + store._set_mutation.assert_not_called() + row_mock.set_cell.assert_called_once() + row_mock.commit.assert_called_once() + + # Test with mutation buffer + store._mutation_batcher_enable = True + store._bigtable_set(self.TEST_KEY1, "a_value") + store._set_mutation.assert_called_once_with(row_mock) + assert row_mock.set_cell.call_count == 2 + assert row_mock.commit.call_count == 1 def test_get_partition_from_message(self, store): event_mock = MagicMock() @@ -283,6 +286,11 @@ def test_get_partition_from_message(self, store): store.table.is_global = False store.table.use_partitioner = False + topic = store.table.changelog_topic_name + store.app.assignor.assigned_actives = MagicMock( + return_value={TP(topic, 123), TP(topic, 69)} + ) + store.app.conf.topic_partitions = 123 with patch("faust.stores.bigtable.current_event", current_event_mock): return_value = store._get_current_partitions() assert return_value == [69] @@ -290,25 +298,18 @@ def test_get_partition_from_message(self, store): store.table.is_global = True with patch("faust.stores.bigtable.current_event", current_event_mock): return_value = store._get_current_partitions() - assert return_value == [None] + assert return_value == list(range(123)) store.table.is_global = False current_event_mock = MagicMock(return_value=None) - topic = store.table.changelog_topic_name - store.app.assignor.assigned_actives = MagicMock( - return_value={TP(topic, 420)} - ) - store.app.conf.topic_partitions = 421 with patch("faust.stores.bigtable.current_event", current_event_mock): return_value = store._get_current_partitions() - assert return_value == [420] + assert return_value == [69] def test_get_faust_key(self, store): key_with_partition = b"\x13_..._THEACTUALKEY" - res = store._remove_partition_prefix_from_bigtable_key( - key_with_partition - ) + res = store._remove_partition_prefix_from_bigtable_key(key_with_partition) assert res == b"THEACTUALKEY" def test_get_key_with_partition(self, store): @@ -316,54 +317,53 @@ def test_get_key_with_partition(self, store): res = store._add_partition_prefix_to_key(self.TEST_KEY1, partition) extracted_partition = store._get_partition_from_bigtable_key(res) assert extracted_partition == partition - assert ( - store._remove_partition_prefix_from_bigtable_key(res) - == self.TEST_KEY1 - ) + assert store._remove_partition_prefix_from_bigtable_key(res) == self.TEST_KEY1 def test_partitions_for_key(self, store): store._get_current_partitions = MagicMock(return_value=[19]) - res = list(store._get_possible_bt_keys(self.TEST_KEY1)) - assert res == [store._add_partition_prefix_to_key(self.TEST_KEY1, 19)] + res = list(store._get_partitions_for_key(self.TEST_KEY1)) + assert res == [19] def test_get_keyerror(self, store): - partition = 19 + partition = None store._get_current_partitions = MagicMock(return_value=[partition]) - store._bigtable_get = MagicMock(return_value=None) + store._bigtable_get = MagicMock(return_value=(None, None)) with pytest.raises(KeyError): - store[self.TEST_KEY1.decode()] + key = "123" + store[key] def test_get_with_known_partition(self, store): - partition = 19 - store._cache = None - store._get_current_partitions = MagicMock(return_value=[partition]) + partitions = [19, 20] + store._get_cache = MagicMock(return_value=(b"this is ignored", False)) + store._key_index = {} + store._get_current_partitions = MagicMock(return_value=partitions) # Scenario: Found - store._bigtable_get = MagicMock(return_value=b"a_value") + store._bigtable_get = MagicMock(return_value=(b"a_value", 19)) + res = store._get(self.TEST_KEY1) - key_with_partition = store._add_partition_prefix_to_key( - self.TEST_KEY1, partition - ) - store._bigtable_get.assert_called_once_with(self.TEST_KEY1) + get_keys = [ + store._add_partition_prefix_to_key(self.TEST_KEY1, p) for p in partitions + ] + store._bigtable_get.assert_called_once_with(get_keys) assert res == b"a_value" # Scenario: Not Found - store._bigtable_get = MagicMock(return_value=None) + store._bigtable_get = MagicMock(return_value=(None, None)) res = store._get(self.TEST_KEY1) - store._bigtable_get.assert_called_once_with(self.TEST_KEY1) + store._bigtable_get.assert_called_with( + [get_keys[0]] + ) # because the partition is in key_index assert res is None # Scenario: Cache hit on value + store._get_cache = MagicMock(return_value=(b"a_value_from_cache", True)) store._bigtable_get = MagicMock(return_value=None) - store._cache = {self.TEST_KEY1: b"a_value_from_cache"} res = store._get(self.TEST_KEY1) store._bigtable_get.assert_not_called() - res2 = store._get(self.TEST_KEY2) assert res == b"a_value_from_cache" - store._bigtable_get.assert_called_once_with(self.TEST_KEY2) - assert store._cache[self.TEST_KEY2] is None - assert res2 is None # Scenario: Cache hit on None value + store._get_cache = MagicMock(return_value=(None, True)) store._bigtable_get = MagicMock(return_value=None) res = store._get(self.TEST_KEY2) store._bigtable_get.assert_not_called() @@ -371,38 +371,49 @@ def test_get_with_known_partition(self, store): def test_set(self, store): # Scenario: No cache - store._cache = None - store._bigtable_set = MagicMock() - store._set(self.TEST_KEY1, b"a_value") - store._bigtable_set.assert_called_once_with(self.TEST_KEY1, b"a_value") + event_mock = MagicMock() + event_mock.message = MagicMock() + event_mock.message.partition = 69 + current_event_mock = MagicMock(return_value=event_mock) + no_event_mock = MagicMock(return_value=None) - # Scenario: Cache active - store._cache = {} - store._set(self.TEST_KEY1, b"b_value") - assert store._cache[self.TEST_KEY1] == b"b_value" - store._bigtable_set.assert_called_with(self.TEST_KEY1, b"b_value") + # Test assertion withour current event + with patch("faust.stores.bigtable.current_event", no_event_mock): + with pytest.raises(AssertionError): + store["123"] = "000" + + with patch("faust.stores.bigtable.current_event", current_event_mock): + store._key_index = {} + store._set_cache = MagicMock() + store._bigtable_set = MagicMock() + store._set(self.TEST_KEY1, b"a_value") + + key = store._add_partition_prefix_to_key(self.TEST_KEY1, 69) + store._set_cache.assert_called_with(self.TEST_KEY1, b"a_value") + store._bigtable_set.assert_called_once_with(key, b"a_value") def test_del(self, store): # Scenario: No cache - store._cache = None store._bigtable_del = MagicMock() + store._del_cache = MagicMock(return_value=None) + store._get_partitions_for_key = MagicMock(return_value=[1, 2, 3]) store._del(self.TEST_KEY1) - store._bigtable_del.assert_called_once_with(self.TEST_KEY1) - - # Scenario: Cache active - store._cache = {} - store._del(self.TEST_KEY1) - assert store._cache[self.TEST_KEY1] is None - store._bigtable_del.assert_called_with(self.TEST_KEY1) + # Check one call for each partition + keys = [ + store._add_partition_prefix_to_key(self.TEST_KEY1, p) for p in [1, 2, 3] + ] + store._del_cache.assert_called_once_with(self.TEST_KEY1) + assert store._bigtable_del.call_count == 3 + expected_calls = [call(key) for key in keys] + for call_args in store._bigtable_del.call_args_list: + assert call_args in expected_calls def test_active_partitions(self, store): active_topics = [ TP("a_changelogtopic", 19), TP("a_different_chaneglogtopic", 19), ] - store.app.assignor.assigned_actives = MagicMock( - return_value=active_topics - ) + store.app.assignor.assigned_actives = MagicMock(return_value=active_topics) store.app.conf.topic_partitions = 20 store.table.changelog_topic_name = "a_changelogtopic" store.table.is_global = False @@ -420,42 +431,51 @@ def test_active_partitions(self, store): def test_iteritems(self, store): store._active_partitions = MagicMock(return_value=[1, 3]) + store._bigtable_iteritems = MagicMock(wraps=store._bigtable_iteritems) store.bt_table.read_rows = MagicMock() - store._mutation_buffer = None - store._cache = {} - _ = sorted(store._iteritems()) store.bt_table.read_rows.assert_called_once() + # Calling with None means get all rows + store._bigtable_iteritems.assert_called_once_with(None) - def test_iteritems_with_mutations(self, store): + def test_iteritems_with_startup_cache(self, store, bt_imports): store._active_partitions = MagicMock(return_value=[1, 3]) - store._mutation_buffer = { - self.TEST_KEY1: ("doesn't matter", b"a_value"), - self.TEST_KEY2: ("doesn't matter", None), + store._startup_cache = { + self.TEST_KEY1: b"this is a value", + self.TEST_KEY2: b"this is another value", + b"Dont return this": None, } + store._startup_cache_partitions = [1] + + store._bigtable_iteritems = MagicMock(wraps=store._bigtable_iteritems) store.bt_table.read_rows = MagicMock( return_value=[ MagicMock( - row_key=self.TEST_KEY1, - to_dict=MagicMock( - return_value={"x": [MagicMock(value=b"1")]} - ), + row_key=store._add_partition_prefix_to_key(self.TEST_KEY3, 3), + to_dict=MagicMock(return_value={"x": [MagicMock(value=b"1")]}), commit=MagicMock(), ), MagicMock( - row_key=self.TEST_KEY2, - to_dict=MagicMock( - return_value={"x": [MagicMock(value=b"this is overwritten")]} - ), + row_key=store._add_partition_prefix_to_key(self.TEST_KEY4, 3), + to_dict=MagicMock(return_value={"x": [MagicMock(value=b"2")]}), commit=MagicMock(), ), ] ) res = sorted(store._iteritems()) - store.bt_table.read_rows.assert_called_once() - assert res == [(self.TEST_KEY1, b"a_value")] - assert store._cache.get(self.TEST_KEY1) == b"a_value" - assert store._cache.get(self.TEST_KEY2) is None + store._bigtable_iteritems.assert_called_once_with({3}) + all_entries = { + self.TEST_KEY1: b"this is a value", + self.TEST_KEY2: b"this is another value", + self.TEST_KEY3: b"1", + self.TEST_KEY4: b"2", + } + assert res == sorted(list(all_entries.items())) + keys = list(sorted(store._iterkeys())) + values = list(sorted(store._itervalues())) + + assert keys == sorted(list(all_entries.keys())) + assert values == sorted(list(all_entries.values())) def test_iterkeys(self, store): values = [("K1", "V1"), ("K2", "V2")] @@ -479,40 +499,10 @@ def test_get_offset_key(self, store): def test_set_persisted_offset(self, store): tp = TP("a_topic", 19) expected_offset_key = store.get_offset_key(tp).encode() - store._last_flush_time = time.time() - store._bigtable_set = MagicMock() - store.bt_table.mutate_rows = MagicMock() - store._mutation_buffer = None - - store.set_persisted_offset(tp, 123) - store._bigtable_set.called_once_with(expected_offset_key, b"123", no_key_translation=True) - store.bt_table.mutate_rows.assert_not_called() - - store._bigtable_set = MagicMock() - store._mutation_buffer = {} - store._mutation_size = 0 - store.set_persisted_offset(tp, 123) - store._bigtable_set.assert_not_called() - store.bt_table.mutate_rows.assert_not_called() - store._bigtable_set = MagicMock() - store._mutation_buffer = { - self.TEST_KEY1: ("doesn't matter", b"a_value"), - self.TEST_KEY2: ("doesn't matter", None), - self.TEST_KEY3: ("doesn't matter", b"c_value"), - } - mutations = [ - r[0] for r in store._mutation_buffer.copy().values() - ] - store._num_mutations = 9999999999999999999999999999999 store.set_persisted_offset(tp, 123) - store._bigtable_set.called_once_with( - expected_offset_key, - b"123", - no_key_translation=True - ) - store.bt_table.mutate_rows.assert_called_once_with(mutations) + store._bigtable_set.called_once_with(expected_offset_key, b"123") def test_apply_changelog_batch(self, store): row_mock = MagicMock() @@ -522,15 +512,16 @@ def test_apply_changelog_batch(self, store): store._bigtable_del = MagicMock() store._bigtable_set = MagicMock() store.set_persisted_offset = MagicMock() - store._cache.submit_mutation = MagicMock() - store._cache.set = MagicMock() + store._set_cache = MagicMock() + store._del_cache = MagicMock() class TestMessage: - def __init__(self, value, key, tp, offset): + def __init__(self, value, key, tp, offset, partition): self.value = value self.key = key self.tp = tp self.offset = offset + self.partition = partition class TestEvent: def __init__(self, message): @@ -539,11 +530,11 @@ def __init__(self, message): tp = TP("a", 19) tp2 = TP("b", 19) messages = [ - TestEvent(TestMessage("a", self.TEST_KEY1, tp, 0)), - TestEvent(TestMessage(None, self.TEST_KEY1, tp, 1)), # Delete - TestEvent(TestMessage("a", self.TEST_KEY1, tp, 3)), # Out of order - TestEvent(TestMessage("b", self.TEST_KEY2, tp2, 4)), - TestEvent(TestMessage("a", self.TEST_KEY1, tp, 2)), + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 0, 1)), + TestEvent(TestMessage(None, self.TEST_KEY1, tp, 1, 1)), # Delete + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 3, 1)), # Out of order + TestEvent(TestMessage("b", self.TEST_KEY2, tp2, 4, 2)), + TestEvent(TestMessage("a", self.TEST_KEY1, tp, 2, 1)), ] store.apply_changelog_batch(messages, lambda x: x, lambda x: x) assert store._bigtable_set.call_count == 4 From 6cdd356a872ad23c81d8e3024d97f25052910689 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 10 Nov 2023 12:33:38 +0100 Subject: [PATCH 586/616] added additional tests and fixed stuff in the implementation current coverage should be around 82% now --- faust/stores/bigtable.py | 33 +++---- tests/unit/stores/test_bigtable.py | 143 +++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 18 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index b2d51aa7b..1252be66b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -249,8 +249,7 @@ def _get_cache(self, key: bytes): def _invalidate_startup_cache(self): if self._startup_cache is not None: self._startup_cache.clear() - self._startup_cache = None - self._startup_cache_partitions = None + self._startup_cache_partitions = set() gc.collect() self.log.info(f"Invalidated startup cache for table {self.table_name}") self._invalidation_timer.cancel() @@ -543,28 +542,23 @@ def _fill_caches(self, partitions): for k, v in self._bigtable_iteritems(partitions=partitions): self._set_cache(k, v) - if self._startup_cache_partitions is not None: - self._startup_cache_partitions = self._startup_cache_partitions.union( - partitions - ) + self._startup_cache_partitions = self._startup_cache_partitions.union( + partitions + ) # Invalidate startup cache after 30 minutes # or reset the timer if already running - if self._startup_cache is not None: - if self._invalidation_timer is not None: - self._invalidation_timer.cancel() - del self._invalidation_timer - self._invalidation_timer = None - self._invalidation_timer = threading.Timer( - self._startup_cache_ttl, self._invalidate_startup_cache - ) - self._invalidation_timer.start() + if self._invalidation_timer is not None: + self._invalidation_timer.cancel() + del self._invalidation_timer + self._invalidation_timer = None + self._invalidation_timer = threading.Timer( + self._startup_cache_ttl, self._invalidate_startup_cache + ) + self._invalidation_timer.start() def _get_active_changelogtopic_partitions( self, table: CollectionT, tps: Set[TP] ) -> Set[int]: - if self._startup_cache is None: - return set() - partitions = set() standby_tps = self.app.assignor.assigned_standbys() my_topics = table.changelog_topic.topics @@ -577,6 +571,9 @@ async def assign_partitions( self, table: CollectionT, tps: Set[TP], generation_id: int = 0 ) -> None: # Fill cache with all keys for the partitions we are assigned + if self._startup_cache_enable is False: + return + partitions = self._get_active_changelogtopic_partitions(table, tps) if len(partitions) == 0: return diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 2e0769d0b..708ca0d84 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -540,3 +540,146 @@ def __init__(self, message): assert store._bigtable_set.call_count == 4 assert store._bigtable_del.call_count == 1 assert store.set_persisted_offset.call_count == 2 + + @pytest.mark.asyncio + async def test_fill_caches(self, store, bt_imports): + store._bigtable_iteritems = MagicMock( + return_value=[(b"key1", b"value1"), (b"key2", b"value2")] + ) + store._set_cache = MagicMock() + store._startup_cache_ttl = 1800 + store._invalidation_timer = None + store._startup_cache_partitions = set() + store._startup_cache = {} + + partitions = {TP("topic", 0), TP("topic", 1)} + partitions2 = {TP("topic", 0), TP("topic", 2)} + + store._fill_caches(partitions) + + assert store._bigtable_iteritems.call_args == call(partitions=partitions) + assert store._set_cache.call_args_list == [ + call(b"key1", b"value1"), + call(b"key2", b"value2"), + ] + assert store._startup_cache_partitions == partitions + assert store._invalidation_timer is not None + assert store._invalidation_timer.is_alive() + + # Test with different partitions + # This should reset the _invalidation_timer + old_invalid_timer = store._invalidation_timer.__hash__() + + store._bigtable_iteritems = MagicMock( + return_value=[(b"key3", b"value3"), (b"key4", b"value4")] + ) + store._set_cache = MagicMock() + store._fill_caches(partitions2) + new_invalid_timer = store._invalidation_timer.__hash__() + # Check if old invalidation timer is different from new one + assert old_invalid_timer != new_invalid_timer + assert store._invalidation_timer is not None + assert store._invalidation_timer.is_alive() + + assert store._bigtable_iteritems.call_args == call(partitions=partitions2) + assert store._set_cache.call_args_list == [ + call(b"key3", b"value3"), + call(b"key4", b"value4"), + ] + assert store._startup_cache_partitions == partitions | partitions2 + assert store._invalidation_timer is not None + assert store._invalidation_timer.is_alive() + + # Wait for the invalidation timer to expire + store._invalidation_timer.cancel() + store._invalidate_startup_cache() + + assert store._startup_cache == {} + assert store._startup_cache_partitions == set() + assert store._invalidation_timer is None + + @pytest.mark.asyncio + async def test__get_active_changelogtopic_partitions(self, store): + tps_table = { + "changelog_topic", + "other_topic", + "other_topic2", + } + store.table = MagicMock(changelog_topic=MagicMock(topics=tps_table)) + + tps = {TP("changelog_topic", 0), TP("other_topic", 1)} + active_partitions = store._get_active_changelogtopic_partitions( + store.table, tps + ) + assert active_partitions == {0, 1} + + @pytest.mark.asyncio + async def test_bigtable_on_rebalance(self, store, bt_imports): + store.assign_partitions = MagicMock(wraps=store.assign_partitions) + tps_table = { + "topic1", + "topic2", + "topic3", + "topic4", + "topic5", + } + store.table = MagicMock(changelog_topic=MagicMock(topics=tps_table)) + + store._fill_caches = MagicMock() + assigned = {TP("topic1", 0), TP("topic2", 1)} + revoked = {TP("topic3", 2)} + newly_assigned = {TP("topic4", 3), TP("topic5", 4)} + store._startup_cache_enable = False + await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=1) + store.assign_partitions.assert_called_once_with(store.table, newly_assigned, 1) + store._fill_caches.assert_not_called() + newly_assigned = set() + + # Test with empty newly_assigned + store._startup_cache_enable = True + await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=2) + store.assign_partitions.assert_called_with(store.table, newly_assigned, 2) + store._fill_caches.assert_not_called() + + store._startup_cache_enable = True + newly_assigned = {TP("topic4", 3), TP("topic5", 4)} + await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=3) + store.assign_partitions.assert_called_with(store.table, newly_assigned, 3) + store._fill_caches.assert_called_once_with({3, 4}) + + def test_contains(self, store, bt_imports): + store._get = MagicMock(return_value=b"test_value") + + # Test that _contains returns True when store_check_exists is False + store.app.conf.store_check_exists = False + assert store._contains(b"test_key") is True + + # Test that _contains returns True when _get returns a value + store.app.conf.store_check_exists = True + assert store._contains(b"test_key") is True + + # Test that _contains returns False when _get returns None + store._get = MagicMock(return_value=None) + assert store._contains(b"test_key") is False + + def test_setup_caches_startup_cache_enable(self, store): + options = { + BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY: True, + BigTableStore.BT_STARTUP_CACHE_TTL_KEY: 60, + } + store._setup_caches(options=options) + assert store._startup_cache_enable is True + assert store._startup_cache_ttl == 60 + assert isinstance(store._startup_cache, dict) + assert isinstance(store._startup_cache_partitions, set) + assert store._invalidation_timer is None + + def test_setup_caches_startup_cache_disable(self, store): + options = { + BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY: False, + } + store._setup_caches(options=options) + assert store._startup_cache_enable is False + assert not hasattr(store, "_startup_cache_ttl") + assert store._startup_cache is None + assert store._startup_cache_partitions is None From 54d3868868ae1eb4e1d04736bd5bb9cf79a20246 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 13 Nov 2023 09:57:25 +0100 Subject: [PATCH 587/616] faster get requests with abort for startup cache values --- faust/stores/bigtable.py | 35 +++++++++++++++--------------- tests/unit/stores/test_bigtable.py | 17 +++++++++++---- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1252be66b..1faf72bd8 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -121,21 +121,18 @@ def _setup_caches( self._startup_cache_enable = options.get( BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY, False ) + + self._startup_cache = None + self._startup_cache_partitions: Set[int] = set() + self._startup_cache_ttl = options.get( + BigTableStore.BT_STARTUP_CACHE_TTL_KEY, 30 * 60 + ) if self._startup_cache_enable: - self._startup_cache_ttl = options.get( - BigTableStore.BT_STARTUP_CACHE_TTL_KEY, 30 * 60 - ) if self._startup_cache_ttl <= 0: - self._startup_cache = None - self._startup_cache_partitions = None return self._startup_cache: Dict[bytes, bytes] = {} - self._startup_cache_partitions = set() self._invalidation_timer: Optional[threading.Timer] = None - else: - self._startup_cache_partitions = None - self._startup_cache = None def _set_options(self, options) -> None: self._all_options = options @@ -241,7 +238,7 @@ def _set_cache(self, key: bytes, value): self._startup_cache[key] = value def _get_cache(self, key: bytes): - if self._startup_cache is not None: + if self._startup_cache_enable and self._startup_cache is not None: if key in self._startup_cache: return self._startup_cache[key], True return None, False @@ -279,9 +276,16 @@ def _get(self, key: bytes) -> Optional[bytes]: if found: return value - partitions = self._get_partitions_for_key(key) + partitions = set(self._get_partitions_for_key(key)) + # Remove partitions that we already have in cache + partitions.difference_update(self._startup_cache_partitions) + # Nothing todo + if len(partitions) == 0: + return None keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] + if len(keys) == 0: + return None value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition @@ -396,12 +400,11 @@ def _iteritems( ) -> Iterator[Tuple[bytes, bytes]]: if self._startup_cache is not None: if partitions is None: - partitions = self._active_partitions() + partitions = set(self._active_partitions()) for k, v in self._startup_cache.items(): if v is not None: yield k, v - partitions = set(partitions) - partitions = partitions.difference(self._startup_cache_partitions) + partitions.difference_update(self._startup_cache_partitions) if partitions is None or len(partitions) > 0: yield from self._bigtable_iteritems(partitions) @@ -542,9 +545,7 @@ def _fill_caches(self, partitions): for k, v in self._bigtable_iteritems(partitions=partitions): self._set_cache(k, v) - self._startup_cache_partitions = self._startup_cache_partitions.union( - partitions - ) + self._startup_cache_partitions |= set(partitions) # Invalidate startup cache after 30 minutes # or reset the timer if already running if self._invalidation_timer is not None: diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 708ca0d84..654f2a920 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -357,14 +357,22 @@ def test_get_with_known_partition(self, store): # Scenario: Cache hit on value store._get_cache = MagicMock(return_value=(b"a_value_from_cache", True)) - store._bigtable_get = MagicMock(return_value=None) + store._bigtable_get = MagicMock(return_value=(None, None)) res = store._get(self.TEST_KEY1) store._bigtable_get.assert_not_called() assert res == b"a_value_from_cache" # Scenario: Cache hit on None value store._get_cache = MagicMock(return_value=(None, True)) - store._bigtable_get = MagicMock(return_value=None) + store._bigtable_get = MagicMock(return_value=(None, None)) + res = store._get(self.TEST_KEY2) + store._bigtable_get.assert_not_called() + assert res is None + + # Scenario: Cache miss, but partition should be in startup cache + store._startup_cache_partitions = {19, 20} + store._get_cache = MagicMock(return_value=(None, False)) + store._bigtable_get = MagicMock(return_value=(None, None)) res = store._get(self.TEST_KEY2) store._bigtable_get.assert_not_called() assert res is None @@ -680,6 +688,7 @@ def test_setup_caches_startup_cache_disable(self, store): } store._setup_caches(options=options) assert store._startup_cache_enable is False - assert not hasattr(store, "_startup_cache_ttl") + assert store._startup_cache_ttl == 30 * 60 # Default value assert store._startup_cache is None - assert store._startup_cache_partitions is None + assert store._startup_cache_partitions == set() + assert store._startup_cache_enable is False From c4de292c75b1216f689c622dad7518c5d93c3b76 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 13 Nov 2023 12:46:59 +0100 Subject: [PATCH 588/616] added tests to increase coverage --- tests/unit/stores/test_bigtable.py | 56 ++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 654f2a920..34cb9d965 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -235,8 +235,18 @@ def store(self, bt_imports): store.bt_table = BigTableMock() return store - def test_bigtable_bigtable_get_on_empty(self, store, bt_imports): - return_value = store._bigtable_get([self.TEST_KEY1]) + def test_bigtable_get(self, store, bt_imports): + keys = [self.TEST_KEY1, self.TEST_KEY2] + for idx, k in enumerate(keys): + keys[idx] = store._add_partition_prefix_to_key(k, 0) + store.bt_table.add_test_data(keys) + value, partition = store._bigtable_get([keys[1]]) + store.bt_table.read_rows.assert_called_once() + assert partition == 0 + assert value == keys[1] + + def test_bigtable_get_on_empty(self, store, bt_imports): + return_value = store._bigtable_get([self.TEST_KEY1, self.TEST_KEY2]) store.bt_table.read_rows.assert_called_once() assert return_value == (None, None) @@ -692,3 +702,45 @@ def test_setup_caches_startup_cache_disable(self, store): assert store._startup_cache is None assert store._startup_cache_partitions == set() assert store._startup_cache_enable is False + + def test_set_del_get_cache(self, store): + store._startup_cache_enable = False + store._startup_cache = None + store._startup_cache_partitions = set() + + key = self.TEST_KEY1 + + store._set_cache(key, b"123") + res = store._get_cache(key) + assert store._startup_cache is None + assert store._startup_cache_partitions == set() + assert res == (None, False) + + store._del_cache(key) + res = store._get_cache(key) + assert res == (None, False) + assert store._startup_cache is None + assert store._startup_cache_partitions == set() + + # Now with enabled startup cache + store._startup_cache_enable = True + store._startup_cache = {} + store._startup_cache_partitions = {1, 2} + + store._set_cache(key, b"123") + res = store._get_cache(key) + assert store._startup_cache == {key: b"123"} + assert store._startup_cache_partitions == {1, 2} + assert res == (b"123", True) + store._del_cache(key) + res = store._get_cache(key) + assert store._startup_cache == {key: None} + assert store._startup_cache_partitions == {1, 2} + assert res == (None, True) + + def test_persisted_offset(self, store): + tp = TP("topic", 0) + offset_key = store.get_offset_key(tp).encode() + store.bt_table.data = {offset_key: b"1"} + print(store.persisted_offset(tp)) + assert store.persisted_offset(tp) == 1 From 782faf98459db3a1a22cc321cf76f935e34c3749 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 13 Nov 2023 12:56:36 +0100 Subject: [PATCH 589/616] added addtional unit tests --- tests/unit/stores/test_bigtable.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 34cb9d965..88bc2d120 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -742,5 +742,22 @@ def test_persisted_offset(self, store): tp = TP("topic", 0) offset_key = store.get_offset_key(tp).encode() store.bt_table.data = {offset_key: b"1"} - print(store.persisted_offset(tp)) + store._mutation_batcher = MagicMock(flush=MagicMock()) + + assert store.persisted_offset(tp) == 1 + store._mutation_batcher.flush.assert_not_called() + + store._mutation_batcher_enable = True assert store.persisted_offset(tp) == 1 + store._mutation_batcher.flush.assert_called_once() + + @pytest.mark.asyncio + async def test_stop(self, store): + store._mutation_batcher = MagicMock(flush=MagicMock()) + store._mutation_batcher_enable = False + await store.stop() + store._mutation_batcher.flush.assert_not_called() + + store._mutation_batcher_enable = True + await store.stop() + store._mutation_batcher.flush.assert_called_once() From b3adfa4b0b38ad69e68710c274db8b48e4974782 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Nov 2023 11:18:25 +0100 Subject: [PATCH 590/616] added mutation batcher cache --- faust/stores/bigtable.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1faf72bd8..d865cbc3b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -101,6 +101,7 @@ def _setup_mutation_batcher(self, options): self._mutation_batcher_enable = options.get( BigTableStore.BT_MUTATION_BATCHER_ENABLE_KEY, False ) + self._mutation_batcher_cache = {} if self._mutation_batcher_enable: flush_count = options.get( BigTableStore.BT_MUTATION_BATCHER_FLUSH_COUNT_KEY, 10_000 @@ -112,6 +113,7 @@ def _setup_mutation_batcher(self, options): self.bt_table, flush_count=flush_count, flush_interval=flush_interval, + batch_completed_callback=lambda x: self._mutation_batcher_cache.clear(), ) def _setup_caches( @@ -253,15 +255,16 @@ def _invalidate_startup_cache(self): del self._invalidation_timer self._invalidation_timer = None - def _set_mutation(self, mutated_row: DirectRow): + def _set_mutation(self, key: bytes, value: Optional[bytes], mutated_row: DirectRow): self._mutation_batcher.mutate(mutated_row) + self._mutation_batcher_cache[key] = value def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[int]]: rowset = BT.RowSet() for key in keys: + if self._mutation_batcher_enable and key in self._mutation_batcher_cache: + return self._mutation_batcher_cache[key] rowset.add_row_key(key) - if self._mutation_batcher_enable: - self._mutation_batcher.flush() rows = self.bt_table.read_rows(row_set=rowset, filter_=self.row_filter) for row in rows: @@ -305,7 +308,7 @@ def _bigtable_set(self, key: bytes, value: bytes): ) if self._mutation_batcher_enable: - self._set_mutation(row) + self._set_mutation(key, value, row) else: row.commit() @@ -332,7 +335,7 @@ def _bigtable_del(self, key: bytes): row = self.bt_table.direct_row(key) row.delete() if self._mutation_batcher_enable: - self._set_mutation(row) + self._set_mutation(key, None, row) else: row.commit() From 04eada770dd0c5fcdbac0b740ad5ff59fdeecd6a Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Nov 2023 13:36:14 +0100 Subject: [PATCH 591/616] added unit tests for set mutation and for iterating global tables --- faust/stores/bigtable.py | 2 +- tests/unit/stores/test_bigtable.py | 30 +++++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d865cbc3b..7d7bb772b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -359,7 +359,7 @@ def _bigtable_iteritems(self, partitions): start = time.time() if partitions is None: partitions = self._active_partitions() - row_set = RowSet() + row_set = BT.RowSet() self.log.info( f"BigtableStore: Iterating over {len(partitions)} partitions " f"for table {self.table_name}" diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 88bc2d120..2a888252a 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -85,7 +85,10 @@ def _read_row(self, key: bytes, **kwargs): return row def _read_rows(self, row_set, **kwargs): - for k in row_set.keys: + iterator = row_set.keys + if len(iterator) == 0: + iterator = self.data.keys() + for k in iterator: res = None if b"_*_" in k: for key in self.data.keys(): @@ -265,7 +268,7 @@ def test_bigtable_delete(self, store): # Test with mutation buffer store._mutation_batcher_enable = True store._bigtable_del(self.TEST_KEY1) - store._set_mutation.assert_called_once_with(row_mock) + store._set_mutation.assert_called_once_with(self.TEST_KEY1, None, row_mock) assert row_mock.delete.call_count == 2 assert row_mock.commit.call_count == 1 @@ -284,7 +287,7 @@ def test_bigtable_set(self, store): # Test with mutation buffer store._mutation_batcher_enable = True store._bigtable_set(self.TEST_KEY1, "a_value") - store._set_mutation.assert_called_once_with(row_mock) + store._set_mutation.assert_called_once_with(self.TEST_KEY1, "a_value", row_mock) assert row_mock.set_cell.call_count == 2 assert row_mock.commit.call_count == 1 @@ -761,3 +764,24 @@ async def test_stop(self, store): store._mutation_batcher_enable = True await store.stop() store._mutation_batcher.flush.assert_called_once() + + def test_set_mutation(self, store): + store._mutation_batcher = MagicMock(flush=MagicMock()) + store._set_mutation(self.TEST_KEY1, b"123", MagicMock()) + store._mutation_batcher.flush.assert_not_called() + assert store._mutation_batcher_cache[self.TEST_KEY1] == b"123" + + def test_bigtable_iteritems_with_global_table(self, store, bt_imports): + store.table.is_global = True + store._active_partitions = MagicMock(return_value=[1, 3]) + # Add table to data fro partition 1 to 5 with corresponding offset keys + store.bt_table.data = {} + for i in range(1, 5): + key = store.get_offset_key(TP("topic", i)).encode() + store.bt_table.data[key] = str(i).encode() + tp_key = store._add_partition_prefix_to_key(f"key{i}".encode(), i) + store.bt_table.data[tp_key] = str(i).encode() + + res = sorted(store._iteritems()) + assert res == [(f"key{i}".encode(), str(i).encode()) for i in range(1, 5)] + store.bt_table.read_rows.assert_called_once() From bdf92e45ee06bcd46ab2e354ae3c4f2176d1205d Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Nov 2023 14:58:25 +0100 Subject: [PATCH 592/616] added more unit tests and also increased coverage --- faust/stores/bigtable.py | 14 +++++++------- tests/unit/stores/test_bigtable.py | 27 +++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 7d7bb772b..0a2770cb6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -88,7 +88,7 @@ def __init__( self._setup_mutation_batcher(options) self.key_index_size = app.conf.table_key_index_size self._key_index = LRUCache(limit=self.key_index_size) - except Exception as ex: + except Exception as ex: # pragma: no cover logging.getLogger(__name__).error(f"Error in Bigtable init {ex}") raise ex super().__init__(url, app, table, **kwargs) @@ -293,7 +293,7 @@ def _get(self, key: bytes) -> Optional[bytes]: if value is not None: self._key_index[key] = partition return value - except Exception as ex: + except Exception as ex: # pragma: no cover self.log.error( f"Error in get for table {self.table_name} exception {ex} key {key}" ) @@ -323,7 +323,7 @@ def _set(self, key: bytes, value: bytes) -> None: self._bigtable_set(key, value) self._key_index[key] = partition - except Exception as ex: + except Exception as ex: # pragma: no cover self.log.error( f"FaustBigtableException Error in set for " f"table {self.table_name} exception {ex} key {key=} " @@ -346,7 +346,7 @@ def _del(self, key: bytes) -> None: for partition in partitions: key_with_partition = self._add_partition_prefix_to_key(key, partition) self._bigtable_del(key_with_partition) - except Exception as ex: + except Exception as ex: # pragma: no cover self.log.error( f"FaustBigtableException Error in del for " f"table {self.table_name} exception {ex} key {key=} " @@ -390,7 +390,7 @@ def _bigtable_iteritems(self, partitions): f"{self.table_name} _bigtable_iteritems took {end - start}s " f"for partitions {partitions}" ) - except Exception as ex: + except Exception as ex: # pragma: no cover self.log.error( f"FaustBigtableException Error " f"in _iteritems for table {self.table_name}" @@ -429,7 +429,7 @@ def _contains(self, key: bytes) -> bool: if not self.app.conf.store_check_exists: return True return self._get(key) is not None - except Exception as ex: + except Exception as ex: # pragma: no cover self.log.error( f"FaustBigtableException Error in _contains for table " f"{self.table_name} exception " @@ -479,7 +479,7 @@ def set_persisted_offset(self, tp: TP, offset: int) -> None: try: offset_key = self.get_offset_key(tp).encode() self._bigtable_set(offset_key, str(offset).encode()) - except Exception: + except Exception: # pragma: no cover self.log.error( f"Failed to commit offset for {self.table.name}" " -> will cause additional changelogs if restart happens" diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 2a888252a..beaa67898 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -48,10 +48,18 @@ def __init__(self) -> None: self.keys = set() self.add_row_key = MagicMock(wraps=self._add_row_key) self.add_row_range_from_keys = MagicMock(wraps=self._add_row_range_from_keys) + self.add_row_range_with_prefix = MagicMock( + wraps=self._add_row_range_with_prefix + ) def _add_row_key(self, key): self.keys.add(key) + def _add_row_range_with_prefix(self, prefix): + if isinstance(prefix, str): + prefix = prefix.encode() + self._add_row_range_from_keys(prefix, prefix, end_inclusive=True) + def _add_row_range_from_keys( self, start_key: bytes, end_key: bytes, end_inclusive=False ): @@ -785,3 +793,22 @@ def test_bigtable_iteritems_with_global_table(self, store, bt_imports): res = sorted(store._iteritems()) assert res == [(f"key{i}".encode(), str(i).encode()) for i in range(1, 5)] store.bt_table.read_rows.assert_called_once() + + def test_bigtable_iteritems_with_global_table2(self, store, bt_imports): + store.table.is_global = False + store.table.use_partitioner = False + store._mutation_batcher_enable = True + store._mutation_batcher = MagicMock(flush=MagicMock()) + store._active_partitions = MagicMock(return_value={1, 3}) + # Add table to data fro partition 1 to 5 with corresponding offset keys + store.bt_table.data = {} + for i in range(1, 5): + key = store.get_offset_key(TP("topic", i)).encode() + store.bt_table.data[key] = str(i).encode() + tp_key = store._add_partition_prefix_to_key(f"key{i}".encode(), i) + store.bt_table.data[tp_key] = str(i).encode() + + res = sorted(store._iteritems()) + assert res == [(f"key{i}".encode(), str(i).encode()) for i in [1, 3]] + store.bt_table.read_rows.assert_called_once() + store._mutation_batcher.flush.assert_called_once() From 6b5d7090146c1b5b8ac711d16c82e2d40f176d5c Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 14 Nov 2023 15:18:35 +0100 Subject: [PATCH 593/616] removed unused import --- tests/unit/stores/test_bigtable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index beaa67898..7a0160156 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1,5 +1,4 @@ from unittest.mock import MagicMock, call, patch -import time import pytest From 608a6a5246acbf6023bdf444030494560522f560 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Nov 2023 13:38:00 +0100 Subject: [PATCH 594/616] moved log for assigning partitions --- faust/stores/bigtable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0a2770cb6..ac37e121b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -575,13 +575,13 @@ async def assign_partitions( self, table: CollectionT, tps: Set[TP], generation_id: int = 0 ) -> None: # Fill cache with all keys for the partitions we are assigned + partitions = self._get_active_changelogtopic_partitions(table, tps) + self.log.info(f"Assigning partitions {partitions} for {table.name}") if self._startup_cache_enable is False: return - partitions = self._get_active_changelogtopic_partitions(table, tps) if len(partitions) == 0: return - self.log.info(f"Assigning partitions {partitions} for {table.name}") self._fill_caches(partitions) async def on_rebalance( From 3e4ab6568028720153eba25268fc300457fd72d6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Nov 2023 13:41:00 +0100 Subject: [PATCH 595/616] removed unnecesery abort on _get --- faust/stores/bigtable.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ac37e121b..cca84d752 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -287,8 +287,6 @@ def _get(self, key: bytes) -> Optional[bytes]: return None keys = [self._add_partition_prefix_to_key(key, p) for p in partitions] - if len(keys) == 0: - return None value, partition = self._bigtable_get(keys) if value is not None: self._key_index[key] = partition From 8a6cec18c34032e99a142c4ffe2501d361898798 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 15 Nov 2023 13:45:08 +0100 Subject: [PATCH 596/616] fix naming of extract row data function --- faust/stores/bigtable.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index cca84d752..66e3fe3a5 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -228,7 +228,7 @@ def _get_partitions_for_key(self, key: bytes) -> List[int]: return self._get_current_partitions() @staticmethod - def bigtable_exrtact_row_data(row_data): + def bigtable_extract_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _del_cache(self, key: bytes): @@ -270,7 +270,7 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in for row in rows: if row is not None: partition = self._get_partition_from_bigtable_key(row.row_key) - return self.bigtable_exrtact_row_data(row), partition + return self.bigtable_extract_row_data(row), partition return None, None def _get(self, key: bytes) -> Optional[bytes]: @@ -380,7 +380,7 @@ def _bigtable_iteritems(self, partitions): if need_all_keys and offset_key_prefix in row.row_key: continue - value = self.bigtable_exrtact_row_data(row) + value = self.bigtable_extract_row_data(row) key = self._remove_partition_prefix_from_bigtable_key(row.row_key) yield key, value end = time.time() @@ -463,7 +463,7 @@ def persisted_offset(self, tp: TP) -> Optional[int]: if self._mutation_batcher_enable: self._mutation_batcher.flush() row = self.bt_table.read_row(offset_key, filter_=self.row_filter) - offset = self.bigtable_exrtact_row_data(row) if row is not None else None + offset = self.bigtable_extract_row_data(row) if row is not None else None return int(offset) if offset is not None else None def set_persisted_offset(self, tp: TP, offset: int) -> None: From 6079a65d3b920c28d140029c394aedc5bfb0f43f Mon Sep 17 00:00:00 2001 From: jpesenhofer <48673114+jpesenhofer@users.noreply.github.com> Date: Mon, 20 Nov 2023 13:53:44 +0100 Subject: [PATCH 597/616] Update install --- scripts/install | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install b/scripts/install index 3f4390bc4..c0e202e86 100755 --- a/scripts/install +++ b/scripts/install @@ -4,7 +4,7 @@ [ "$1" = "-p" ] && PYTHON=$2 || PYTHON="python3" REQUIREMENTS="requirements/test.txt" -VENV="env-faust" +VENV="venv" set -x From c30e7f5f7502d046bbb1680d6324701beceffbcf Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Mon, 27 Nov 2023 09:41:13 +0100 Subject: [PATCH 598/616] fixed return value and added testcase --- faust/stores/bigtable.py | 3 ++- tests/unit/stores/test_bigtable.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 66e3fe3a5..90bee9334 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -263,7 +263,8 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in rowset = BT.RowSet() for key in keys: if self._mutation_batcher_enable and key in self._mutation_batcher_cache: - return self._mutation_batcher_cache[key] + partition = self._get_partition_from_bigtable_key(key) + return self._mutation_batcher_cache[key], partition rowset.add_row_key(key) rows = self.bt_table.read_rows(row_set=rowset, filter_=self.row_filter) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 7a0160156..a53bde0c0 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -250,11 +250,21 @@ def test_bigtable_get(self, store, bt_imports): for idx, k in enumerate(keys): keys[idx] = store._add_partition_prefix_to_key(k, 0) store.bt_table.add_test_data(keys) + + # Test get from bigtable value, partition = store._bigtable_get([keys[1]]) store.bt_table.read_rows.assert_called_once() assert partition == 0 assert value == keys[1] + # Test get from mutation buffer + store._mutation_batcher_enable = True + store._mutation_batcher_cache = {keys[1]: b"123"} + value, partition = store._bigtable_get([keys[1]]) + store.bt_table.read_rows.assert_called_once() + assert value == b"123" + assert partition == 0 + def test_bigtable_get_on_empty(self, store, bt_imports): return_value = store._bigtable_get([self.TEST_KEY1, self.TEST_KEY2]) store.bt_table.read_rows.assert_called_once() From 5291ac0ee2e06547e0294ed46c3cea34e6aab25e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 30 Nov 2023 11:53:01 +0100 Subject: [PATCH 599/616] fixed bug with mutation batcher in get requests if a delete mutation was found --- faust/stores/bigtable.py | 7 ++++++- tests/unit/stores/test_bigtable.py | 28 +++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 90bee9334..03bbb9f1c 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -264,7 +264,12 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in for key in keys: if self._mutation_batcher_enable and key in self._mutation_batcher_cache: partition = self._get_partition_from_bigtable_key(key) - return self._mutation_batcher_cache[key], partition + value = self._mutation_batcher_cache[key] + if value is not None: + # Since deletes can happen async we need to make sure + # that we don't return a value for a delete that happened on + # another partition + return value, partition rowset.add_row_key(key) rows = self.bt_table.read_rows(row_set=rowset, filter_=self.row_filter) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index a53bde0c0..39de16036 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -248,13 +248,13 @@ def store(self, bt_imports): def test_bigtable_get(self, store, bt_imports): keys = [self.TEST_KEY1, self.TEST_KEY2] for idx, k in enumerate(keys): - keys[idx] = store._add_partition_prefix_to_key(k, 0) + keys[idx] = store._add_partition_prefix_to_key(k, 2) store.bt_table.add_test_data(keys) # Test get from bigtable value, partition = store._bigtable_get([keys[1]]) store.bt_table.read_rows.assert_called_once() - assert partition == 0 + assert partition == 2 assert value == keys[1] # Test get from mutation buffer @@ -263,7 +263,7 @@ def test_bigtable_get(self, store, bt_imports): value, partition = store._bigtable_get([keys[1]]) store.bt_table.read_rows.assert_called_once() assert value == b"123" - assert partition == 0 + assert partition == 2 def test_bigtable_get_on_empty(self, store, bt_imports): return_value = store._bigtable_get([self.TEST_KEY1, self.TEST_KEY2]) @@ -821,3 +821,25 @@ def test_bigtable_iteritems_with_global_table2(self, store, bt_imports): assert res == [(f"key{i}".encode(), str(i).encode()) for i in [1, 3]] store.bt_table.read_rows.assert_called_once() store._mutation_batcher.flush.assert_called_once() + + def test_get_after_delete(self, store, bt_imports): + partitions = [19, 20] + store._get_cache = MagicMock(return_value=(b"this is ignored", False)) + store._key_index = {} + store._get_current_partitions = MagicMock(return_value=partitions) + row_mock = MagicMock() + row_mock.commit = MagicMock() + row_mock.delete = MagicMock() + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store.bt_table.direct_row = MagicMock(return_value=row_mock) + store._mutation_batcher_enable = True + + key_right = b"20_..._" + self.TEST_KEY1 + key_wrong = b"19_..._" + self.TEST_KEY1 + + # This is the case if a delete happened before + store._mutation_batcher_cache = {key_right: b"123", key_wrong: None} + store.bt_table.add_test_data(key_right) + + res = store._get(self.TEST_KEY1) + assert res is not None From 06b39978c35ceb1d0c923bf715fc775aaff7cd4f Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 1 Dec 2023 11:01:39 +0100 Subject: [PATCH 600/616] added partition revoking and log --- faust/stores/bigtable.py | 7 +++++++ tests/unit/stores/test_bigtable.py | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 03bbb9f1c..d2721bc7a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -588,6 +588,12 @@ async def assign_partitions( return self._fill_caches(partitions) + def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: + partitions = self._get_active_changelogtopic_partitions(table, tps) + self._startup_cache_partitions.difference_update(partitions) + # The memory of the startup cache will be freed after the ttl is over + self.log.info(f"Revoking partitions {partitions} for {table.name}") + async def on_rebalance( self, assigned: Set[TP], @@ -604,6 +610,7 @@ async def on_rebalance( for which we were not assigned the last time. generation_id: the metadata generation identifier for the re-balance """ + self.revoke_partitions(self.table, revoked) await self.assign_partitions(self.table, newly_assigned, generation_id) async def stop(self) -> None: diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 39de16036..f06b542fd 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -654,6 +654,8 @@ async def test__get_active_changelogtopic_partitions(self, store): @pytest.mark.asyncio async def test_bigtable_on_rebalance(self, store, bt_imports): store.assign_partitions = MagicMock(wraps=store.assign_partitions) + store.revoke_partitions = MagicMock(wraps=store.revoke_partitions) + tps_table = { "topic1", "topic2", @@ -670,6 +672,7 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): store._startup_cache_enable = False await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=1) store.assign_partitions.assert_called_once_with(store.table, newly_assigned, 1) + store.revoke_partitions.assert_called_once_with(store.table, revoked) store._fill_caches.assert_not_called() newly_assigned = set() @@ -685,6 +688,14 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): store.assign_partitions.assert_called_with(store.table, newly_assigned, 3) store._fill_caches.assert_called_once_with({3, 4}) + def test_revoke_partitions(self, store): + store._startup_cache_partitions = {1, 2, 3} + store._startup_cache = {b"key1": b"value1", b"key2": b"value2"} + revoked = {TP("topic", 1), TP("topic", 2)} + store.table = MagicMock(changelog_topic=MagicMock(topics={"topic"})) + store.revoke_partitions(store.table, revoked) + assert store._startup_cache_partitions == {3} + def test_contains(self, store, bt_imports): store._get = MagicMock(return_value=b"test_value") From 66df010596bab82b2c432eb143d60376a8669772 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 12 Dec 2023 13:17:15 +0100 Subject: [PATCH 601/616] revoke all tps also the ones that are not active This is just for logging for now. The partitions that are revoked don't neet to be active. In order to log it correctly, we need to replace the call to _get_active_changelogtopic_partitions --- faust/stores/bigtable.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index d2721bc7a..0ca8ece2a 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -589,7 +589,10 @@ async def assign_partitions( self._fill_caches(partitions) def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: - partitions = self._get_active_changelogtopic_partitions(table, tps) + partitions = set() + for tp in tps: + if tp.topic in table.changelog_topic.topics: + partitions.add(tp.partition) self._startup_cache_partitions.difference_update(partitions) # The memory of the startup cache will be freed after the ttl is over self.log.info(f"Revoking partitions {partitions} for {table.name}") From 3266ae76cf1cc6203d1695031fe60143019bfcc6 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Fri, 15 Dec 2023 09:40:33 +0100 Subject: [PATCH 602/616] moved logging and added todo --- faust/stores/bigtable.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 0ca8ece2a..31a4622c6 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -580,12 +580,12 @@ async def assign_partitions( ) -> None: # Fill cache with all keys for the partitions we are assigned partitions = self._get_active_changelogtopic_partitions(table, tps) - self.log.info(f"Assigning partitions {partitions} for {table.name}") if self._startup_cache_enable is False: return if len(partitions) == 0: return + self.log.info(f"Assigning partitions {partitions} for {table.name}") self._fill_caches(partitions) def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: @@ -593,8 +593,13 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: for tp in tps: if tp.topic in table.changelog_topic.topics: partitions.add(tp.partition) + + if len(partitions) == 0: + return + self._startup_cache_partitions.difference_update(partitions) # The memory of the startup cache will be freed after the ttl is over + # TODO: Free memory that is not needed instantly self.log.info(f"Revoking partitions {partitions} for {table.name}") async def on_rebalance( From a6e5e0fdf7653a10c99fd1e2ff99cc4fcbde129e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 19 Dec 2023 09:07:09 +0100 Subject: [PATCH 603/616] added option and added tests also set the default value of startup cache ttl to - 1 --- faust/stores/bigtable.py | 18 +++++++++--------- tests/unit/stores/test_bigtable.py | 26 +++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 31a4622c6..6b88abbbc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -127,12 +127,9 @@ def _setup_caches( self._startup_cache = None self._startup_cache_partitions: Set[int] = set() self._startup_cache_ttl = options.get( - BigTableStore.BT_STARTUP_CACHE_TTL_KEY, 30 * 60 + BigTableStore.BT_STARTUP_CACHE_TTL_KEY, -1 ) if self._startup_cache_enable: - if self._startup_cache_ttl <= 0: - return - self._startup_cache: Dict[bytes, bytes] = {} self._invalidation_timer: Optional[threading.Timer] = None @@ -553,16 +550,19 @@ def _fill_caches(self, partitions): self._set_cache(k, v) self._startup_cache_partitions |= set(partitions) - # Invalidate startup cache after 30 minutes + # Invalidate startup cache after self._startup_cache_ttl # or reset the timer if already running if self._invalidation_timer is not None: self._invalidation_timer.cancel() del self._invalidation_timer self._invalidation_timer = None - self._invalidation_timer = threading.Timer( - self._startup_cache_ttl, self._invalidate_startup_cache - ) - self._invalidation_timer.start() + + if self._startup_cache_ttl > 0: + # if _startup_cache_ttl < 0 keep cache forever + self._invalidation_timer = threading.Timer( + self._startup_cache_ttl, self._invalidate_startup_cache + ) + self._invalidation_timer.start() def _get_active_changelogtopic_partitions( self, table: CollectionT, tps: Set[TP] diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index f06b542fd..97790e6d5 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -636,6 +636,30 @@ async def test_fill_caches(self, store, bt_imports): assert store._startup_cache_partitions == set() assert store._invalidation_timer is None + @pytest.mark.asyncio + async def test_fill_caches_no_ttl(self, store, bt_imports): + store._bigtable_iteritems = MagicMock( + return_value=[(b"key1", b"value1"), (b"key2", b"value2")] + ) + store._set_cache = MagicMock() + store._startup_cache_ttl = 0 + store._invalidation_timer = None + store._startup_cache_partitions = set() + store._startup_cache = {} + + partitions = {TP("topic", 0), TP("topic", 1)} + partitions2 = {TP("topic", 0), TP("topic", 2)} + + store._fill_caches(partitions) + + assert store._bigtable_iteritems.call_args == call(partitions=partitions) + assert store._set_cache.call_args_list == [ + call(b"key1", b"value1"), + call(b"key2", b"value2"), + ] + assert store._startup_cache_partitions == partitions + assert store._invalidation_timer is None + @pytest.mark.asyncio async def test__get_active_changelogtopic_partitions(self, store): tps_table = { @@ -729,7 +753,7 @@ def test_setup_caches_startup_cache_disable(self, store): } store._setup_caches(options=options) assert store._startup_cache_enable is False - assert store._startup_cache_ttl == 30 * 60 # Default value + assert store._startup_cache_ttl == -1 # Default value assert store._startup_cache is None assert store._startup_cache_partitions == set() assert store._startup_cache_enable is False From 4f506762855ab45b5606f1d9d6817d477408d2d7 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 19 Dec 2023 13:31:30 +0100 Subject: [PATCH 604/616] updated unit tests and bigtable store This update enhances the startup cache with partitions. It means, that now with every access to the cache a partition needs to be specified. This change enables, that the additional memory is freed faster on a rebalance and that additional partitions are not assigned redundantly --- faust/stores/bigtable.py | 64 ++++++++++-------- tests/unit/stores/test_bigtable.py | 100 ++++++++++++++--------------- 2 files changed, 85 insertions(+), 79 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6b88abbbc..9f02df13f 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -125,12 +125,11 @@ def _setup_caches( ) self._startup_cache = None - self._startup_cache_partitions: Set[int] = set() self._startup_cache_ttl = options.get( BigTableStore.BT_STARTUP_CACHE_TTL_KEY, -1 ) if self._startup_cache_enable: - self._startup_cache: Dict[bytes, bytes] = {} + self._startup_cache: Dict[int, Dict[bytes, bytes]] = {} self._invalidation_timer: Optional[threading.Timer] = None def _set_options(self, options) -> None: @@ -230,22 +229,26 @@ def bigtable_extract_row_data(row_data): def _del_cache(self, key: bytes): if self._startup_cache is not None: - self._startup_cache[key] = None + for partition in self._startup_cache: + self._startup_cache[partition][key] = None - def _set_cache(self, key: bytes, value): + def _set_cache(self, partition: int, key: bytes, value): if self._startup_cache is not None: - self._startup_cache[key] = value + self._startup_cache[partition][key] = value - def _get_cache(self, key: bytes): + def _get_cache(self, partition: int, key: bytes): if self._startup_cache_enable and self._startup_cache is not None: - if key in self._startup_cache: - return self._startup_cache[key], True + if partition not in self._startup_cache: + return None, False + if key in self._startup_cache[partition]: + return self._startup_cache[partition][key], True return None, False def _invalidate_startup_cache(self): if self._startup_cache is not None: + for partition in self._startup_cache: + self._startup_cache[partition].clear() self._startup_cache.clear() - self._startup_cache_partitions = set() gc.collect() self.log.info(f"Invalidated startup cache for table {self.table_name}") self._invalidation_timer.cancel() @@ -278,13 +281,13 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in def _get(self, key: bytes) -> Optional[bytes]: try: - value, found = self._get_cache(key) - if found: - return value - partitions = set(self._get_partitions_for_key(key)) + for partition in partitions: + value, found = self._get_cache(partition, key) + if found: + return value # Remove partitions that we already have in cache - partitions.difference_update(self._startup_cache_partitions) + partitions.difference_update(self._startup_cache or {}) # Nothing todo if len(partitions) == 0: return None @@ -315,11 +318,10 @@ def _bigtable_set(self, key: bytes, value: bytes): def _set(self, key: bytes, value: bytes) -> None: try: - self._set_cache(key, value) - event = current_event() assert event is not None partition = event.message.partition + self._set_cache(partition, key, value) key = self._add_partition_prefix_to_key(key, partition) self._bigtable_set(key, value) @@ -404,11 +406,13 @@ def _iteritems( ) -> Iterator[Tuple[bytes, bytes]]: if self._startup_cache is not None: if partitions is None: - partitions = set(self._active_partitions()) - for k, v in self._startup_cache.items(): - if v is not None: - yield k, v - partitions.difference_update(self._startup_cache_partitions) + active_partitions = set(self._active_partitions()) + cache_partitions = active_partitions.intersection(self._startup_cache) + for p in cache_partitions: + for k, v in self._startup_cache[p].items(): + if v is not None: + yield k, v + partitions = list(active_partitions.difference(cache_partitions)) if partitions is None or len(partitions) > 0: yield from self._bigtable_iteritems(partitions) @@ -515,7 +519,7 @@ def apply_changelog_batch( self._del_cache(msg.key) self._bigtable_del(bt_key) else: - self._set_cache(msg.key, msg.value) + self._set_cache(msg.partition, msg.key, msg.value) self._bigtable_set(bt_key, msg.value) for tp, offset in tp_offsets.items(): @@ -546,10 +550,13 @@ def restore_backup( raise NotImplementedError("Not yet implemented for Bigtable.") def _fill_caches(self, partitions): - for k, v in self._bigtable_iteritems(partitions=partitions): - self._set_cache(k, v) + partitions.difference_update(self._startup_cache) + for partition in partitions: + if partition not in self._startup_cache: + self._startup_cache[partition] = {} + for k, v in self._bigtable_iteritems(partitions={partition}): + self._set_cache(partition, k, v) - self._startup_cache_partitions |= set(partitions) # Invalidate startup cache after self._startup_cache_ttl # or reset the timer if already running if self._invalidation_timer is not None: @@ -597,9 +604,10 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: if len(partitions) == 0: return - self._startup_cache_partitions.difference_update(partitions) - # The memory of the startup cache will be freed after the ttl is over - # TODO: Free memory that is not needed instantly + for partition in partitions.intersection(self._startup_cache or {}): + if partition in self._startup_cache: + self._startup_cache[partition].clear() + del self._startup_cache[partition] self.log.info(f"Revoking partitions {partitions} for {table.name}") async def on_rebalance( diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 97790e6d5..8f7a797b5 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -400,11 +400,10 @@ def test_get_with_known_partition(self, store): assert res is None # Scenario: Cache miss, but partition should be in startup cache - store._startup_cache_partitions = {19, 20} store._get_cache = MagicMock(return_value=(None, False)) store._bigtable_get = MagicMock(return_value=(None, None)) res = store._get(self.TEST_KEY2) - store._bigtable_get.assert_not_called() + store._bigtable_get.assert_called_once() assert res is None def test_set(self, store): @@ -427,7 +426,7 @@ def test_set(self, store): store._set(self.TEST_KEY1, b"a_value") key = store._add_partition_prefix_to_key(self.TEST_KEY1, 69) - store._set_cache.assert_called_with(self.TEST_KEY1, b"a_value") + store._set_cache.assert_called_with(69, self.TEST_KEY1, b"a_value") store._bigtable_set.assert_called_once_with(key, b"a_value") def test_del(self, store): @@ -478,12 +477,12 @@ def test_iteritems(self, store): def test_iteritems_with_startup_cache(self, store, bt_imports): store._active_partitions = MagicMock(return_value=[1, 3]) - store._startup_cache = { + store._startup_cache = {} + store._startup_cache[1] = { self.TEST_KEY1: b"this is a value", self.TEST_KEY2: b"this is another value", - b"Dont return this": None, + b"Dont return this, because this is a offset key": None, } - store._startup_cache_partitions = [1] store._bigtable_iteritems = MagicMock(wraps=store._bigtable_iteritems) store.bt_table.read_rows = MagicMock( @@ -501,7 +500,7 @@ def test_iteritems_with_startup_cache(self, store, bt_imports): ] ) res = sorted(store._iteritems()) - store._bigtable_iteritems.assert_called_once_with({3}) + store._bigtable_iteritems.assert_called_once_with([3]) all_entries = { self.TEST_KEY1: b"this is a value", self.TEST_KEY2: b"this is another value", @@ -582,25 +581,26 @@ def __init__(self, message): @pytest.mark.asyncio async def test_fill_caches(self, store, bt_imports): store._bigtable_iteritems = MagicMock( - return_value=[(b"key1", b"value1"), (b"key2", b"value2")] + side_effect=[[(b"key1", b"value1")], [(b"key2", b"value2")]] ) store._set_cache = MagicMock() store._startup_cache_ttl = 1800 store._invalidation_timer = None - store._startup_cache_partitions = set() store._startup_cache = {} - partitions = {TP("topic", 0), TP("topic", 1)} - partitions2 = {TP("topic", 0), TP("topic", 2)} + partitions = {0, 1} + partitions2 = {0, 2} store._fill_caches(partitions) + calls = [call(partitions={p}) for p in partitions] + store._bigtable_iteritems.assert_has_calls(calls) - assert store._bigtable_iteritems.call_args == call(partitions=partitions) - assert store._set_cache.call_args_list == [ - call(b"key1", b"value1"), - call(b"key2", b"value2"), - ] - assert store._startup_cache_partitions == partitions + store._set_cache.assert_has_calls( + [ + call(0, b"key1", b"value1"), + call(1, b"key2", b"value2"), + ] + ) assert store._invalidation_timer is not None assert store._invalidation_timer.is_alive() @@ -609,7 +609,7 @@ async def test_fill_caches(self, store, bt_imports): old_invalid_timer = store._invalidation_timer.__hash__() store._bigtable_iteritems = MagicMock( - return_value=[(b"key3", b"value3"), (b"key4", b"value4")] + side_effect=[[(b"key3", b"value3")], [(b"key4", b"value4")]] ) store._set_cache = MagicMock() store._fill_caches(partitions2) @@ -619,12 +619,16 @@ async def test_fill_caches(self, store, bt_imports): assert store._invalidation_timer is not None assert store._invalidation_timer.is_alive() - assert store._bigtable_iteritems.call_args == call(partitions=partitions2) - assert store._set_cache.call_args_list == [ - call(b"key3", b"value3"), - call(b"key4", b"value4"), - ] - assert store._startup_cache_partitions == partitions | partitions2 + store._bigtable_iteritems.assert_called_with(partitions={2}) + assert store._bigtable_iteritems.call_count == 1 + store._set_cache.assert_has_calls( + [ + # Key 4 is ignored because it should already be loaded. + # Because in our scenario the second key is never returned + # call(0, b"key4", b"value4"), + call(2, b"key3", b"value3"), + ] + ) assert store._invalidation_timer is not None assert store._invalidation_timer.is_alive() @@ -633,31 +637,32 @@ async def test_fill_caches(self, store, bt_imports): store._invalidate_startup_cache() assert store._startup_cache == {} - assert store._startup_cache_partitions == set() assert store._invalidation_timer is None @pytest.mark.asyncio async def test_fill_caches_no_ttl(self, store, bt_imports): store._bigtable_iteritems = MagicMock( - return_value=[(b"key1", b"value1"), (b"key2", b"value2")] + side_effect=[[(b"key1", b"value1")], [(b"key2", b"value2")]] ) store._set_cache = MagicMock() store._startup_cache_ttl = 0 store._invalidation_timer = None - store._startup_cache_partitions = set() store._startup_cache = {} - partitions = {TP("topic", 0), TP("topic", 1)} - partitions2 = {TP("topic", 0), TP("topic", 2)} + partitions = {0, 1} store._fill_caches(partitions) - assert store._bigtable_iteritems.call_args == call(partitions=partitions) + store._set_cache.assert_has_calls( + [ + call(0, b"key1", b"value1"), + call(1, b"key2", b"value2"), + ] + ) assert store._set_cache.call_args_list == [ - call(b"key1", b"value1"), - call(b"key2", b"value2"), + call(0, b"key1", b"value1"), + call(1, b"key2", b"value2"), ] - assert store._startup_cache_partitions == partitions assert store._invalidation_timer is None @pytest.mark.asyncio @@ -713,12 +718,10 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): store._fill_caches.assert_called_once_with({3, 4}) def test_revoke_partitions(self, store): - store._startup_cache_partitions = {1, 2, 3} store._startup_cache = {b"key1": b"value1", b"key2": b"value2"} revoked = {TP("topic", 1), TP("topic", 2)} store.table = MagicMock(changelog_topic=MagicMock(topics={"topic"})) store.revoke_partitions(store.table, revoked) - assert store._startup_cache_partitions == {3} def test_contains(self, store, bt_imports): store._get = MagicMock(return_value=b"test_value") @@ -744,7 +747,6 @@ def test_setup_caches_startup_cache_enable(self, store): assert store._startup_cache_enable is True assert store._startup_cache_ttl == 60 assert isinstance(store._startup_cache, dict) - assert isinstance(store._startup_cache_partitions, set) assert store._invalidation_timer is None def test_setup_caches_startup_cache_disable(self, store): @@ -755,42 +757,38 @@ def test_setup_caches_startup_cache_disable(self, store): assert store._startup_cache_enable is False assert store._startup_cache_ttl == -1 # Default value assert store._startup_cache is None - assert store._startup_cache_partitions == set() assert store._startup_cache_enable is False def test_set_del_get_cache(self, store): store._startup_cache_enable = False store._startup_cache = None - store._startup_cache_partitions = set() + partition = 1 key = self.TEST_KEY1 - store._set_cache(key, b"123") - res = store._get_cache(key) + store._set_cache(partition, key, b"123") + res = store._get_cache(partition, key) assert store._startup_cache is None - assert store._startup_cache_partitions == set() assert res == (None, False) store._del_cache(key) - res = store._get_cache(key) + res = store._get_cache(partition, key) assert res == (None, False) assert store._startup_cache is None - assert store._startup_cache_partitions == set() # Now with enabled startup cache store._startup_cache_enable = True store._startup_cache = {} - store._startup_cache_partitions = {1, 2} + store._startup_cache[partition] = {} - store._set_cache(key, b"123") - res = store._get_cache(key) - assert store._startup_cache == {key: b"123"} - assert store._startup_cache_partitions == {1, 2} + store._set_cache(partition, key, b"123") + res = store._get_cache(partition, key) + assert partition in store._startup_cache + assert store._startup_cache[partition] == {key: b"123"} assert res == (b"123", True) store._del_cache(key) - res = store._get_cache(key) - assert store._startup_cache == {key: None} - assert store._startup_cache_partitions == {1, 2} + res = store._get_cache(partition, key) + assert store._startup_cache[partition] == {key: None} assert res == (None, True) def test_persisted_offset(self, store): From 32dc779f2be2ba4a51ff289293f41026685a8518 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 19 Dec 2023 14:29:03 +0100 Subject: [PATCH 605/616] added some logs --- faust/stores/bigtable.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 9f02df13f..42646d94b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -587,12 +587,9 @@ async def assign_partitions( ) -> None: # Fill cache with all keys for the partitions we are assigned partitions = self._get_active_changelogtopic_partitions(table, tps) - if self._startup_cache_enable is False: - return - - if len(partitions) == 0: - return self.log.info(f"Assigning partitions {partitions} for {table.name}") + if len(partitions) == 0 or self._startup_cache_enable is False: + return self._fill_caches(partitions) def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: @@ -601,7 +598,7 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: if tp.topic in table.changelog_topic.topics: partitions.add(tp.partition) - if len(partitions) == 0: + if len(partitions) == 0 or self._startup_cache_enable is False: return for partition in partitions.intersection(self._startup_cache or {}): @@ -626,8 +623,10 @@ async def on_rebalance( for which we were not assigned the last time. generation_id: the metadata generation identifier for the re-balance """ - self.revoke_partitions(self.table, revoked) - await self.assign_partitions(self.table, newly_assigned, generation_id) + if len(revoked) > 0: + self.revoke_partitions(self.table, revoked) + if len(assigned) > 0: + await self.assign_partitions(self.table, newly_assigned, generation_id) async def stop(self) -> None: if self._mutation_batcher_enable: From d8b2cce436b4365d82b5b407e70d79101b24d999 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 19 Dec 2023 14:29:59 +0100 Subject: [PATCH 606/616] added gc collect --- faust/stores/bigtable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 42646d94b..6a904587d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -605,6 +605,7 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: if partition in self._startup_cache: self._startup_cache[partition].clear() del self._startup_cache[partition] + gc.collect() self.log.info(f"Revoking partitions {partitions} for {table.name}") async def on_rebalance( From 4b7384b5bd85a99803c50a61d70b62b8e583ab6b Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 19 Dec 2023 14:49:44 +0100 Subject: [PATCH 607/616] introduced locking on rebalance and fixed unit tests --- faust/stores/bigtable.py | 33 ++++++++++++++++++------------ tests/unit/stores/test_bigtable.py | 20 ++++++++++++++---- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6a904587d..941fbe73d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,4 +1,5 @@ """BigTable storage.""" +import asyncio import gc import logging import time @@ -82,6 +83,8 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) + self.db_lock = asyncio.Lock() + self.rebalance_ack = False try: self._setup_bigtable(table, options) self._setup_caches(options) @@ -549,13 +552,12 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") - def _fill_caches(self, partitions): - partitions.difference_update(self._startup_cache) - for partition in partitions: - if partition not in self._startup_cache: - self._startup_cache[partition] = {} - for k, v in self._bigtable_iteritems(partitions={partition}): - self._set_cache(partition, k, v) + def _fill_caches(self, partition): + if partition in self._startup_cache: + return + self._startup_cache[partition] = {} + for k, v in self._bigtable_iteritems(partitions={partition}): + self._set_cache(partition, k, v) # Invalidate startup cache after self._startup_cache_ttl # or reset the timer if already running @@ -578,19 +580,22 @@ def _get_active_changelogtopic_partitions( standby_tps = self.app.assignor.assigned_standbys() my_topics = table.changelog_topic.topics for tp in tps: - if tp.topic in my_topics and tp not in standby_tps: + if tp.topic in my_topics and tp not in standby_tps and self.rebalance_ack: partitions.add(tp.partition) return partitions async def assign_partitions( self, table: CollectionT, tps: Set[TP], generation_id: int = 0 ) -> None: + self.rebalance_ack = True # Fill cache with all keys for the partitions we are assigned partitions = self._get_active_changelogtopic_partitions(table, tps) self.log.info(f"Assigning partitions {partitions} for {table.name}") if len(partitions) == 0 or self._startup_cache_enable is False: return - self._fill_caches(partitions) + for partition in partitions: + self._fill_caches(partition) + await asyncio.sleep(0) def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: partitions = set() @@ -624,10 +629,12 @@ async def on_rebalance( for which we were not assigned the last time. generation_id: the metadata generation identifier for the re-balance """ - if len(revoked) > 0: - self.revoke_partitions(self.table, revoked) - if len(assigned) > 0: - await self.assign_partitions(self.table, newly_assigned, generation_id) + self.rebalance_ack = False + async with self.db_lock: + if len(revoked) > 0: + self.revoke_partitions(self.table, revoked) + if len(assigned) > 0: + await self.assign_partitions(self.table, newly_assigned, generation_id) async def stop(self) -> None: if self._mutation_batcher_enable: diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 8f7a797b5..af6eaaff5 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -591,7 +591,8 @@ async def test_fill_caches(self, store, bt_imports): partitions = {0, 1} partitions2 = {0, 2} - store._fill_caches(partitions) + for partition in partitions: + store._fill_caches(partition) calls = [call(partitions={p}) for p in partitions] store._bigtable_iteritems.assert_has_calls(calls) @@ -612,7 +613,8 @@ async def test_fill_caches(self, store, bt_imports): side_effect=[[(b"key3", b"value3")], [(b"key4", b"value4")]] ) store._set_cache = MagicMock() - store._fill_caches(partitions2) + for p in partitions2: + store._fill_caches(p) new_invalid_timer = store._invalidation_timer.__hash__() # Check if old invalidation timer is different from new one assert old_invalid_timer != new_invalid_timer @@ -651,7 +653,8 @@ async def test_fill_caches_no_ttl(self, store, bt_imports): partitions = {0, 1} - store._fill_caches(partitions) + for p in partitions: + store._fill_caches(p) store._set_cache.assert_has_calls( [ @@ -675,6 +678,15 @@ async def test__get_active_changelogtopic_partitions(self, store): store.table = MagicMock(changelog_topic=MagicMock(topics=tps_table)) tps = {TP("changelog_topic", 0), TP("other_topic", 1)} + # Scenario 1: With no rebalance_ack + store.rebalance_ack = False + active_partitions = store._get_active_changelogtopic_partitions( + store.table, tps + ) + assert active_partitions == set() + + # Scenario 2: With no rebalance_ack + store.rebalance_ack = True active_partitions = store._get_active_changelogtopic_partitions( store.table, tps ) @@ -715,7 +727,7 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): newly_assigned = {TP("topic4", 3), TP("topic5", 4)} await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=3) store.assign_partitions.assert_called_with(store.table, newly_assigned, 3) - store._fill_caches.assert_called_once_with({3, 4}) + store._fill_caches.assert_has_calls([call(3), call(4)]) def test_revoke_partitions(self, store): store._startup_cache = {b"key1": b"value1", b"key2": b"value2"} From 199949c4476c3adbab4c0ce594d37370610b7a56 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 19 Dec 2023 16:25:11 +0100 Subject: [PATCH 608/616] fill startup cache after recovery This ensures that partition assignments are fast adn recovery loops are not possible Also the startup cache is removed from the apply_changelog_batch function --- faust/stores/bigtable.py | 20 ++++++++++++++------ tests/unit/stores/test_bigtable.py | 23 +++++++++++++++++++++-- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 941fbe73d..1a100e3af 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -519,10 +519,8 @@ def apply_changelog_batch( bt_key = self._add_partition_prefix_to_key(msg.key, msg.partition) if msg.value is None: - self._del_cache(msg.key) self._bigtable_del(bt_key) else: - self._set_cache(msg.partition, msg.key, msg.value) self._bigtable_set(bt_key, msg.value) for tp, offset in tp_offsets.items(): @@ -593,9 +591,9 @@ async def assign_partitions( self.log.info(f"Assigning partitions {partitions} for {table.name}") if len(partitions) == 0 or self._startup_cache_enable is False: return + for partition in partitions: - self._fill_caches(partition) - await asyncio.sleep(0) + self._startup_cache[partition] = {} def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: partitions = set() @@ -603,6 +601,7 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: if tp.topic in table.changelog_topic.topics: partitions.add(tp.partition) + self.log.info(f"Revoking partitions {partitions} for {table.name}") if len(partitions) == 0 or self._startup_cache_enable is False: return @@ -611,7 +610,6 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: self._startup_cache[partition].clear() del self._startup_cache[partition] gc.collect() - self.log.info(f"Revoking partitions {partitions} for {table.name}") async def on_rebalance( self, @@ -633,9 +631,19 @@ async def on_rebalance( async with self.db_lock: if len(revoked) > 0: self.revoke_partitions(self.table, revoked) - if len(assigned) > 0: + if len(newly_assigned) > 0: await self.assign_partitions(self.table, newly_assigned, generation_id) + async def on_recovery_completed( + self, active_tps: Set[TP], standby_tps: Set[TP] + ) -> None: + """Signal that table recovery completed.""" + partitions = {tp.partition for tp in active_tps} + for p in partitions: + # This also flushes to bigtable + self._fill_caches(p) + await asyncio.sleep(0) + async def stop(self) -> None: if self._mutation_batcher_enable: self.log.info("Flushing to bigtable on stop") diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index af6eaaff5..50f37bbf3 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -711,6 +711,7 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): revoked = {TP("topic3", 2)} newly_assigned = {TP("topic4", 3), TP("topic5", 4)} store._startup_cache_enable = False + store._startup_cache = None await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=1) store.assign_partitions.assert_called_once_with(store.table, newly_assigned, 1) store.revoke_partitions.assert_called_once_with(store.table, revoked) @@ -719,15 +720,18 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): # Test with empty newly_assigned store._startup_cache_enable = True + store._startup_cache = {} + store.assign_partitions.reset_mock() await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=2) - store.assign_partitions.assert_called_with(store.table, newly_assigned, 2) + store.assign_partitions.assert_not_called() store._fill_caches.assert_not_called() store._startup_cache_enable = True + store._startup_cache = {} newly_assigned = {TP("topic4", 3), TP("topic5", 4)} await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=3) store.assign_partitions.assert_called_with(store.table, newly_assigned, 3) - store._fill_caches.assert_has_calls([call(3), call(4)]) + assert set(store._startup_cache.keys()) == {3, 4} def test_revoke_partitions(self, store): store._startup_cache = {b"key1": b"value1", b"key2": b"value2"} @@ -888,3 +892,18 @@ def test_get_after_delete(self, store, bt_imports): res = store._get(self.TEST_KEY1) assert res is not None + + @pytest.mark.asyncio + async def test_on_recovery_completed(self, store, bt_imports): + store._mutation_batcher_enable = True + store._mutation_batcher = MagicMock(flush=MagicMock()) + + store._startup_cache_enable = True + store._invalidation_timer = None + store._startup_cache = {} + + active_tps = {TP("topic4", 3), TP("topic5", 4)} + standby_tps = {} + await store.on_recovery_completed(active_tps, standby_tps) + assert store._mutation_batcher.flush.call_count == 2 # once for every partition + assert set(store._startup_cache.keys()) == {3, 4} From 2e7730ed016cbfa27c9504b2dccac87160068dfd Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 19 Dec 2023 17:49:28 +0100 Subject: [PATCH 609/616] made some adjustments to the startup cache and fixed tests accordingly --- faust/stores/bigtable.py | 60 +++++++++++++++++------------- tests/unit/stores/test_bigtable.py | 9 ++++- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 1a100e3af..94cd2e6ef 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -127,11 +127,11 @@ def _setup_caches( BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY, False ) - self._startup_cache = None self._startup_cache_ttl = options.get( BigTableStore.BT_STARTUP_CACHE_TTL_KEY, -1 ) if self._startup_cache_enable: + self._startup_cache_partitions = set() self._startup_cache: Dict[int, Dict[bytes, bytes]] = {} self._invalidation_timer: Optional[threading.Timer] = None @@ -231,20 +231,25 @@ def bigtable_extract_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _del_cache(self, key: bytes): - if self._startup_cache is not None: - for partition in self._startup_cache: - self._startup_cache[partition][key] = None + if not self._startup_cache_enable: + return + + for partition in self._startup_cache_partitions: + self._startup_cache[partition][key] = None def _set_cache(self, partition: int, key: bytes, value): - if self._startup_cache is not None: - self._startup_cache[partition][key] = value + if not self._startup_cache_enable: + return + self._startup_cache[partition][key] = value def _get_cache(self, partition: int, key: bytes): - if self._startup_cache_enable and self._startup_cache is not None: - if partition not in self._startup_cache: - return None, False - if key in self._startup_cache[partition]: - return self._startup_cache[partition][key], True + if not self._startup_cache_enable: + return None, False + + if partition not in self._startup_cache_partitions: + return None, False + if key in self._startup_cache[partition]: + return self._startup_cache[partition][key], True return None, False def _invalidate_startup_cache(self): @@ -252,6 +257,7 @@ def _invalidate_startup_cache(self): for partition in self._startup_cache: self._startup_cache[partition].clear() self._startup_cache.clear() + self._startup_cache_partitions = set() gc.collect() self.log.info(f"Invalidated startup cache for table {self.table_name}") self._invalidation_timer.cancel() @@ -288,10 +294,10 @@ def _get(self, key: bytes) -> Optional[bytes]: for partition in partitions: value, found = self._get_cache(partition, key) if found: + # We only trust the value cache if we + # find something return value - # Remove partitions that we already have in cache - partitions.difference_update(self._startup_cache or {}) - # Nothing todo + if len(partitions) == 0: return None @@ -407,15 +413,16 @@ def _bigtable_iteritems(self, partitions): def _iteritems( self, partitions: Optional[List[int]] = None ) -> Iterator[Tuple[bytes, bytes]]: - if self._startup_cache is not None: - if partitions is None: - active_partitions = set(self._active_partitions()) - cache_partitions = active_partitions.intersection(self._startup_cache) - for p in cache_partitions: - for k, v in self._startup_cache[p].items(): - if v is not None: - yield k, v - partitions = list(active_partitions.difference(cache_partitions)) + if self._startup_cache_enable and partitions is None: + active_partitions = set(self._active_partitions()) + cache_partitions = active_partitions.intersection( + self._startup_cache_partitions + ) + for p in cache_partitions: + for k, v in self._startup_cache[p].items(): + if v is not None: + yield k, v + partitions = list(active_partitions.difference(cache_partitions)) if partitions is None or len(partitions) > 0: yield from self._bigtable_iteritems(partitions) @@ -553,6 +560,7 @@ def restore_backup( def _fill_caches(self, partition): if partition in self._startup_cache: return + self._startup_cache[partition] = {} for k, v in self._bigtable_iteritems(partitions={partition}): self._set_cache(partition, k, v) @@ -592,8 +600,7 @@ async def assign_partitions( if len(partitions) == 0 or self._startup_cache_enable is False: return - for partition in partitions: - self._startup_cache[partition] = {} + self._startup_cache_partitions |= partitions def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: partitions = set() @@ -605,10 +612,11 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: if len(partitions) == 0 or self._startup_cache_enable is False: return - for partition in partitions.intersection(self._startup_cache or {}): + for partition in partitions.intersection(self._startup_cache_partitions): if partition in self._startup_cache: self._startup_cache[partition].clear() del self._startup_cache[partition] + self._startup_cache_partitions = set(self._startup_cache.keys()) gc.collect() async def on_rebalance( diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 50f37bbf3..23f371ae2 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -478,6 +478,8 @@ def test_iteritems(self, store): def test_iteritems_with_startup_cache(self, store, bt_imports): store._active_partitions = MagicMock(return_value=[1, 3]) store._startup_cache = {} + store._startup_cache_enable = True + store._startup_cache_partitions = {1} store._startup_cache[1] = { self.TEST_KEY1: b"this is a value", self.TEST_KEY2: b"this is another value", @@ -720,6 +722,7 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): # Test with empty newly_assigned store._startup_cache_enable = True + store._startup_cache_partitions = {} store._startup_cache = {} store.assign_partitions.reset_mock() await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=2) @@ -731,7 +734,8 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): newly_assigned = {TP("topic4", 3), TP("topic5", 4)} await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=3) store.assign_partitions.assert_called_with(store.table, newly_assigned, 3) - assert set(store._startup_cache.keys()) == {3, 4} + assert set(store._startup_cache.keys()) == set() + assert store._startup_cache_partitions == {3, 4} def test_revoke_partitions(self, store): store._startup_cache = {b"key1": b"value1", b"key2": b"value2"} @@ -772,7 +776,7 @@ def test_setup_caches_startup_cache_disable(self, store): store._setup_caches(options=options) assert store._startup_cache_enable is False assert store._startup_cache_ttl == -1 # Default value - assert store._startup_cache is None + assert hasattr(store, "_startup_cache") is False assert store._startup_cache_enable is False def test_set_del_get_cache(self, store): @@ -794,6 +798,7 @@ def test_set_del_get_cache(self, store): # Now with enabled startup cache store._startup_cache_enable = True + store._startup_cache_partitions = {partition} store._startup_cache = {} store._startup_cache[partition] = {} From 7b68855fc9c30163b34e02c2ae80443c836d90f8 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Tue, 19 Dec 2023 18:23:02 +0100 Subject: [PATCH 610/616] adjusted some more stuff --- faust/stores/bigtable.py | 34 +++++++++--------------------- tests/unit/stores/test_bigtable.py | 28 +++++++++++------------- 2 files changed, 22 insertions(+), 40 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 94cd2e6ef..ecddba188 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -131,7 +131,6 @@ def _setup_caches( BigTableStore.BT_STARTUP_CACHE_TTL_KEY, -1 ) if self._startup_cache_enable: - self._startup_cache_partitions = set() self._startup_cache: Dict[int, Dict[bytes, bytes]] = {} self._invalidation_timer: Optional[threading.Timer] = None @@ -234,7 +233,7 @@ def _del_cache(self, key: bytes): if not self._startup_cache_enable: return - for partition in self._startup_cache_partitions: + for partition in self._startup_cache: self._startup_cache[partition][key] = None def _set_cache(self, partition: int, key: bytes, value): @@ -246,7 +245,7 @@ def _get_cache(self, partition: int, key: bytes): if not self._startup_cache_enable: return None, False - if partition not in self._startup_cache_partitions: + if partition not in self._startup_cache: return None, False if key in self._startup_cache[partition]: return self._startup_cache[partition][key], True @@ -257,7 +256,6 @@ def _invalidate_startup_cache(self): for partition in self._startup_cache: self._startup_cache[partition].clear() self._startup_cache.clear() - self._startup_cache_partitions = set() gc.collect() self.log.info(f"Invalidated startup cache for table {self.table_name}") self._invalidation_timer.cancel() @@ -368,7 +366,6 @@ def _del(self, key: bytes) -> None: def _bigtable_iteritems(self, partitions): try: - start = time.time() if partitions is None: partitions = self._active_partitions() row_set = BT.RowSet() @@ -397,11 +394,6 @@ def _bigtable_iteritems(self, partitions): value = self.bigtable_extract_row_data(row) key = self._remove_partition_prefix_from_bigtable_key(row.row_key) yield key, value - end = time.time() - self.log.info( - f"{self.table_name} _bigtable_iteritems took {end - start}s " - f"for partitions {partitions}" - ) except Exception as ex: # pragma: no cover self.log.error( f"FaustBigtableException Error " @@ -415,9 +407,7 @@ def _iteritems( ) -> Iterator[Tuple[bytes, bytes]]: if self._startup_cache_enable and partitions is None: active_partitions = set(self._active_partitions()) - cache_partitions = active_partitions.intersection( - self._startup_cache_partitions - ) + cache_partitions = active_partitions.intersection(self._startup_cache) for p in cache_partitions: for k, v in self._startup_cache[p].items(): if v is not None: @@ -558,10 +548,8 @@ def restore_backup( raise NotImplementedError("Not yet implemented for Bigtable.") def _fill_caches(self, partition): - if partition in self._startup_cache: - return - - self._startup_cache[partition] = {} + if partition not in self._startup_cache: + self._startup_cache[partition] = {} for k, v in self._bigtable_iteritems(partitions={partition}): self._set_cache(partition, k, v) @@ -597,10 +585,6 @@ async def assign_partitions( # Fill cache with all keys for the partitions we are assigned partitions = self._get_active_changelogtopic_partitions(table, tps) self.log.info(f"Assigning partitions {partitions} for {table.name}") - if len(partitions) == 0 or self._startup_cache_enable is False: - return - - self._startup_cache_partitions |= partitions def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: partitions = set() @@ -612,11 +596,9 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: if len(partitions) == 0 or self._startup_cache_enable is False: return - for partition in partitions.intersection(self._startup_cache_partitions): + for partition in partitions: if partition in self._startup_cache: - self._startup_cache[partition].clear() del self._startup_cache[partition] - self._startup_cache_partitions = set(self._startup_cache.keys()) gc.collect() async def on_rebalance( @@ -647,10 +629,14 @@ async def on_recovery_completed( ) -> None: """Signal that table recovery completed.""" partitions = {tp.partition for tp in active_tps} + if not self._startup_cache_enable: + return + self.log.info(f"Recovery: Filling caches with {partitions=}") for p in partitions: # This also flushes to bigtable self._fill_caches(p) await asyncio.sleep(0) + self.log.info(f"Recovery Completed. Filling caches done for {self.table.name}") async def stop(self) -> None: if self._mutation_batcher_enable: diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 23f371ae2..4d809cfd5 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1,3 +1,4 @@ +from functools import wraps from unittest.mock import MagicMock, call, patch import pytest @@ -623,14 +624,15 @@ async def test_fill_caches(self, store, bt_imports): assert store._invalidation_timer is not None assert store._invalidation_timer.is_alive() - store._bigtable_iteritems.assert_called_with(partitions={2}) - assert store._bigtable_iteritems.call_count == 1 + store._bigtable_iteritems.assert_has_calls( + [call(partitions={2}), call(partitions={0})], any_order=True + ) store._set_cache.assert_has_calls( [ # Key 4 is ignored because it should already be loaded. # Because in our scenario the second key is never returned - # call(0, b"key4", b"value4"), - call(2, b"key3", b"value3"), + call(0, b"key3", b"value3"), + call(2, b"key4", b"value4"), ] ) assert store._invalidation_timer is not None @@ -722,21 +724,12 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): # Test with empty newly_assigned store._startup_cache_enable = True - store._startup_cache_partitions = {} store._startup_cache = {} store.assign_partitions.reset_mock() await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=2) store.assign_partitions.assert_not_called() store._fill_caches.assert_not_called() - store._startup_cache_enable = True - store._startup_cache = {} - newly_assigned = {TP("topic4", 3), TP("topic5", 4)} - await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=3) - store.assign_partitions.assert_called_with(store.table, newly_assigned, 3) - assert set(store._startup_cache.keys()) == set() - assert store._startup_cache_partitions == {3, 4} - def test_revoke_partitions(self, store): store._startup_cache = {b"key1": b"value1", b"key2": b"value2"} revoked = {TP("topic", 1), TP("topic", 2)} @@ -900,8 +893,9 @@ def test_get_after_delete(self, store, bt_imports): @pytest.mark.asyncio async def test_on_recovery_completed(self, store, bt_imports): - store._mutation_batcher_enable = True - store._mutation_batcher = MagicMock(flush=MagicMock()) + store._bigtable_iteritems = MagicMock( + side_effect=[[(b"key1", b"value1")], [(b"key2", b"value2")]] + ) store._startup_cache_enable = True store._invalidation_timer = None @@ -910,5 +904,7 @@ async def test_on_recovery_completed(self, store, bt_imports): active_tps = {TP("topic4", 3), TP("topic5", 4)} standby_tps = {} await store.on_recovery_completed(active_tps, standby_tps) - assert store._mutation_batcher.flush.call_count == 2 # once for every partition + store._bigtable_iteritems.assert_has_calls( + [call(partitions={3}), call(partitions={4})] + ) assert set(store._startup_cache.keys()) == {3, 4} From ecb1269f7b7ed6f3916de306c136259160d0e7c4 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 20 Dec 2023 13:18:29 +0100 Subject: [PATCH 611/616] create no existing partitions on assign in value cache --- faust/stores/bigtable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index ecddba188..2b0d52689 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -585,6 +585,10 @@ async def assign_partitions( # Fill cache with all keys for the partitions we are assigned partitions = self._get_active_changelogtopic_partitions(table, tps) self.log.info(f"Assigning partitions {partitions} for {table.name}") + if self._startup_cache_enable: + for p in partitions: + if p not in self._startup_cache: + self._startup_cache[p] = {} def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: partitions = set() From 711b615cb8032736b161e624b0b1220e2082d186 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 20 Dec 2023 13:21:25 +0100 Subject: [PATCH 612/616] also create empty cache if partition is not in startup cache already --- faust/stores/bigtable.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2b0d52689..2f963ca7b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -239,6 +239,12 @@ def _del_cache(self, key: bytes): def _set_cache(self, partition: int, key: bytes, value): if not self._startup_cache_enable: return + if partition not in self._startup_cache: + self.log.warning( + f"Had to manually create partition {partition} for " + f"_startup_cache in table {self.table.name}" + ) + self._startup_cache[partition] = {} self._startup_cache[partition][key] = value def _get_cache(self, partition: int, key: bytes): From cb095f58e54e7af6654e93c06b70f126079c9860 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 20 Dec 2023 14:06:34 +0100 Subject: [PATCH 613/616] fixed bug --- faust/stores/bigtable.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 2f963ca7b..fe878704b 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -240,11 +240,9 @@ def _set_cache(self, partition: int, key: bytes, value): if not self._startup_cache_enable: return if partition not in self._startup_cache: - self.log.warning( - f"Had to manually create partition {partition} for " - f"_startup_cache in table {self.table.name}" - ) - self._startup_cache[partition] = {} + # This means, that the startup cache is not activated + # for this partition + return self._startup_cache[partition][key] = value def _get_cache(self, partition: int, key: bytes): @@ -591,10 +589,6 @@ async def assign_partitions( # Fill cache with all keys for the partitions we are assigned partitions = self._get_active_changelogtopic_partitions(table, tps) self.log.info(f"Assigning partitions {partitions} for {table.name}") - if self._startup_cache_enable: - for p in partitions: - if p not in self._startup_cache: - self._startup_cache[p] = {} def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: partitions = set() From 2d338e9aed0a8710cb09e297c516e1f0f676325e Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Thu, 21 Dec 2023 10:41:58 +0100 Subject: [PATCH 614/616] removed log --- faust/stores/bigtable.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index fe878704b..868500980 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -373,10 +373,6 @@ def _bigtable_iteritems(self, partitions): if partitions is None: partitions = self._active_partitions() row_set = BT.RowSet() - self.log.info( - f"BigtableStore: Iterating over {len(partitions)} partitions " - f"for table {self.table_name}" - ) need_all_keys = self.table.is_global or self.table.use_partitioner if not need_all_keys: From 266127090b6e2a15ff0befe5863bc5f74b2ddb11 Mon Sep 17 00:00:00 2001 From: Johannes Pesenhofer Date: Wed, 17 Jan 2024 14:09:27 +0100 Subject: [PATCH 615/616] Revert "Merge pull request #14 from smaxtec/faster-bigtable-startup-revokes" This reverts commit 8092517e0a003ca9350eb72c6accb492be1b2bf4, reversing changes made to 5db83dc04a26a2467497c281bbc5ffb16009f53c. --- faust/stores/bigtable.py | 131 +++++++++++------------- tests/unit/stores/test_bigtable.py | 154 ++++++++++++----------------- 2 files changed, 119 insertions(+), 166 deletions(-) diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 868500980..6b88abbbc 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,5 +1,4 @@ """BigTable storage.""" -import asyncio import gc import logging import time @@ -83,8 +82,6 @@ def __init__( **kwargs: Any, ) -> None: self._set_options(options) - self.db_lock = asyncio.Lock() - self.rebalance_ack = False try: self._setup_bigtable(table, options) self._setup_caches(options) @@ -127,11 +124,13 @@ def _setup_caches( BigTableStore.BT_STARTUP_CACHE_ENABLE_KEY, False ) + self._startup_cache = None + self._startup_cache_partitions: Set[int] = set() self._startup_cache_ttl = options.get( BigTableStore.BT_STARTUP_CACHE_TTL_KEY, -1 ) if self._startup_cache_enable: - self._startup_cache: Dict[int, Dict[bytes, bytes]] = {} + self._startup_cache: Dict[bytes, bytes] = {} self._invalidation_timer: Optional[threading.Timer] = None def _set_options(self, options) -> None: @@ -230,36 +229,23 @@ def bigtable_extract_row_data(row_data): return list(row_data.to_dict().values())[0][0].value def _del_cache(self, key: bytes): - if not self._startup_cache_enable: - return - - for partition in self._startup_cache: - self._startup_cache[partition][key] = None - - def _set_cache(self, partition: int, key: bytes, value): - if not self._startup_cache_enable: - return - if partition not in self._startup_cache: - # This means, that the startup cache is not activated - # for this partition - return - self._startup_cache[partition][key] = value + if self._startup_cache is not None: + self._startup_cache[key] = None - def _get_cache(self, partition: int, key: bytes): - if not self._startup_cache_enable: - return None, False + def _set_cache(self, key: bytes, value): + if self._startup_cache is not None: + self._startup_cache[key] = value - if partition not in self._startup_cache: - return None, False - if key in self._startup_cache[partition]: - return self._startup_cache[partition][key], True + def _get_cache(self, key: bytes): + if self._startup_cache_enable and self._startup_cache is not None: + if key in self._startup_cache: + return self._startup_cache[key], True return None, False def _invalidate_startup_cache(self): if self._startup_cache is not None: - for partition in self._startup_cache: - self._startup_cache[partition].clear() self._startup_cache.clear() + self._startup_cache_partitions = set() gc.collect() self.log.info(f"Invalidated startup cache for table {self.table_name}") self._invalidation_timer.cancel() @@ -292,14 +278,14 @@ def _bigtable_get(self, keys: List[bytes]) -> Tuple[Optional[bytes], Optional[in def _get(self, key: bytes) -> Optional[bytes]: try: - partitions = set(self._get_partitions_for_key(key)) - for partition in partitions: - value, found = self._get_cache(partition, key) - if found: - # We only trust the value cache if we - # find something - return value + value, found = self._get_cache(key) + if found: + return value + partitions = set(self._get_partitions_for_key(key)) + # Remove partitions that we already have in cache + partitions.difference_update(self._startup_cache_partitions) + # Nothing todo if len(partitions) == 0: return None @@ -329,10 +315,11 @@ def _bigtable_set(self, key: bytes, value: bytes): def _set(self, key: bytes, value: bytes) -> None: try: + self._set_cache(key, value) + event = current_event() assert event is not None partition = event.message.partition - self._set_cache(partition, key, value) key = self._add_partition_prefix_to_key(key, partition) self._bigtable_set(key, value) @@ -370,9 +357,14 @@ def _del(self, key: bytes) -> None: def _bigtable_iteritems(self, partitions): try: + start = time.time() if partitions is None: partitions = self._active_partitions() row_set = BT.RowSet() + self.log.info( + f"BigtableStore: Iterating over {len(partitions)} partitions " + f"for table {self.table_name}" + ) need_all_keys = self.table.is_global or self.table.use_partitioner if not need_all_keys: @@ -394,6 +386,11 @@ def _bigtable_iteritems(self, partitions): value = self.bigtable_extract_row_data(row) key = self._remove_partition_prefix_from_bigtable_key(row.row_key) yield key, value + end = time.time() + self.log.info( + f"{self.table_name} _bigtable_iteritems took {end - start}s " + f"for partitions {partitions}" + ) except Exception as ex: # pragma: no cover self.log.error( f"FaustBigtableException Error " @@ -405,14 +402,13 @@ def _bigtable_iteritems(self, partitions): def _iteritems( self, partitions: Optional[List[int]] = None ) -> Iterator[Tuple[bytes, bytes]]: - if self._startup_cache_enable and partitions is None: - active_partitions = set(self._active_partitions()) - cache_partitions = active_partitions.intersection(self._startup_cache) - for p in cache_partitions: - for k, v in self._startup_cache[p].items(): + if self._startup_cache is not None: + if partitions is None: + partitions = set(self._active_partitions()) + for k, v in self._startup_cache.items(): if v is not None: yield k, v - partitions = list(active_partitions.difference(cache_partitions)) + partitions.difference_update(self._startup_cache_partitions) if partitions is None or len(partitions) > 0: yield from self._bigtable_iteritems(partitions) @@ -516,8 +512,10 @@ def apply_changelog_batch( bt_key = self._add_partition_prefix_to_key(msg.key, msg.partition) if msg.value is None: + self._del_cache(msg.key) self._bigtable_del(bt_key) else: + self._set_cache(msg.key, msg.value) self._bigtable_set(bt_key, msg.value) for tp, offset in tp_offsets.items(): @@ -547,12 +545,11 @@ def restore_backup( """ raise NotImplementedError("Not yet implemented for Bigtable.") - def _fill_caches(self, partition): - if partition not in self._startup_cache: - self._startup_cache[partition] = {} - for k, v in self._bigtable_iteritems(partitions={partition}): - self._set_cache(partition, k, v) + def _fill_caches(self, partitions): + for k, v in self._bigtable_iteritems(partitions=partitions): + self._set_cache(k, v) + self._startup_cache_partitions |= set(partitions) # Invalidate startup cache after self._startup_cache_ttl # or reset the timer if already running if self._invalidation_timer is not None: @@ -574,17 +571,22 @@ def _get_active_changelogtopic_partitions( standby_tps = self.app.assignor.assigned_standbys() my_topics = table.changelog_topic.topics for tp in tps: - if tp.topic in my_topics and tp not in standby_tps and self.rebalance_ack: + if tp.topic in my_topics and tp not in standby_tps: partitions.add(tp.partition) return partitions async def assign_partitions( self, table: CollectionT, tps: Set[TP], generation_id: int = 0 ) -> None: - self.rebalance_ack = True # Fill cache with all keys for the partitions we are assigned partitions = self._get_active_changelogtopic_partitions(table, tps) + if self._startup_cache_enable is False: + return + + if len(partitions) == 0: + return self.log.info(f"Assigning partitions {partitions} for {table.name}") + self._fill_caches(partitions) def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: partitions = set() @@ -592,14 +594,13 @@ def revoke_partitions(self, table: CollectionT, tps: Set[TP]) -> None: if tp.topic in table.changelog_topic.topics: partitions.add(tp.partition) - self.log.info(f"Revoking partitions {partitions} for {table.name}") - if len(partitions) == 0 or self._startup_cache_enable is False: + if len(partitions) == 0: return - for partition in partitions: - if partition in self._startup_cache: - del self._startup_cache[partition] - gc.collect() + self._startup_cache_partitions.difference_update(partitions) + # The memory of the startup cache will be freed after the ttl is over + # TODO: Free memory that is not needed instantly + self.log.info(f"Revoking partitions {partitions} for {table.name}") async def on_rebalance( self, @@ -617,26 +618,8 @@ async def on_rebalance( for which we were not assigned the last time. generation_id: the metadata generation identifier for the re-balance """ - self.rebalance_ack = False - async with self.db_lock: - if len(revoked) > 0: - self.revoke_partitions(self.table, revoked) - if len(newly_assigned) > 0: - await self.assign_partitions(self.table, newly_assigned, generation_id) - - async def on_recovery_completed( - self, active_tps: Set[TP], standby_tps: Set[TP] - ) -> None: - """Signal that table recovery completed.""" - partitions = {tp.partition for tp in active_tps} - if not self._startup_cache_enable: - return - self.log.info(f"Recovery: Filling caches with {partitions=}") - for p in partitions: - # This also flushes to bigtable - self._fill_caches(p) - await asyncio.sleep(0) - self.log.info(f"Recovery Completed. Filling caches done for {self.table.name}") + self.revoke_partitions(self.table, revoked) + await self.assign_partitions(self.table, newly_assigned, generation_id) async def stop(self) -> None: if self._mutation_batcher_enable: diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 4d809cfd5..97790e6d5 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -1,4 +1,3 @@ -from functools import wraps from unittest.mock import MagicMock, call, patch import pytest @@ -401,10 +400,11 @@ def test_get_with_known_partition(self, store): assert res is None # Scenario: Cache miss, but partition should be in startup cache + store._startup_cache_partitions = {19, 20} store._get_cache = MagicMock(return_value=(None, False)) store._bigtable_get = MagicMock(return_value=(None, None)) res = store._get(self.TEST_KEY2) - store._bigtable_get.assert_called_once() + store._bigtable_get.assert_not_called() assert res is None def test_set(self, store): @@ -427,7 +427,7 @@ def test_set(self, store): store._set(self.TEST_KEY1, b"a_value") key = store._add_partition_prefix_to_key(self.TEST_KEY1, 69) - store._set_cache.assert_called_with(69, self.TEST_KEY1, b"a_value") + store._set_cache.assert_called_with(self.TEST_KEY1, b"a_value") store._bigtable_set.assert_called_once_with(key, b"a_value") def test_del(self, store): @@ -478,14 +478,12 @@ def test_iteritems(self, store): def test_iteritems_with_startup_cache(self, store, bt_imports): store._active_partitions = MagicMock(return_value=[1, 3]) - store._startup_cache = {} - store._startup_cache_enable = True - store._startup_cache_partitions = {1} - store._startup_cache[1] = { + store._startup_cache = { self.TEST_KEY1: b"this is a value", self.TEST_KEY2: b"this is another value", - b"Dont return this, because this is a offset key": None, + b"Dont return this": None, } + store._startup_cache_partitions = [1] store._bigtable_iteritems = MagicMock(wraps=store._bigtable_iteritems) store.bt_table.read_rows = MagicMock( @@ -503,7 +501,7 @@ def test_iteritems_with_startup_cache(self, store, bt_imports): ] ) res = sorted(store._iteritems()) - store._bigtable_iteritems.assert_called_once_with([3]) + store._bigtable_iteritems.assert_called_once_with({3}) all_entries = { self.TEST_KEY1: b"this is a value", self.TEST_KEY2: b"this is another value", @@ -584,27 +582,25 @@ def __init__(self, message): @pytest.mark.asyncio async def test_fill_caches(self, store, bt_imports): store._bigtable_iteritems = MagicMock( - side_effect=[[(b"key1", b"value1")], [(b"key2", b"value2")]] + return_value=[(b"key1", b"value1"), (b"key2", b"value2")] ) store._set_cache = MagicMock() store._startup_cache_ttl = 1800 store._invalidation_timer = None + store._startup_cache_partitions = set() store._startup_cache = {} - partitions = {0, 1} - partitions2 = {0, 2} + partitions = {TP("topic", 0), TP("topic", 1)} + partitions2 = {TP("topic", 0), TP("topic", 2)} - for partition in partitions: - store._fill_caches(partition) - calls = [call(partitions={p}) for p in partitions] - store._bigtable_iteritems.assert_has_calls(calls) + store._fill_caches(partitions) - store._set_cache.assert_has_calls( - [ - call(0, b"key1", b"value1"), - call(1, b"key2", b"value2"), - ] - ) + assert store._bigtable_iteritems.call_args == call(partitions=partitions) + assert store._set_cache.call_args_list == [ + call(b"key1", b"value1"), + call(b"key2", b"value2"), + ] + assert store._startup_cache_partitions == partitions assert store._invalidation_timer is not None assert store._invalidation_timer.is_alive() @@ -613,28 +609,22 @@ async def test_fill_caches(self, store, bt_imports): old_invalid_timer = store._invalidation_timer.__hash__() store._bigtable_iteritems = MagicMock( - side_effect=[[(b"key3", b"value3")], [(b"key4", b"value4")]] + return_value=[(b"key3", b"value3"), (b"key4", b"value4")] ) store._set_cache = MagicMock() - for p in partitions2: - store._fill_caches(p) + store._fill_caches(partitions2) new_invalid_timer = store._invalidation_timer.__hash__() # Check if old invalidation timer is different from new one assert old_invalid_timer != new_invalid_timer assert store._invalidation_timer is not None assert store._invalidation_timer.is_alive() - store._bigtable_iteritems.assert_has_calls( - [call(partitions={2}), call(partitions={0})], any_order=True - ) - store._set_cache.assert_has_calls( - [ - # Key 4 is ignored because it should already be loaded. - # Because in our scenario the second key is never returned - call(0, b"key3", b"value3"), - call(2, b"key4", b"value4"), - ] - ) + assert store._bigtable_iteritems.call_args == call(partitions=partitions2) + assert store._set_cache.call_args_list == [ + call(b"key3", b"value3"), + call(b"key4", b"value4"), + ] + assert store._startup_cache_partitions == partitions | partitions2 assert store._invalidation_timer is not None assert store._invalidation_timer.is_alive() @@ -643,33 +633,31 @@ async def test_fill_caches(self, store, bt_imports): store._invalidate_startup_cache() assert store._startup_cache == {} + assert store._startup_cache_partitions == set() assert store._invalidation_timer is None @pytest.mark.asyncio async def test_fill_caches_no_ttl(self, store, bt_imports): store._bigtable_iteritems = MagicMock( - side_effect=[[(b"key1", b"value1")], [(b"key2", b"value2")]] + return_value=[(b"key1", b"value1"), (b"key2", b"value2")] ) store._set_cache = MagicMock() store._startup_cache_ttl = 0 store._invalidation_timer = None + store._startup_cache_partitions = set() store._startup_cache = {} - partitions = {0, 1} + partitions = {TP("topic", 0), TP("topic", 1)} + partitions2 = {TP("topic", 0), TP("topic", 2)} - for p in partitions: - store._fill_caches(p) + store._fill_caches(partitions) - store._set_cache.assert_has_calls( - [ - call(0, b"key1", b"value1"), - call(1, b"key2", b"value2"), - ] - ) + assert store._bigtable_iteritems.call_args == call(partitions=partitions) assert store._set_cache.call_args_list == [ - call(0, b"key1", b"value1"), - call(1, b"key2", b"value2"), + call(b"key1", b"value1"), + call(b"key2", b"value2"), ] + assert store._startup_cache_partitions == partitions assert store._invalidation_timer is None @pytest.mark.asyncio @@ -682,15 +670,6 @@ async def test__get_active_changelogtopic_partitions(self, store): store.table = MagicMock(changelog_topic=MagicMock(topics=tps_table)) tps = {TP("changelog_topic", 0), TP("other_topic", 1)} - # Scenario 1: With no rebalance_ack - store.rebalance_ack = False - active_partitions = store._get_active_changelogtopic_partitions( - store.table, tps - ) - assert active_partitions == set() - - # Scenario 2: With no rebalance_ack - store.rebalance_ack = True active_partitions = store._get_active_changelogtopic_partitions( store.table, tps ) @@ -715,7 +694,6 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): revoked = {TP("topic3", 2)} newly_assigned = {TP("topic4", 3), TP("topic5", 4)} store._startup_cache_enable = False - store._startup_cache = None await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=1) store.assign_partitions.assert_called_once_with(store.table, newly_assigned, 1) store.revoke_partitions.assert_called_once_with(store.table, revoked) @@ -724,17 +702,23 @@ async def test_bigtable_on_rebalance(self, store, bt_imports): # Test with empty newly_assigned store._startup_cache_enable = True - store._startup_cache = {} - store.assign_partitions.reset_mock() await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=2) - store.assign_partitions.assert_not_called() + store.assign_partitions.assert_called_with(store.table, newly_assigned, 2) store._fill_caches.assert_not_called() + store._startup_cache_enable = True + newly_assigned = {TP("topic4", 3), TP("topic5", 4)} + await store.on_rebalance(assigned, revoked, newly_assigned, generation_id=3) + store.assign_partitions.assert_called_with(store.table, newly_assigned, 3) + store._fill_caches.assert_called_once_with({3, 4}) + def test_revoke_partitions(self, store): + store._startup_cache_partitions = {1, 2, 3} store._startup_cache = {b"key1": b"value1", b"key2": b"value2"} revoked = {TP("topic", 1), TP("topic", 2)} store.table = MagicMock(changelog_topic=MagicMock(topics={"topic"})) store.revoke_partitions(store.table, revoked) + assert store._startup_cache_partitions == {3} def test_contains(self, store, bt_imports): store._get = MagicMock(return_value=b"test_value") @@ -760,6 +744,7 @@ def test_setup_caches_startup_cache_enable(self, store): assert store._startup_cache_enable is True assert store._startup_cache_ttl == 60 assert isinstance(store._startup_cache, dict) + assert isinstance(store._startup_cache_partitions, set) assert store._invalidation_timer is None def test_setup_caches_startup_cache_disable(self, store): @@ -769,40 +754,43 @@ def test_setup_caches_startup_cache_disable(self, store): store._setup_caches(options=options) assert store._startup_cache_enable is False assert store._startup_cache_ttl == -1 # Default value - assert hasattr(store, "_startup_cache") is False + assert store._startup_cache is None + assert store._startup_cache_partitions == set() assert store._startup_cache_enable is False def test_set_del_get_cache(self, store): store._startup_cache_enable = False store._startup_cache = None - partition = 1 + store._startup_cache_partitions = set() key = self.TEST_KEY1 - store._set_cache(partition, key, b"123") - res = store._get_cache(partition, key) + store._set_cache(key, b"123") + res = store._get_cache(key) assert store._startup_cache is None + assert store._startup_cache_partitions == set() assert res == (None, False) store._del_cache(key) - res = store._get_cache(partition, key) + res = store._get_cache(key) assert res == (None, False) assert store._startup_cache is None + assert store._startup_cache_partitions == set() # Now with enabled startup cache store._startup_cache_enable = True - store._startup_cache_partitions = {partition} store._startup_cache = {} - store._startup_cache[partition] = {} + store._startup_cache_partitions = {1, 2} - store._set_cache(partition, key, b"123") - res = store._get_cache(partition, key) - assert partition in store._startup_cache - assert store._startup_cache[partition] == {key: b"123"} + store._set_cache(key, b"123") + res = store._get_cache(key) + assert store._startup_cache == {key: b"123"} + assert store._startup_cache_partitions == {1, 2} assert res == (b"123", True) store._del_cache(key) - res = store._get_cache(partition, key) - assert store._startup_cache[partition] == {key: None} + res = store._get_cache(key) + assert store._startup_cache == {key: None} + assert store._startup_cache_partitions == {1, 2} assert res == (None, True) def test_persisted_offset(self, store): @@ -890,21 +878,3 @@ def test_get_after_delete(self, store, bt_imports): res = store._get(self.TEST_KEY1) assert res is not None - - @pytest.mark.asyncio - async def test_on_recovery_completed(self, store, bt_imports): - store._bigtable_iteritems = MagicMock( - side_effect=[[(b"key1", b"value1")], [(b"key2", b"value2")]] - ) - - store._startup_cache_enable = True - store._invalidation_timer = None - store._startup_cache = {} - - active_tps = {TP("topic4", 3), TP("topic5", 4)} - standby_tps = {} - await store.on_recovery_completed(active_tps, standby_tps) - store._bigtable_iteritems.assert_has_calls( - [call(partitions={3}), call(partitions={4})] - ) - assert set(store._startup_cache.keys()) == {3, 4} From e760baf95272f992a9c335a74afd0574192da6ee Mon Sep 17 00:00:00 2001 From: Marco Moser Date: Thu, 5 Mar 2026 12:17:07 +0100 Subject: [PATCH 616/616] Fix bigtable tests and aiokafka producer compatibility --- faust/__init__.py | 1 + faust/stores/aerospike.py | 10 ++--- faust/stores/bigtable.py | 7 +++- faust/transport/drivers/aiokafka.py | 13 +++++- tests/unit/stores/test_bigtable.py | 14 +++---- tests/unit/transport/drivers/test_aiokafka.py | 41 +++++++++++-------- 6 files changed, 52 insertions(+), 34 deletions(-) diff --git a/faust/__init__.py b/faust/__init__.py index c20b05903..f3ef14c75 100644 --- a/faust/__init__.py +++ b/faust/__init__.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """Python Stream processing.""" + # :copyright: (c) 2017-2020, Robinhood Markets, Inc. # All rights reserved. # :license: BSD (3 Clause), see LICENSE for more details. diff --git a/faust/stores/aerospike.py b/faust/stores/aerospike.py index 291ccbe9f..198f78dc7 100644 --- a/faust/stores/aerospike.py +++ b/faust/stores/aerospike.py @@ -98,7 +98,7 @@ def _get(self, key: bytes) -> Optional[bytes]: key = (self.namespace, self.table_name, key) fun = self.client.get try: - (key, meta, bins) = self.aerospike_fun_call_with_retry(fun=fun, key=key) + key, meta, bins = self.aerospike_fun_call_with_retry(fun=fun, key=key) if bins: return bins[self.BIN_KEY] return None @@ -173,7 +173,7 @@ def _itervalues(self) -> Iterator[bytes]: fun=fun, namespace=self.namespace, set=self.table_name ) for result in scan.results(): - (key, meta, bins) = result + key, meta, bins = result if bins: yield bins[self.BIN_KEY] else: @@ -193,8 +193,8 @@ def _iteritems(self) -> Iterator[Tuple[bytes, bytes]]: fun=fun, namespace=self.namespace, set=self.table_name ) for result in scan.results(): - (key_data, meta, bins) = result - (ns, set, policy, key) = key_data + key_data, meta, bins = result + ns, set, policy, key = key_data if bins: bins = bins[self.BIN_KEY] @@ -214,7 +214,7 @@ def _contains(self, key: bytes) -> bool: try: if self.app.conf.store_check_exists: key = (self.namespace, self.table_name, key) - (key, meta) = self.aerospike_fun_call_with_retry( + key, meta = self.aerospike_fun_call_with_retry( fun=self.client.exists, key=key ) if meta: diff --git a/faust/stores/bigtable.py b/faust/stores/bigtable.py index 6b88abbbc..ded7ed76d 100644 --- a/faust/stores/bigtable.py +++ b/faust/stores/bigtable.py @@ -1,8 +1,11 @@ """BigTable storage.""" + +from __future__ import annotations + import gc import logging -import time import threading +import time import traceback from typing import ( Any, @@ -16,6 +19,7 @@ Tuple, Union, ) + from mode.utils.collections import LRUCache try: # pragma: no cover @@ -50,7 +54,6 @@ class BT: from faust.streams import current_event from faust.types import TP, AppT, CollectionT, EventT - COLUMN_FAMILY_ID = "FaustColumnFamily" COLUMN_NAME = "DATA" diff --git a/faust/transport/drivers/aiokafka.py b/faust/transport/drivers/aiokafka.py index 8da59e5f6..4bc23c5a8 100644 --- a/faust/transport/drivers/aiokafka.py +++ b/faust/transport/drivers/aiokafka.py @@ -1,6 +1,7 @@ """Message transport using :pypi:`aiokafka`.""" import asyncio +import inspect import typing from asyncio import Lock, QueueEmpty from collections import deque @@ -1111,7 +1112,7 @@ def __post_init__(self) -> None: def _settings_default(self) -> Mapping[str, Any]: transport = cast(Transport, self.transport) - return { + settings = { "bootstrap_servers": server_list(transport.url, transport.default_port), "client_id": self.client_id, "acks": self.acks, @@ -1122,10 +1123,18 @@ def _settings_default(self) -> Mapping[str, Any]: "security_protocol": "SSL" if self.ssl_context else "PLAINTEXT", "partitioner": self.partitioner, "request_timeout_ms": int(self.request_timeout * 1000), - "api_version": self._api_version, "metadata_max_age_ms": self.app.conf.producer_metadata_max_age_ms, "connections_max_idle_ms": self.app.conf.producer_connections_max_idle_ms, } + if self._producer_accepts_api_version(): + settings["api_version"] = self._api_version + return settings + + def _producer_accepts_api_version(self) -> bool: + return ( + "api_version" + in inspect.signature(aiokafka.AIOKafkaProducer.__init__).parameters + ) def _settings_auth(self) -> Mapping[str, Any]: return credentials_to_aiokafka_auth(self.credentials, self.ssl_context) diff --git a/tests/unit/stores/test_bigtable.py b/tests/unit/stores/test_bigtable.py index 97790e6d5..23ee050a0 100644 --- a/tests/unit/stores/test_bigtable.py +++ b/tests/unit/stores/test_bigtable.py @@ -508,12 +508,12 @@ def test_iteritems_with_startup_cache(self, store, bt_imports): self.TEST_KEY3: b"1", self.TEST_KEY4: b"2", } - assert res == sorted(list(all_entries.items())) - keys = list(sorted(store._iterkeys())) - values = list(sorted(store._itervalues())) + assert res == sorted(all_entries.items()) + keys = sorted(store._iterkeys()) + values = sorted(store._itervalues()) - assert keys == sorted(list(all_entries.keys())) - assert values == sorted(list(all_entries.values())) + assert keys == sorted(all_entries.keys()) + assert values == sorted(all_entries.values()) def test_iterkeys(self, store): values = [("K1", "V1"), ("K2", "V2")] @@ -540,7 +540,7 @@ def test_set_persisted_offset(self, store): store._bigtable_set = MagicMock() store.set_persisted_offset(tp, 123) - store._bigtable_set.called_once_with(expected_offset_key, b"123") + store._bigtable_set.assert_called_once_with(expected_offset_key, b"123") def test_apply_changelog_batch(self, store): row_mock = MagicMock() @@ -592,7 +592,6 @@ async def test_fill_caches(self, store, bt_imports): partitions = {TP("topic", 0), TP("topic", 1)} partitions2 = {TP("topic", 0), TP("topic", 2)} - store._fill_caches(partitions) assert store._bigtable_iteritems.call_args == call(partitions=partitions) @@ -648,7 +647,6 @@ async def test_fill_caches_no_ttl(self, store, bt_imports): store._startup_cache = {} partitions = {TP("topic", 0), TP("topic", 1)} - partitions2 = {TP("topic", 0), TP("topic", 2)} store._fill_caches(partitions) diff --git a/tests/unit/transport/drivers/test_aiokafka.py b/tests/unit/transport/drivers/test_aiokafka.py index 35fe87ba4..aec4dd1ad 100644 --- a/tests/unit/transport/drivers/test_aiokafka.py +++ b/tests/unit/transport/drivers/test_aiokafka.py @@ -1,3 +1,4 @@ +import inspect import random import string from contextlib import contextmanager @@ -1379,26 +1380,32 @@ def assert_new_producer( security_protocol="PLAINTEXT", **kwargs, ): + expected_kwargs = dict( + bootstrap_servers=bootstrap_servers, + client_id=client_id, + acks=acks, + linger_ms=linger_ms, + max_batch_size=max_batch_size, + max_request_size=max_request_size, + compression_type=compression_type, + security_protocol=security_protocol, + partitioner=producer.partitioner, + transactional_id=None, + metadata_max_age_ms=metadata_max_age_ms, + connections_max_idle_ms=connections_max_idle_ms, + request_timeout_ms=request_timeout_ms, + **kwargs, + ) + if ( + "api_version" + in inspect.signature(aiokafka.AIOKafkaProducer.__init__).parameters + ): + expected_kwargs["api_version"] = api_version + with patch("aiokafka.AIOKafkaProducer") as AIOKafkaProducer: p = producer._new_producer() assert p is AIOKafkaProducer.return_value - AIOKafkaProducer.assert_called_once_with( - bootstrap_servers=bootstrap_servers, - client_id=client_id, - acks=acks, - linger_ms=linger_ms, - max_batch_size=max_batch_size, - max_request_size=max_request_size, - compression_type=compression_type, - security_protocol=security_protocol, - partitioner=producer.partitioner, - transactional_id=None, - api_version=api_version, - metadata_max_age_ms=metadata_max_age_ms, - connections_max_idle_ms=connections_max_idle_ms, - request_timeout_ms=request_timeout_ms, - **kwargs, - ) + AIOKafkaProducer.assert_called_once_with(**expected_kwargs) class TestProducer(ProducerBaseTest):