Package astrapy
Source code
# Copyright DataStax, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
import importlib.metadata
import os
import toml
def get_version() -> str:
    try:
        # Poetry will create a __version__ attribute in the package's __init__.py file
        return importlib.metadata.version(__package__)
    # If the package is not installed, we can still get the version from the pyproject.toml file
    except importlib.metadata.PackageNotFoundError:
        # Get the path to the pyproject.toml file
        dir_path = os.path.dirname(os.path.realpath(__file__))
        pyproject_path = os.path.join(dir_path, "..", "pyproject.toml")
        # Read the pyproject.toml file and get the version from the poetry section
        try:
            with open(pyproject_path, encoding="utf-8") as pyproject:
                # Load the pyproject.toml file as a dictionary
                file_contents = pyproject.read()
                pyproject_data = toml.loads(file_contents)
            # Return the version from the poetry section
            return str(pyproject_data["tool"]["poetry"]["version"])
        # If the pyproject.toml file does not exist or the version is not found, return unknown
        except (FileNotFoundError, KeyError):
            return "unknown"
__version__: str = get_version()
from astrapy import api_options # noqa: E402, F401
from astrapy.admin import ( # noqa: E402
AstraDBAdmin,
AstraDBDatabaseAdmin,
DataAPIDatabaseAdmin,
)
from astrapy.client import DataAPIClient # noqa: E402
from astrapy.collection import AsyncCollection, Collection # noqa: E402
# A circular-import issue requires this to happen at the end of this module:
from astrapy.database import AsyncDatabase, Database # noqa: E402
from astrapy.table import AsyncTable, Table # noqa: E402
__all__ = [
    "AstraDBAdmin",
    "AstraDBDatabaseAdmin",
    "AsyncCollection",
    "AsyncDatabase",
    "AsyncTable",
    "Collection",
    "Database",
    "DataAPIClient",
    "DataAPIDatabaseAdmin",
    "Table",
    "__version__",
]
__pdoc__ = {
    "ids": False,
    "settings": False,
}
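As a quick sanity check of the version logic above (a minimal sketch; the exact string depends on how the package was installed):
>>> import astrapy
>>> isinstance(astrapy.__version__, str)  # "unknown" only in a bare, non-installed checkout
True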
Sub-modules
astrapy.admin
astrapy.api_options
astrapy.authentication
astrapy.client
astrapy.collection
astrapy.constants
astrapy.cursors
astrapy.data
astrapy.data_types
astrapy.database
astrapy.exceptions
astrapy.info
astrapy.results
astrapy.table
astrapy.utils
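The names listed in __all__ above are re-exported from these sub-modules, so everyday imports can stay at the package top level (a minimal sketch):
>>> from astrapy import DataAPIClient, Database, AsyncTable  # re-exports, per __all__
>>> from astrapy import exceptions  # sub-modules can also be imported directly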
Classes
class AstraDBAdmin (*, api_options: FullAPIOptions)
-
An "admin" object, able to perform administrative tasks at the databases level, such as creating, listing or dropping databases.
This class is not meant for direct instantiation by the user; rather, it is obtained by invoking methods such as get_admin of DataAPIClient.
Args
api_options
- a complete specification of the API Options for this instance.
Example
>>> from astrapy import DataAPIClient
>>> my_client = DataAPIClient("AstraCS:...")
>>> my_astra_db_admin = my_client.get_admin()
>>> database_list = my_astra_db_admin.list_databases()
>>> len(database_list)
3
>>> database_list[2].id
'01234567-...'
>>> my_db_admin = my_astra_db_admin.get_database_admin("01234567-...")
>>> my_db_admin.list_keyspaces()
['default_keyspace', 'staging_keyspace']
Note
a more powerful token may be required than the one sufficient for working with the Database, Collection and Table classes. Check the provided token if "Unauthorized" errors are encountered.
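For instance, one might pass a separate, more privileged token when obtaining the admin object (a hedged sketch: the token override on get_admin mirrors the token parameters documented below, and the token values are illustrative):
>>> from astrapy import DataAPIClient
>>> my_client = DataAPIClient("AstraCS:limited...")
>>> my_astra_db_admin = my_client.get_admin(token="AstraCS:stronger-admin-token...")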
Source code
class AstraDBAdmin:
    """
    An "admin" object, able to perform administrative tasks at the databases
    level, such as creating, listing or dropping databases.

    This class is not meant for direct instantiation by the user; rather,
    it is obtained by invoking methods such as `get_admin` of DataAPIClient.

    Args:
        api_options: a complete specification of the API Options for this instance.

    Example:
        >>> from astrapy import DataAPIClient
        >>> my_client = DataAPIClient("AstraCS:...")
        >>> my_astra_db_admin = my_client.get_admin()
        >>> database_list = my_astra_db_admin.list_databases()
        >>> len(database_list)
        3
        >>> database_list[2].id
        '01234567-...'
        >>> my_db_admin = my_astra_db_admin.get_database_admin("01234567-...")
        >>> my_db_admin.list_keyspaces()
        ['default_keyspace', 'staging_keyspace']

    Note:
        a more powerful token may be required than the one sufficient for working
        with the Database, Collection and Table classes. Check the provided token
        if "Unauthorized" errors are encountered.
    """

    def __init__(
        self,
        *,
        api_options: FullAPIOptions,
    ) -> None:
        if api_options.environment not in Environment.astra_db_values:
            raise InvalidEnvironmentException(
                "Environments outside of Astra DB are not supported."
            )
        self.api_options = api_options
        self._dev_ops_commander_headers: dict[str, str | None]
        # the DevOps API auth header is set only if a token is configured:
        if self.api_options.token:
            _token_str = self.api_options.token.get_token()
            self._dev_ops_commander_headers = {
                DEFAULT_DEV_OPS_AUTH_HEADER: f"{DEFAULT_DEV_OPS_AUTH_PREFIX}{_token_str}",
                **self.api_options.admin_additional_headers,
            }
        else:
            self._dev_ops_commander_headers = {
                **self.api_options.admin_additional_headers,
            }
        self._dev_ops_api_commander = self._get_dev_ops_api_commander()

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.api_options})"

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, AstraDBAdmin):
            return all([self.api_options == other.api_options])
        else:
            return False

    def _get_dev_ops_api_commander(self) -> APICommander:
        """Instantiate a new APICommander based on the properties of this class."""
        base_path_components = [
            comp
            for comp in (
                ncomp.strip("/")
                for ncomp in (
                    self.api_options.dev_ops_api_url_options.dev_ops_api_version,
                    "databases",
                )
                if ncomp is not None
            )
            if comp != ""
        ]
        dev_ops_base_path = "/".join(base_path_components)
        dev_ops_commander = APICommander(
            api_endpoint=self.api_options.dev_ops_api_url_options.dev_ops_url,
            path=dev_ops_base_path,
            headers=self._dev_ops_commander_headers,
            callers=self.api_options.callers,
            dev_ops_api=True,
            redacted_header_names=self.api_options.redacted_header_names,
        )
        return dev_ops_commander

    def _copy(
        self,
        *,
        token: str | TokenProvider | UnsetType = _UNSET,
        api_options: APIOptions | UnsetType = _UNSET,
    ) -> AstraDBAdmin:
        arg_api_options = APIOptions(
            token=token,
        )
        final_api_options = self.api_options.with_override(api_options).with_override(
            arg_api_options
        )
        return AstraDBAdmin(api_options=final_api_options)

    def with_options(
        self,
        *,
        token: str | TokenProvider | UnsetType = _UNSET,
        api_options: APIOptions | UnsetType = _UNSET,
    ) -> AstraDBAdmin:
        """
        Create a clone of this AstraDBAdmin with some changed attributes.

        Args:
            token: an Access Token to the database. Example: `"AstraCS:xyz..."`.
                This can be either a literal token string or a subclass of
                `astrapy.authentication.TokenProvider`.
            api_options: any additional options to set for the clone, in the form of
                an APIOptions instance (where one can set just the needed attributes).
                In case the same setting is also provided as named parameter,
                the latter takes precedence.

        Returns:
            a new AstraDBAdmin instance.

        Example:
            >>> different_auth_astra_db_admin = my_astra_db_admin.with_options(
            ...     token="AstraCS:xyz...",
            ... )
        """

        return self._copy(
            token=token,
            api_options=api_options,
        )

    def list_databases(
        self,
        *,
        include: str | None = None,
        provider: str | None = None,
        page_size: int | None = None,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
    ) -> list[AstraDBAdminDatabaseInfo]:
        """
        Get the list of databases, as obtained with a request to the DevOps API.

        Args:
            include: a filter on what databases are to be returned. As per
                DevOps API, defaults to "nonterminated". Pass "all" to include
                the already terminated databases.
            provider: a filter on the cloud provider for the databases.
                As per DevOps API, defaults to "ALL". Pass e.g. "AWS" to
                restrict the results.
            page_size: number of results per page from the DevOps API.
            database_admin_timeout_ms: a timeout, in milliseconds, to impose on the
                underlying API request. If not provided, this object's defaults apply.
                (While in the case of very many databases this method may entail
                multiple DevOps API requests, it is assumed here that this method
                amounts almost always to one single request: the only timeout
                imposed on this method execution is one acting on each individual
                request, with no checks on its overall completion time.)
            request_timeout_ms: an alias for `database_admin_timeout_ms`.
            timeout_ms: an alias for `database_admin_timeout_ms`.

        Returns:
            A list of AstraDBAdminDatabaseInfo objects.

        Example:
            >>> database_list = my_astra_db_admin.list_databases()
            >>> len(database_list)
            3
            >>> database_list[2].id
            '01234567-...'
            >>> database_list[2].status
            'ACTIVE'
            >>> database_list[2].info.region
            'eu-west-1'
        """

        _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da(
            timeout_options=self.api_options.timeout_options,
            database_admin_timeout_ms=database_admin_timeout_ms,
            request_timeout_ms=request_timeout_ms,
            timeout_ms=timeout_ms,
        )
        return self._list_databases_ctx(
            include=include,
            provider=provider,
            page_size=page_size,
            timeout_context=_TimeoutContext(
                request_ms=_database_admin_timeout_ms, label=_da_label
            ),
        )

    def _list_databases_ctx(
        self,
        *,
        include: str | None,
        provider: str | None,
        page_size: int | None,
        timeout_context: _TimeoutContext,
    ) -> list[AstraDBAdminDatabaseInfo]:
        # version of the method, but with timeouts made into a _TimeoutContext
        logger.info("getting databases (DevOps API)")
        request_params_0 = {
            k: v
            for k, v in {
                "include": include,
                "provider": provider,
                "limit": page_size or DEV_OPS_DEFAULT_DATABASES_PAGE_SIZE,
            }.items()
            if v is not None
        }
        responses: list[list[dict[str, Any]]] = []
        logger.info("request 0, getting databases (DevOps API)")
        response_0 = self._dev_ops_api_commander.request(
            http_method=HttpMethod.GET,
            request_params=request_params_0,
            timeout_context=timeout_context,
        )
        if not isinstance(response_0, list):
            raise DevOpsAPIException(
                "Faulty response from get-databases DevOps API command.",
            )
        logger.info("finished request 0, getting databases (DevOps API)")
        responses += [response_0]
        # paginate: a full page means there may be more databases to fetch
        while len(responses[-1]) >= request_params_0["limit"]:
            if "id" not in responses[-1][-1]:
                raise DevOpsAPIException(
                    "Faulty response from get-databases DevOps API command.",
                )
            last_received_db_id = responses[-1][-1]["id"]
            request_params_n = {
                **request_params_0,
                **{"starting_after": last_received_db_id},
            }
            logger.info(
                "request %s, getting databases (DevOps API)",
                len(responses),
            )
            response_n = self._dev_ops_api_commander.request(
                http_method=HttpMethod.GET,
                request_params=request_params_n,
                timeout_context=timeout_context,
            )
            logger.info(
                "finished request %s, getting databases (DevOps API)",
                len(responses),
            )
            if not isinstance(response_n, list):
                raise DevOpsAPIException(
                    "Faulty response from get-databases DevOps API command.",
                )
            responses += [response_n]

        logger.info("finished getting databases (DevOps API)")
        return [
            _recast_as_admin_database_info(
                db_dict,
                environment=self.api_options.environment,
            )
            for response in responses
            for db_dict in response
        ]

    async def async_list_databases(
        self,
        *,
        include: str | None = None,
        provider: str | None = None,
        page_size: int | None = None,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
    ) -> list[AstraDBAdminDatabaseInfo]:
        """
        Get the list of databases, as obtained with a request to the DevOps API.
        Async version of the method, for use in an asyncio context.

        Args:
            include: a filter on what databases are to be returned. As per
                DevOps API, defaults to "nonterminated". Pass "all" to include
                the already terminated databases.
            provider: a filter on the cloud provider for the databases.
                As per DevOps API, defaults to "ALL". Pass e.g. "AWS" to
                restrict the results.
            page_size: number of results per page from the DevOps API.
            database_admin_timeout_ms: a timeout, in milliseconds, to impose on the
                underlying API request. If not provided, this object's defaults apply.
                (While in the case of very many databases this method may entail
                multiple DevOps API requests, it is assumed here that this method
                amounts almost always to one single request: the only timeout
                imposed on this method execution is one acting on each individual
                request, with no checks on its overall completion time.)
            request_timeout_ms: an alias for `database_admin_timeout_ms`.
            timeout_ms: an alias for `database_admin_timeout_ms`.

        Returns:
            A list of AstraDBAdminDatabaseInfo objects.

        Example:
            >>> async def check_if_db_exists(db_id: str) -> bool:
            ...     db_list = await my_astra_db_admin.async_list_databases()
            ...     return db_id in db_list
            ...
            >>> asyncio.run(check_if_db_exists("xyz"))
            True
            >>> asyncio.run(check_if_db_exists("01234567-..."))
            False
        """

        _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da(
            timeout_options=self.api_options.timeout_options,
            database_admin_timeout_ms=database_admin_timeout_ms,
            request_timeout_ms=request_timeout_ms,
            timeout_ms=timeout_ms,
        )
        return await self._async_list_databases_ctx(
            include=include,
            provider=provider,
            page_size=page_size,
            timeout_context=_TimeoutContext(
                request_ms=_database_admin_timeout_ms, label=_da_label
            ),
        )

    async def _async_list_databases_ctx(
        self,
        *,
        include: str | None,
        provider: str | None,
        page_size: int | None,
        timeout_context: _TimeoutContext,
    ) -> list[AstraDBAdminDatabaseInfo]:
        # version of the method, but with timeouts made into a _TimeoutContext
        logger.info("getting databases (DevOps API), async")
        request_params_0 = {
            k: v
            for k, v in {
                "include": include,
                "provider": provider,
                "limit": page_size or DEV_OPS_DEFAULT_DATABASES_PAGE_SIZE,
            }.items()
            if v is not None
        }
        responses: list[list[dict[str, Any]]] = []
        logger.info("request 0, getting databases (DevOps API), async")
        response_0 = await self._dev_ops_api_commander.async_request(
            http_method=HttpMethod.GET,
            request_params=request_params_0,
            timeout_context=timeout_context,
        )
        if not isinstance(response_0, list):
            raise DevOpsAPIException(
                "Faulty response from get-databases DevOps API command.",
            )
        logger.info("finished request 0, getting databases (DevOps API), async")
        responses += [response_0]
        # paginate: a full page means there may be more databases to fetch
        while len(responses[-1]) >= request_params_0["limit"]:
            if "id" not in responses[-1][-1]:
                raise DevOpsAPIException(
                    "Faulty response from get-databases DevOps API command.",
                )
            last_received_db_id = responses[-1][-1]["id"]
            request_params_n = {
                **request_params_0,
                **{"starting_after": last_received_db_id},
            }
            logger.info(
                "request %s, getting databases (DevOps API)",
                len(responses),
            )
            response_n = await self._dev_ops_api_commander.async_request(
                http_method=HttpMethod.GET,
                request_params=request_params_n,
                timeout_context=timeout_context,
            )
            logger.info(
                "finished request %s, getting databases (DevOps API), async",
                len(responses),
            )
            if not isinstance(response_n, list):
                raise DevOpsAPIException(
                    "Faulty response from get-databases DevOps API command.",
                )
            responses += [response_n]

        logger.info("finished getting databases (DevOps API), async")
        return [
            _recast_as_admin_database_info(
                db_dict,
                environment=self.api_options.environment,
            )
            for response in responses
            for db_dict in response
        ]

    def database_info(
        self,
        id: str,
        *,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
    ) -> AstraDBAdminDatabaseInfo:
        """
        Get the full information on a given database, through a request to the DevOps API.

        Args:
            id: the ID of the target database, e.g.
                "01234567-89ab-cdef-0123-456789abcdef".
            database_admin_timeout_ms: a timeout, in milliseconds, to impose on the
                underlying DevOps API request. If not provided, this object's
                defaults apply. (This method issues a single API request,
                hence all timeout parameters are treated the same.)
            request_timeout_ms: an alias for `database_admin_timeout_ms`.
            timeout_ms: an alias for `database_admin_timeout_ms`.

        Returns:
            An AstraDBAdminDatabaseInfo object.

        Example:
            >>> details_of_my_db = my_astra_db_admin.database_info("01234567-...")
            >>> details_of_my_db.id
            '01234567-...'
            >>> details_of_my_db.status
            'ACTIVE'
            >>> details_of_my_db.info.region
            'eu-west-1'
        """

        _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da(
            timeout_options=self.api_options.timeout_options,
            database_admin_timeout_ms=database_admin_timeout_ms,
            request_timeout_ms=request_timeout_ms,
            timeout_ms=timeout_ms,
        )
        return self._database_info_ctx(
            id=id,
            timeout_context=_TimeoutContext(
                request_ms=_database_admin_timeout_ms, label=_da_label
            ),
        )

    def _database_info_ctx(
        self,
        id: str,
        *,
        timeout_context: _TimeoutContext,
    ) -> AstraDBAdminDatabaseInfo:
        # version of the method, but with timeouts made into a _TimeoutContext
        logger.info(f"getting database info for '{id}' (DevOps API)")
        gd_response = self._dev_ops_api_commander.request(
            http_method=HttpMethod.GET,
            additional_path=id,
            timeout_context=timeout_context,
        )
        logger.info(f"finished getting database info for '{id}' (DevOps API)")
        return _recast_as_admin_database_info(
            gd_response,
            environment=self.api_options.environment,
        )

    async def async_database_info(
        self,
        id: str,
        *,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
    ) -> AstraDBAdminDatabaseInfo:
        """
        Get the full information on a given database, through a request to the DevOps API.
        This is an awaitable method suitable for use within an asyncio event loop.

        Args:
            id: the ID of the target database, e.g.
                "01234567-89ab-cdef-0123-456789abcdef".
            database_admin_timeout_ms: a timeout, in milliseconds, to impose on the
                underlying DevOps API request. If not provided, this object's
                defaults apply. (This method issues a single API request,
                hence all timeout parameters are treated the same.)
            request_timeout_ms: an alias for `database_admin_timeout_ms`.
            timeout_ms: an alias for `database_admin_timeout_ms`.

        Returns:
            An AstraDBAdminDatabaseInfo object.

        Example:
            >>> async def check_if_db_active(db_id: str) -> bool:
            ...     db_info = await my_astra_db_admin.async_database_info(db_id)
            ...     return db_info.status == "ACTIVE"
            ...
            >>> asyncio.run(check_if_db_active("01234567-..."))
            True
        """

        _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da(
            timeout_options=self.api_options.timeout_options,
            database_admin_timeout_ms=database_admin_timeout_ms,
            request_timeout_ms=request_timeout_ms,
            timeout_ms=timeout_ms,
        )
        return await self._async_database_info_ctx(
            id=id,
            timeout_context=_TimeoutContext(
                request_ms=_database_admin_timeout_ms, label=_da_label
            ),
        )

    async def _async_database_info_ctx(
        self,
        id: str,
        *,
        timeout_context: _TimeoutContext,
    ) -> AstraDBAdminDatabaseInfo:
        # version of the method, but with timeouts made into a _TimeoutContext
        logger.info(f"getting database info for '{id}' (DevOps API), async")
        gd_response = await self._dev_ops_api_commander.async_request(
            http_method=HttpMethod.GET,
            additional_path=id,
            timeout_context=timeout_context,
        )
        logger.info(f"finished getting database info for '{id}' (DevOps API), async")
        return _recast_as_admin_database_info(
            gd_response,
            environment=self.api_options.environment,
        )

    def create_database(
        self,
        name: str,
        *,
        cloud_provider: str,
        region: str,
        keyspace: str | None = None,
        wait_until_active: bool = True,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
        token: str | TokenProvider | UnsetType = _UNSET,
        spawn_api_options: APIOptions | UnsetType = _UNSET,
    ) -> AstraDBDatabaseAdmin:
        """
        Create a database as requested, optionally waiting for it to be ready.

        Args:
            name: the desired name for the database.
            cloud_provider: one of 'aws', 'gcp' or 'azure'.
            region: any of the available cloud regions.
            keyspace: name for the one keyspace the database starts with.
                If omitted, DevOps API will use its default.
            wait_until_active: if True (default), the method returns only after
                the newly-created database is in ACTIVE state (a few minutes,
                usually). If False, it will return right after issuing the
                creation request to the DevOps API, and it will be the
                responsibility of the caller to check the database status
                before working with it.
            database_admin_timeout_ms: a timeout, in milliseconds, for the whole
                requested operation to complete. This is used only if
                `wait_until_active` is true, i.e. if the method call must
                wait and keep querying the DevOps API for the status of the
                newly-created database.
            request_timeout_ms: a timeout, in milliseconds, for each underlying
                DevOps API HTTP request.
            timeout_ms: an alias for *both* the `request_timeout_ms` and
                `database_admin_timeout_ms` timeout parameters. In practice,
                regardless of `wait_until_active`, this parameter dictates an
                overall timeout on this method call.
            token: if supplied, is passed to the returned Database instead of
                the one set for this object.
                This can be either a literal token string or a subclass of
                `astrapy.authentication.TokenProvider`.
            spawn_api_options: a specification - complete or partial - of the
                API Options to override the defaults inherited from the
                AstraDBAdmin. This allows for a deeper configuration of the
                database admin, e.g. concerning timeouts; if this is passed
                together with the named timeout parameters, the latter will
                take precedence in their respective settings.

        Returns:
            An AstraDBDatabaseAdmin instance.

        Note:
            a timeout event is no guarantee at all that the creation request
            has not reached the API server and is not going to be, in fact,
            honored.

        Example:
            >>> my_new_db_admin = my_astra_db_admin.create_database(
            ...     "new_database",
            ...     cloud_provider="aws",
            ...     region="ap-south-1",
            ... )
            >>> my_new_db = my_new_db_admin.get_database()
            >>> my_coll = my_new_db.create_collection(
            ...     "movies",
            ...     definition=(
            ...         CollectionDefinition.builder()
            ...         .set_vector_dimension(2)
            ...         .build()
            ...     )
            ... )
            >>> my_coll.insert_one({"title": "The Title", "$vector": [0.1, 0.2]})
        """

        _database_admin_timeout_ms, _da_label = _first_valid_timeout(
            (database_admin_timeout_ms, "database_admin_timeout_ms"),
            (timeout_ms, "timeout_ms"),
            (
                self.api_options.timeout_options.database_admin_timeout_ms,
                "database_admin_timeout_ms",
            ),
        )
        _request_timeout_ms, _rt_label = _first_valid_timeout(
            (request_timeout_ms, "request_timeout_ms"),
            (timeout_ms, "timeout_ms"),
            (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
        )
        cd_payload = {
            k: v
            for k, v in {
                "name": name,
                "tier": "serverless",
                "cloudProvider": cloud_provider,
                "region": region,
                "capacityUnits": 1,
                "dbType": "vector",
                "keyspace": keyspace,
            }.items()
            if v is not None
        }
        timeout_manager = MultiCallTimeoutManager(
            overall_timeout_ms=_database_admin_timeout_ms,
            dev_ops_api=True,
            timeout_label=_da_label,
        )
        logger.info(
            f"creating database {name}/({cloud_provider}, {region}) (DevOps API)"
        )
        cd_raw_response = self._dev_ops_api_commander.raw_request(
            http_method=HttpMethod.POST,
            payload=cd_payload,
            timeout_context=timeout_manager.remaining_timeout(
                cap_time_ms=_request_timeout_ms,
                cap_timeout_label=_rt_label,
            ),
        )
        if cd_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_CREATED:
            raise DevOpsAPIException(
                f"DB creation ('{name}') failed: API returned HTTP "
                f"{cd_raw_response.status_code} instead of "
                f"{DEV_OPS_RESPONSE_HTTP_CREATED} - Created."
            )
        new_database_id = cd_raw_response.headers["Location"]
        logger.info(
            "DevOps API returned from creating database "
            f"{name}/({cloud_provider}, {region})"
        )
        if wait_until_active:
            last_status_seen = DEV_OPS_DATABASE_STATUS_PENDING
            while last_status_seen in {
                DEV_OPS_DATABASE_STATUS_PENDING,
                DEV_OPS_DATABASE_STATUS_INITIALIZING,
            }:
                logger.info(f"sleeping to poll for status of '{new_database_id}'")
                time.sleep(DEV_OPS_DATABASE_POLL_INTERVAL_S)
                last_db_info = self._database_info_ctx(
                    id=new_database_id,
                    timeout_context=timeout_manager.remaining_timeout(
                        cap_time_ms=_request_timeout_ms,
                        cap_timeout_label=_rt_label,
                    ),
                )
                last_status_seen = last_db_info.status
            if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE:
                raise DevOpsAPIException(
                    f"Database {name} entered unexpected status "
                    f"{last_status_seen} after PENDING"
                )
        # return the database admin instance
        logger.info(
            f"finished creating database '{new_database_id}' = "
            f"{name}/({cloud_provider}, {region}) (DevOps API)"
        )
        _final_api_options = self.api_options.with_override(
            spawn_api_options
        ).with_override(APIOptions(token=token))
        return AstraDBDatabaseAdmin.from_astra_db_admin(
            api_endpoint=build_api_endpoint(
                environment=self.api_options.environment,
                database_id=new_database_id,
                region=region,
            ),
            astra_db_admin=self,
            spawn_api_options=_final_api_options,
        )

    async def async_create_database(
        self,
        name: str,
        *,
        cloud_provider: str,
        region: str,
        keyspace: str | None = None,
        wait_until_active: bool = True,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
        token: str | TokenProvider | UnsetType = _UNSET,
        spawn_api_options: APIOptions | UnsetType = _UNSET,
    ) -> AstraDBDatabaseAdmin:
        """
        Create a database as requested, optionally waiting for it to be ready.
        This is an awaitable method suitable for use within an asyncio event loop.

        Args:
            name: the desired name for the database.
            cloud_provider: one of 'aws', 'gcp' or 'azure'.
            region: any of the available cloud regions.
            keyspace: name for the one keyspace the database starts with.
                If omitted, DevOps API will use its default.
            wait_until_active: if True (default), the method returns only after
                the newly-created database is in ACTIVE state (a few minutes,
                usually). If False, it will return right after issuing the
                creation request to the DevOps API, and it will be the
                responsibility of the caller to check the database status
                before working with it.
            database_admin_timeout_ms: a timeout, in milliseconds, for the whole
                requested operation to complete. This is used only if
                `wait_until_active` is true, i.e. if the method call must
                wait and keep querying the DevOps API for the status of the
                newly-created database.
            request_timeout_ms: a timeout, in milliseconds, for each underlying
                DevOps API HTTP request.
            timeout_ms: an alias for *both* the `request_timeout_ms` and
                `database_admin_timeout_ms` timeout parameters. In practice,
                regardless of `wait_until_active`, this parameter dictates an
                overall timeout on this method call.
            token: if supplied, is passed to the returned Database instead of
                the one set for this object.
                This can be either a literal token string or a subclass of
                `astrapy.authentication.TokenProvider`.
            spawn_api_options: a specification - complete or partial - of the
                API Options to override the defaults inherited from the
                AstraDBAdmin. This allows for a deeper configuration of the
                database admin, e.g. concerning timeouts; if this is passed
                together with the named timeout parameters, the latter will
                take precedence in their respective settings.

        Returns:
            An AstraDBDatabaseAdmin instance.

        Note:
            a timeout event is no guarantee at all that the creation request
            has not reached the API server and is not going to be, in fact,
            honored.

        Example:
            >>> asyncio.run(
            ...     my_astra_db_admin.async_create_database(
            ...         "new_database",
            ...         cloud_provider="aws",
            ...         region="ap-south-1",
            ...     )
            ... )
            AstraDBDatabaseAdmin(id=...)
        """

        _database_admin_timeout_ms, _da_label = _first_valid_timeout(
            (database_admin_timeout_ms, "database_admin_timeout_ms"),
            (timeout_ms, "timeout_ms"),
            (
                self.api_options.timeout_options.database_admin_timeout_ms,
                "database_admin_timeout_ms",
            ),
        )
        _request_timeout_ms, _rt_label = _first_valid_timeout(
            (request_timeout_ms, "request_timeout_ms"),
            (timeout_ms, "timeout_ms"),
            (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
        )
        cd_payload = {
            k: v
            for k, v in {
                "name": name,
                "tier": "serverless",
                "cloudProvider": cloud_provider,
                "region": region,
                "capacityUnits": 1,
                "dbType": "vector",
                "keyspace": keyspace,
            }.items()
            if v is not None
        }
        timeout_manager = MultiCallTimeoutManager(
            overall_timeout_ms=_database_admin_timeout_ms,
            dev_ops_api=True,
            timeout_label=_da_label,
        )
        logger.info(
            f"creating database {name}/({cloud_provider}, {region}) "
            "(DevOps API), async"
        )
        cd_raw_response = await self._dev_ops_api_commander.async_raw_request(
            http_method=HttpMethod.POST,
            payload=cd_payload,
            timeout_context=timeout_manager.remaining_timeout(
                cap_time_ms=_request_timeout_ms,
                cap_timeout_label=_rt_label,
            ),
        )
        if cd_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_CREATED:
            raise DevOpsAPIException(
                f"DB creation ('{name}') failed: API returned HTTP "
                f"{cd_raw_response.status_code} instead of "
                f"{DEV_OPS_RESPONSE_HTTP_CREATED} - Created."
            )
        new_database_id = cd_raw_response.headers["Location"]
        logger.info(
            "DevOps API returned from creating database "
            f"{name}/({cloud_provider}, {region}), async"
        )
        if wait_until_active:
            last_status_seen = DEV_OPS_DATABASE_STATUS_PENDING
            while last_status_seen in {
                DEV_OPS_DATABASE_STATUS_PENDING,
                DEV_OPS_DATABASE_STATUS_INITIALIZING,
            }:
                logger.info(
                    f"sleeping to poll for status of '{new_database_id}', async"
                )
                await asyncio.sleep(DEV_OPS_DATABASE_POLL_INTERVAL_S)
                last_db_info = await self._async_database_info_ctx(
                    id=new_database_id,
                    timeout_context=timeout_manager.remaining_timeout(
                        cap_time_ms=_request_timeout_ms,
                        cap_timeout_label=_rt_label,
                    ),
                )
                last_status_seen = last_db_info.status
            if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE:
                raise DevOpsAPIException(
                    f"Database {name} entered unexpected status "
                    f"{last_status_seen} after PENDING"
                )
        # return the database admin instance
        logger.info(
            f"finished creating database '{new_database_id}' = "
            f"{name}/({cloud_provider}, {region}) (DevOps API), async"
        )
        _final_api_options = self.api_options.with_override(
            spawn_api_options
        ).with_override(APIOptions(token=token))
        return AstraDBDatabaseAdmin.from_astra_db_admin(
            api_endpoint=build_api_endpoint(
                environment=self.api_options.environment,
                database_id=new_database_id,
                region=region,
            ),
            astra_db_admin=self,
            spawn_api_options=_final_api_options,
        )

    def drop_database(
        self,
        id: str,
        *,
        wait_until_active: bool = True,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
    ) -> None:
        """
        Drop a database, i.e. delete it completely and permanently with all its data.

        Args:
            id: The ID of the database to drop, e.g.
                "01234567-89ab-cdef-0123-456789abcdef".
            wait_until_active: if True (default), the method returns only after
                the database has actually been deleted (generally a few minutes).
                If False, it will return right after issuing the drop request
                to the DevOps API, and it will be the responsibility of the
                caller to check the database status/availability after that,
                if desired.
            database_admin_timeout_ms: a timeout, in milliseconds, for the whole
                requested operation to complete. This is used only if
                `wait_until_active` is true, i.e. if the method call must
                wait and keep querying the DevOps API for the status of the
                newly-deleted database.
            request_timeout_ms: a timeout, in milliseconds, for each underlying
                DevOps API HTTP request.
            timeout_ms: an alias for *both* the `request_timeout_ms` and
                `database_admin_timeout_ms` timeout parameters. In practice,
                regardless of `wait_until_active`, this parameter dictates an
                overall timeout on this method call.

        Note:
            a timeout event is no guarantee at all that the deletion request
            has not reached the API server and is not going to be, in fact,
            honored.

        Example:
            >>> database_list_pre = my_astra_db_admin.list_databases()
            >>> len(database_list_pre)
            3
            >>> my_astra_db_admin.drop_database("01234567-...")
            >>> database_list_post = my_astra_db_admin.list_databases()
            >>> len(database_list_post)
            2
        """

        _database_admin_timeout_ms, _da_label = _first_valid_timeout(
            (database_admin_timeout_ms, "database_admin_timeout_ms"),
            (timeout_ms, "timeout_ms"),
            (
                self.api_options.timeout_options.database_admin_timeout_ms,
                "database_admin_timeout_ms",
            ),
        )
        _request_timeout_ms, _rt_label = _first_valid_timeout(
            (request_timeout_ms, "request_timeout_ms"),
            (timeout_ms, "timeout_ms"),
            (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
        )
        timeout_manager = MultiCallTimeoutManager(
            overall_timeout_ms=_database_admin_timeout_ms,
            dev_ops_api=True,
            timeout_label=_da_label,
        )
        logger.info(f"dropping database '{id}' (DevOps API)")
        te_raw_response = self._dev_ops_api_commander.raw_request(
            http_method=HttpMethod.POST,
            additional_path=f"{id}/terminate",
            timeout_context=timeout_manager.remaining_timeout(
                cap_time_ms=_request_timeout_ms,
                cap_timeout_label=_rt_label,
            ),
        )
        if te_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_ACCEPTED:
            raise DevOpsAPIException(
                f"DB deletion ('{id}') failed: API returned HTTP "
                f"{te_raw_response.status_code} instead of "
                f"{DEV_OPS_RESPONSE_HTTP_ACCEPTED} - Accepted."
            )
        logger.info(f"DevOps API returned from dropping database '{id}'")
        if wait_until_active:
            last_status_seen: str | None = DEV_OPS_DATABASE_STATUS_TERMINATING
            _db_name: str | None = None
            while last_status_seen == DEV_OPS_DATABASE_STATUS_TERMINATING:
                logger.info(f"sleeping to poll for status of '{id}'")
                time.sleep(DEV_OPS_DATABASE_POLL_INTERVAL_S)
                # re-scan the full listing for the target database:
                detected_databases = [
                    a_db_info
                    for a_db_info in self._list_databases_ctx(
                        include=None,
                        provider=None,
                        page_size=None,
                        timeout_context=timeout_manager.remaining_timeout(
                            cap_time_ms=_request_timeout_ms,
                            cap_timeout_label=_rt_label,
                        ),
                    )
                    if a_db_info.id == id
                ]
                if detected_databases:
                    last_status_seen = detected_databases[0].status
                    _db_name = detected_databases[0].name
                else:
                    last_status_seen = None
            if last_status_seen is not None:
                _name_desc = f" ({_db_name})" if _db_name else ""
                raise DevOpsAPIException(
                    f"Database {id}{_name_desc} entered unexpected status "
                    f"{last_status_seen} after PENDING"
                )
        logger.info(f"finished dropping database '{id}' (DevOps API)")

    async def async_drop_database(
        self,
        id: str,
        *,
        wait_until_active: bool = True,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
    ) -> None:
        """
        Drop a database, i.e. delete it completely and permanently with all its data.
        Async version of the method, for use in an asyncio context.

        Args:
            id: The ID of the database to drop, e.g.
                "01234567-89ab-cdef-0123-456789abcdef".
            wait_until_active: if True (default), the method returns only after
                the database has actually been deleted (generally a few minutes).
                If False, it will return right after issuing the drop request
                to the DevOps API, and it will be the responsibility of the
                caller to check the database status/availability after that,
                if desired.
            database_admin_timeout_ms: a timeout, in milliseconds, for the whole
                requested operation to complete. This is used only if
                `wait_until_active` is true, i.e. if the method call must
                wait and keep querying the DevOps API for the status of the
                newly-deleted database.
            request_timeout_ms: a timeout, in milliseconds, for each underlying
                DevOps API HTTP request.
            timeout_ms: an alias for *both* the `request_timeout_ms` and
                `database_admin_timeout_ms` timeout parameters. In practice,
                regardless of `wait_until_active`, this parameter dictates an
                overall timeout on this method call.

        Note:
            a timeout event is no guarantee at all that the deletion request
            has not reached the API server and is not going to be, in fact,
            honored.

        Example:
            >>> asyncio.run(
            ...     my_astra_db_admin.async_drop_database("01234567-...")
            ... )
        """

        _database_admin_timeout_ms, _da_label = _first_valid_timeout(
            (database_admin_timeout_ms, "database_admin_timeout_ms"),
            (timeout_ms, "timeout_ms"),
            (
                self.api_options.timeout_options.database_admin_timeout_ms,
                "database_admin_timeout_ms",
            ),
        )
        _request_timeout_ms, _rt_label = _first_valid_timeout(
            (request_timeout_ms, "request_timeout_ms"),
            (timeout_ms, "timeout_ms"),
            (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
        )
        timeout_manager = MultiCallTimeoutManager(
            overall_timeout_ms=_database_admin_timeout_ms,
            dev_ops_api=True,
            timeout_label=_da_label,
        )
        logger.info(f"dropping database '{id}' (DevOps API), async")
        te_raw_response = await self._dev_ops_api_commander.async_raw_request(
            http_method=HttpMethod.POST,
            additional_path=f"{id}/terminate",
            timeout_context=timeout_manager.remaining_timeout(
                cap_time_ms=_request_timeout_ms,
                cap_timeout_label=_rt_label,
            ),
        )
        if te_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_ACCEPTED:
            raise DevOpsAPIException(
                f"DB deletion ('{id}') failed: API returned HTTP "
                f"{te_raw_response.status_code} instead of "
                f"{DEV_OPS_RESPONSE_HTTP_ACCEPTED} - Accepted."
            )
        logger.info(f"DevOps API returned from dropping database '{id}', async")
        if wait_until_active:
            last_status_seen: str | None = DEV_OPS_DATABASE_STATUS_TERMINATING
            _db_name: str | None = None
            while last_status_seen == DEV_OPS_DATABASE_STATUS_TERMINATING:
                logger.info(f"sleeping to poll for status of '{id}', async")
                await asyncio.sleep(DEV_OPS_DATABASE_POLL_INTERVAL_S)
                # re-scan the full listing for the target database:
                detected_databases = [
                    a_db_info
                    for a_db_info in await self._async_list_databases_ctx(
                        include=None,
                        provider=None,
                        page_size=None,
                        timeout_context=timeout_manager.remaining_timeout(
                            cap_time_ms=_request_timeout_ms,
                            cap_timeout_label=_rt_label,
                        ),
                    )
                    if a_db_info.id == id
                ]
                if detected_databases:
                    last_status_seen = detected_databases[0].status
                    _db_name = detected_databases[0].name
                else:
                    last_status_seen = None
            if last_status_seen is not None:
                _name_desc = f" ({_db_name})" if _db_name else ""
                raise DevOpsAPIException(
                    f"Database {id}{_name_desc} entered unexpected status "
                    f"{last_status_seen} after PENDING"
                )
        logger.info(f"finished dropping database '{id}' (DevOps API), async")

    def get_database_admin(
        self,
        api_endpoint_or_id: str | None = None,
        *,
        api_endpoint: str | None = None,
        id: str | None = None,
        region: str | None = None,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
        token: str | TokenProvider | UnsetType = _UNSET,
        spawn_api_options: APIOptions | UnsetType = _UNSET,
    ) -> AstraDBDatabaseAdmin:
        """
        Create an AstraDBDatabaseAdmin object for admin work within a certain database.

        Args:
            api_endpoint_or_id: positional parameter that can stand for both
                `api_endpoint` and `id`. Passing them together is an error.
            api_endpoint: the API Endpoint for the target database
                (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`).
                The database must exist already for the resulting object
                to be effectively used; in other words, this invocation
                does not create the database, just the object instance.
            id: the target database ID. This is alternative to using the API Endpoint.
            region: the region to use for connecting to the database. The
                database must be located in that region. This parameter can be
                used only if the database is specified by its ID (instead of
                API Endpoint). If this parameter is not passed, and cannot be
                inferred from the API endpoint, an additional DevOps API request
                is made to determine the default region and use it subsequently.
            database_admin_timeout_ms: a timeout, in milliseconds, to impose on the
                underlying DevOps API request for 'region', should it be necessary.
                If not provided, this object's defaults apply. (This method issues
                a single API request, hence all timeout parameters are treated
                the same.)
            request_timeout_ms: an alias for `database_admin_timeout_ms`.
            timeout_ms: an alias for `database_admin_timeout_ms`.
            token: if supplied, is passed to the Database instead of the one set
                for this object.
                This can be either a literal token string or a subclass of
                `astrapy.authentication.TokenProvider`.
            spawn_api_options: a specification - complete or partial - of the
                API Options to override the defaults inherited from the
                AstraDBAdmin. This allows for a deeper configuration of the
                database admin, e.g. concerning timeouts; if this is passed
                together with the named timeout parameters, the latter will
                take precedence in their respective settings.

        Returns:
            An AstraDBDatabaseAdmin instance representing the requested database.

        Example:
            >>> my_db_admin = my_astra_db_admin.get_database_admin("01234567-...")
            >>> my_db_admin.list_keyspaces()
            ['default_keyspace']
            >>> my_db_admin.create_keyspace("that_other_one")
            >>> my_db_admin.list_keyspaces()
            ['default_keyspace', 'that_other_one']

        Note:
            This method does not perform any admin-level operation through
            the DevOps API. For actual creation of a database, see the
            `create_database` method.
        """

        _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da(
            timeout_options=self.api_options.timeout_options,
            database_admin_timeout_ms=database_admin_timeout_ms,
            request_timeout_ms=request_timeout_ms,
            timeout_ms=timeout_ms,
        )
        _api_endpoint_p, _id_p = check_id_endpoint_parg_kwargs(
            p_arg=api_endpoint_or_id, api_endpoint=api_endpoint, id=id
        )
        _final_api_options = self.api_options.with_override(
            spawn_api_options
        ).with_override(APIOptions(token=token))
        # handle the "endpoint passed as id" case first:
        if _api_endpoint_p is not None:
            if region is not None:
                raise ValueError(
                    "Parameter `region` not supported with an API endpoint."
                )
            return AstraDBDatabaseAdmin.from_astra_db_admin(
                api_endpoint=_api_endpoint_p,
                astra_db_admin=self,
                spawn_api_options=_final_api_options,
            )
        else:
            if _id_p is None:
                raise ValueError("Either `api_endpoint` or `id` must be supplied.")
            _region = normalize_region_for_id(
                database_id=_id_p,
                environment=self.api_options.environment,
                region_param=region,
                request_timeout_ms=_database_admin_timeout_ms,
                api_options=self.api_options,
            )
            return AstraDBDatabaseAdmin.from_astra_db_admin(
                api_endpoint=build_api_endpoint(
                    environment=self.api_options.environment,
                    database_id=_id_p,
                    region=_region,
                ),
                astra_db_admin=self,
                spawn_api_options=_final_api_options,
            )

    def get_database(
        self,
        api_endpoint_or_id: str | None = None,
        *,
        api_endpoint: str | None = None,
        keyspace: str | None = None,
        id: str | None = None,
        region: str | None = None,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
        token: str | TokenProvider | UnsetType = _UNSET,
        spawn_api_options: APIOptions | UnsetType = _UNSET,
    ) -> Database:
        """
        Create a Database instance for a specific database, to be used
        when doing data-level work (such as creating/managing collections).

        Args:
            api_endpoint_or_id: positional parameter that can stand for both
                `api_endpoint` and `id`. Passing them together is an error.
            api_endpoint: the API Endpoint for the target database
                (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`).
                The database must exist already for the resulting object
                to be effectively used; in other words, this invocation
                does not create the database, just the object instance.
            keyspace: used to specify a certain keyspace the resulting
                Database will primarily work on. If not specified, an additional
                DevOps API call reveals the default keyspace for the target database.
            id: the target database ID. This is alternative to using the API Endpoint.
            region: the region to use for connecting to the database. The
                database must be located in that region. This parameter can be
                used only if the database is specified by its ID (instead of
                API Endpoint). If this parameter is not passed, and cannot be
                inferred from the API endpoint, an additional DevOps API request
                is made to determine the default region and use it subsequently.
            database_admin_timeout_ms: a timeout, in milliseconds, to impose on the
                underlying DevOps API request for 'region', should it be necessary.
                If not provided, this object's defaults apply. (This method issues
                a single API request, hence all timeout parameters are treated
                the same.)
            request_timeout_ms: an alias for `database_admin_timeout_ms`.
            timeout_ms: an alias for `database_admin_timeout_ms`.
            token: if supplied, is passed to the Database instead of the one set
                for this object.
                This can be either a literal token string or a subclass of
                `astrapy.authentication.TokenProvider`.
            spawn_api_options: a specification - complete or partial - of the
                API Options to override the defaults inherited from the
                AstraDBAdmin. This allows for a deeper configuration of the
                database, e.g. concerning timeouts; if this is passed together
                with the named timeout parameters, the latter will take
                precedence in their respective settings.

        Returns:
            A Database object ready to be used.

        Example:
            >>> my_db = my_astra_db_admin.get_database(
            ...     "https://<ID>-<REGION>.apps.astra.datastax.com",
            ...     keyspace="my_prod_keyspace",
            ... )
            >>> my_coll = my_db.create_collection(
            ...     "movies",
            ...     definition=(
            ...         CollectionDefinition.builder()
            ...         .set_vector_dimension(2)
            ...         .build()
            ...     )
            ... )
            >>> my_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]})
        """

        _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da(
            timeout_options=self.api_options.timeout_options,
            database_admin_timeout_ms=database_admin_timeout_ms,
            request_timeout_ms=request_timeout_ms,
            timeout_ms=timeout_ms,
        )
        _api_endpoint_p, _id_p = check_id_endpoint_parg_kwargs(
            p_arg=api_endpoint_or_id, api_endpoint=api_endpoint, id=id
        )
        # lazy importing here to avoid circular dependency
        from astrapy import Database

        _final_api_options = self.api_options.with_override(
            spawn_api_options
        ).with_override(
            APIOptions(token=token),
        )
        _keyspace: str | None
        # handle the "endpoint passed as id" case first:
        if _api_endpoint_p is not None:
            if region is not None:
                raise ValueError(
                    "Parameter `region` not supported with an API endpoint."
                )
            if keyspace:
                _keyspace = keyspace
            else:
                parsed_api_endpoint = parse_api_endpoint(_api_endpoint_p)
                if parsed_api_endpoint is None:
                    msg = api_endpoint_parsing_error_message(_api_endpoint_p)
                    raise ValueError(msg)
                _keyspace = (
                    (
                        self.database_info(
                            parsed_api_endpoint.database_id,
                            timeout_ms=_database_admin_timeout_ms,
                        ).raw
                        or {}
                    ).get("info")
                    or {}
                ).get("keyspace", DEFAULT_ASTRA_DB_KEYSPACE)
            return Database(
                api_endpoint=_api_endpoint_p,
                keyspace=_keyspace,
                api_options=_final_api_options,
            )
        else:
            # the case where an ID is passed:
            if _id_p is None:
                raise ValueError("Either `api_endpoint` or `id` must be supplied.")
            _region = normalize_region_for_id(
                database_id=_id_p,
                environment=self.api_options.environment,
                region_param=region,
                request_timeout_ms=_database_admin_timeout_ms,
                api_options=self.api_options,
            )
            if keyspace:
                _keyspace = keyspace
            else:
                _keyspace = (
                    (
                        self.database_info(
                            _id_p, timeout_ms=_database_admin_timeout_ms
                        ).raw
                        or {}
                    ).get("info")
                    or {}
                ).get("keyspace", DEFAULT_ASTRA_DB_KEYSPACE)
            return Database(
                api_endpoint=build_api_endpoint(
                    environment=self.api_options.environment,
                    database_id=_id_p,
                    region=_region,
                ),
                keyspace=_keyspace,
                api_options=_final_api_options,
            )

    def get_async_database(
        self,
        api_endpoint_or_id: str | None = None,
        *,
        api_endpoint: str | None = None,
        keyspace: str | None = None,
        id: str | None = None,
        region: str | None = None,
        database_admin_timeout_ms: int | None = None,
        request_timeout_ms: int | None = None,
        timeout_ms: int | None = None,
        token: str | TokenProvider | UnsetType = _UNSET,
        spawn_api_options: APIOptions | UnsetType = _UNSET,
    ) -> AsyncDatabase:
        """
        Create an AsyncDatabase instance for a specific database, to be used
        when doing data-level work (such as creating/managing collections).

        Args:
            api_endpoint_or_id: positional parameter that can stand for both
                `api_endpoint` and `id`. Passing them together is an error.
            api_endpoint: the API Endpoint for the target database
                (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`).
                The database must exist already for the resulting object
                to be effectively used; in other words, this invocation
                does not create the database, just the object instance.
            keyspace: used to specify a certain keyspace the resulting
                AsyncDatabase will primarily work on. If not specified, an
                additional DevOps API call reveals the default keyspace for
                the target database.
            id: the target database ID. This is alternative to using the API Endpoint.
            region: the region to use for connecting to the database. The
                database must be located in that region. This parameter can be
                used only if the database is specified by its ID (instead of
                API Endpoint). If this parameter is not passed, and cannot be
                inferred from the API endpoint, an additional DevOps API request
                is made to determine the default region and use it subsequently.
            database_admin_timeout_ms: a timeout, in milliseconds, to impose on the
                underlying DevOps API request for 'region', should it be necessary.
                If not provided, this object's defaults apply. (This method issues
                a single API request, hence all timeout parameters are treated
                the same.)
            request_timeout_ms: an alias for `database_admin_timeout_ms`.
            timeout_ms: an alias for `database_admin_timeout_ms`.
            token: if supplied, is passed to the Database instead of the one set
                for this object.
                This can be either a literal token string or a subclass of
                `astrapy.authentication.TokenProvider`.
            spawn_api_options: a specification - complete or partial - of the
                API Options to override the defaults inherited from the
                AstraDBAdmin. This allows for a deeper configuration of the
                database, e.g. concerning timeouts; if this is passed together
                with the named timeout parameters, the latter will take
                precedence in their respective settings.

        Returns:
            An AsyncDatabase object ready to be used.

        Example:
            >>> async def create_use_collection(
            ...     admin: AstraDBAdmin,
            ...     api_endpoint: str,
            ...     keyspace: str,
            ... ) -> None:
            ...     my_async_db = admin.get_async_database(
            ...         api_endpoint,
            ...         keyspace=keyspace,
            ...     )
            ...     a_coll = await my_async_db.create_collection(
            ...         "movies",
            ...         definition=(
            ...             CollectionDefinition.builder()
            ...             .set_vector_dimension(2)
            ...             .build()
            ...         )
            ...     )
            ...     await a_coll.insert_one(
            ...         {"title": "The Title", "$vector": [0.3, 0.4]}
            ...     )
            ...
            >>> asyncio.run(create_use_collection(
            ...     my_admin,
            ...     "https://<ID>-<REGION>.apps.astra.datastax.com",
            ...     "default_keyspace",
            ... ))
            >>>
        """

        return self.get_database(
            api_endpoint_or_id=api_endpoint_or_id,
            api_endpoint=api_endpoint,
            token=token,
            keyspace=keyspace,
            id=id,
            region=region,
            database_admin_timeout_ms=database_admin_timeout_ms,
            request_timeout_ms=request_timeout_ms,
            timeout_ms=timeout_ms,
            spawn_api_options=spawn_api_options,
        ).to_async()
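Pulling the methods above together, a typical synchronous lifecycle might look as follows (a sketch only; the names, the region and the empty listing result are illustrative):
>>> new_db_admin = my_astra_db_admin.create_database(
...     "demo_database",
...     cloud_provider="aws",
...     region="eu-west-1",
... )
>>> demo_db = new_db_admin.get_database()
>>> demo_db.list_collection_names()
[]
>>> my_astra_db_admin.drop_database(new_db_admin.id)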
Methods
async def async_create_database(self, name: str, *, cloud_provider: str, region: str, keyspace: str | None = None, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AstraDBDatabaseAdmin
-
Create a database as requested, optionally waiting for it to be ready. This is an awaitable method suitable for use within an asyncio event loop.
Args
name
- the desired name for the database.
cloud_provider
- one of 'aws', 'gcp' or 'azure'.
region
- any of the available cloud regions.
keyspace
- name for the one keyspace the database starts with. If omitted, DevOps API will use its default.
wait_until_active
- if True (default), the method returns only after the newly-created database is in ACTIVE state (a few minutes, usually). If False, it will return right after issuing the creation request to the DevOps API, and it will be the responsibility of the caller to check the database status before working with it.
database_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-created database.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and database_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
token
- if supplied, is passed to the returned Database instead of the one set for this object. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
An AstraDBDatabaseAdmin instance.
Note
a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored.
Example
>>> asyncio.run(
...     my_astra_db_admin.async_create_database(
...         "new_database",
...         cloud_provider="aws",
...         region="ap-south-1",
...     )
... )
AstraDBDatabaseAdmin(id=...)
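When wait_until_active=False, status checking falls to the caller. A hedged sketch of that pattern, reusing the admin object from the example above (the 'PENDING' value is illustrative of the states this page mentions):
>>> async def create_without_waiting() -> str:
...     db_admin = await my_astra_db_admin.async_create_database(
...         "new_database",
...         cloud_provider="aws",
...         region="ap-south-1",
...         wait_until_active=False,
...     )
...     db_info = await my_astra_db_admin.async_database_info(db_admin.id)
...     return db_info.status  # re-check until it reads 'ACTIVE'
...
>>> asyncio.run(create_without_waiting())
'PENDING'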
async def async_database_info(self, id: str, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AstraDBAdminDatabaseInfo
-
Get the full information on a given database, through a request to the DevOps API. This is an awaitable method suitable for use within an asyncio event loop.
Args
id
- the ID of the target database, e.g. "01234567-89ab-cdef-0123-456789abcdef".
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
An AstraDBAdminDatabaseInfo object.
Example
>>> async def check_if_db_active(db_id: str) -> bool:
...     db_info = await my_astra_db_admin.async_database_info(db_id)
...     return db_info.status == "ACTIVE"
...
>>> asyncio.run(check_if_db_active("01234567-..."))
True
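For this single-request method the three timeout parameters are interchangeable, so any one of them caps the DevOps API call. A minimal sketch (the 20-second figure is purely illustrative):
>>> db_info = asyncio.run(
...     my_astra_db_admin.async_database_info(
...         "01234567-...",
...         timeout_ms=20000,  # equivalent to database_admin_timeout_ms=20000
...     )
... )
>>> db_info.status
'ACTIVE'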
Expand source code
async def async_database_info( self, id: str, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBAdminDatabaseInfo: """ Get the full information on a given database, through a request to the DevOps API. This is an awaitable method suitable for use within an asyncio event loop. Args: id: the ID of the target database, e. g. "01234567-89ab-cdef-0123-456789abcdef". database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: An AstraDBAdminDatabaseInfo object. Example: >>> async def check_if_db_active(db_id: str) -> bool: ... db_info = await my_astra_db_admin.async_database_info(db_id) ... return db_info.status == "ACTIVE" ... >>> asyncio.run(check_if_db_active("01234567-...")) True """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return await self._async_database_info_ctx( id=id, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), )
async def async_drop_database(self, id: str, *, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop a database, i.e. delete it completely and permanently with all its data. Async version of the method, for use in an asyncio context.
Args
id
- The ID of the database to drop, e.g. "01234567-89ab-cdef-0123-456789abcdef".
wait_until_active
- if True (default), the method returns only after the database has actually been deleted (generally a few minutes). If False, it will return right after issuing the drop request to the DevOps API, and it will be the responsibility of the caller to check the database status/availability after that, if desired.
database_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-deleted database.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and database_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored.
Example
>>> asyncio.run(
...     my_astra_db_admin.async_drop_database("01234567-...")
... )
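With wait_until_active=False the call returns as soon as the termination request is accepted, and checking for completion is left to the caller. A sketch of one way to poll, assuming the database vanishes from the default (non-terminated) listing once deletion completes; the polling loop is illustrative, not part of the API:
>>> async def drop_and_wait(db_id: str) -> None:
...     await my_astra_db_admin.async_drop_database(
...         db_id,
...         wait_until_active=False,
...     )
...     # illustrative manual polling on the non-terminated listing:
...     while any(
...         db.id == db_id
...         for db in await my_astra_db_admin.async_list_databases()
...     ):
...         await asyncio.sleep(10)
...
>>> asyncio.run(drop_and_wait("01234567-..."))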
Expand source code
async def async_drop_database( self, id: str, *, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop a database, i.e. delete it completely and permanently with all its data. Async version of the method, for use in an asyncio context. Args: id: The ID of the database to drop, e. g. "01234567-89ab-cdef-0123-456789abcdef". wait_until_active: if True (default), the method returns only after the database has actually been deleted (generally a few minutes). If False, it will return right after issuing the drop request to the DevOps API, and it will be responsibility of the caller to check the database status/availability after that, if desired. database_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-deleted database. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `database_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> asyncio.run( ... my_astra_db_admin.async_drop_database("01234567-...") ... ) """ _database_admin_timeout_ms, _da_label = _first_valid_timeout( (database_admin_timeout_ms, "database_admin_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.database_admin_timeout_ms, "database_admin_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_database_admin_timeout_ms, dev_ops_api=True, timeout_label=_da_label, ) logger.info(f"dropping database '{id}' (DevOps API), async") te_raw_response = await self._dev_ops_api_commander.async_raw_request( http_method=HttpMethod.POST, additional_path=f"{id}/terminate", timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if te_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_ACCEPTED: raise DevOpsAPIException( f"DB deletion ('{id}') failed: API returned HTTP " f"{te_raw_response.status_code} instead of " f"{DEV_OPS_RESPONSE_HTTP_ACCEPTED} - Created" ) logger.info(f"DevOps API returned from dropping database '{id}', async") if wait_until_active: last_status_seen: str | None = DEV_OPS_DATABASE_STATUS_TERMINATING _db_name: str | None = None while last_status_seen == DEV_OPS_DATABASE_STATUS_TERMINATING: logger.info(f"sleeping to poll for status of '{id}', async") await asyncio.sleep(DEV_OPS_DATABASE_POLL_INTERVAL_S) # detected_databases = [ a_db_info for a_db_info in await self._async_list_databases_ctx( include=None, provider=None, page_size=None, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if a_db_info.id == id ] if detected_databases: last_status_seen = detected_databases[0].status _db_name = detected_databases[0].name else: last_status_seen = None if last_status_seen is not None: _name_desc = f" 
({_db_name})" if _db_name else "" raise DevOpsAPIException( f"Database {id}{_name_desc} entered unexpected status " f"{last_status_seen} after PENDING" ) logger.info(f"finished dropping database '{id}' (DevOps API), async")
async def async_list_databases(self, *, include: str | None = None, provider: str | None = None, page_size: int | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[AstraDBAdminDatabaseInfo]
-
Get the list of databases, as obtained with a request to the DevOps API. Async version of the method, for use in an asyncio context.
Args
include
- a filter on what databases are to be returned. As per DevOps API, defaults to "nonterminated". Pass "all" to include the already terminated databases.
provider
- a filter on the cloud provider for the databases. As per DevOps API, defaults to "ALL". Pass e.g. "AWS" to restrict the results.
page_size
- number of results per page from the DevOps API.
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (While in the case of very many databases this method may entail multiple DevOps API requests, it is assumed here that this method amounts almost always to one single request: the only timeout imposed on this method execution is one acting on each individual request, with no checks on its overall completion time.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
A list of AstraDBAdminDatabaseInfo objects.
Example
>>> async def check_if_db_exists(db_id: str) -> bool:
...     db_list = await my_astra_db_admin.async_list_databases()
...     return db_id in [db.id for db in db_list]
...
>>> asyncio.run(check_if_db_exists("01234567-..."))
True
>>> asyncio.run(check_if_db_exists("xyz"))
False
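The include and provider filters can be combined. A minimal sketch, assuming the organization has exactly one AWS database (possibly terminated):
>>> async def list_all_aws_ids() -> list[str]:
...     db_list = await my_astra_db_admin.async_list_databases(
...         include="all",   # also return terminated databases
...         provider="AWS",  # restrict results to one cloud provider
...     )
...     return [db.id for db in db_list]
...
>>> asyncio.run(list_all_aws_ids())
['01234567-...']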
Expand source code
async def async_list_databases( self, *, include: str | None = None, provider: str | None = None, page_size: int | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[AstraDBAdminDatabaseInfo]: """ Get the list of databases, as obtained with a request to the DevOps API. Async version of the method, for use in an asyncio context. Args: include: a filter on what databases are to be returned. As per DevOps API, defaults to "nonterminated". Pass "all" to include the already terminated databases. provider: a filter on the cloud provider for the databases. As per DevOps API, defaults to "ALL". Pass e.g. "AWS" to restrict the results. page_size: number of results per page from the DevOps API. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (While in the case of very many databases this method may entail multiple DevOps API requests, it is assumed here that this method amounts almost always to one single request: the only timeout imposed on this method execution is one acting on each individual request, with no checks on its overall completion time.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A list of AstraDBAdminDatabaseInfo objects. Example: >>> async def check_if_db_exists(db_id: str) -> bool: ... db_list = await my_astra_db_admin.async_list_databases() ... return db_id in db_list ... >>> asyncio.run(check_if_db_exists("xyz")) True >>> asyncio.run(check_if_db_exists("01234567-...")) False """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return await self._async_list_databases_ctx( include=include, provider=provider, page_size=page_size, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), )
def create_database(self, name: str, *, cloud_provider: str, region: str, keyspace: str | None = None, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AstraDBDatabaseAdmin
-
Create a database as requested, optionally waiting for it to be ready.
Args
name
- the desired name for the database.
cloud_provider
- one of 'aws', 'gcp' or 'azure'.
region
- any of the available cloud regions.
keyspace
- name for the one keyspace the database starts with. If omitted, the DevOps API will use its default.
wait_until_active
- if True (default), the method returns only after the newly-created database is in ACTIVE state (a few minutes, usually). If False, it will return right after issuing the creation request to the DevOps API, and it will be the responsibility of the caller to check the database status before working with it.
database_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-created database.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and database_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
token
- if supplied, is passed to the returned Database instead of the one set for this object. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
An AstraDBDatabaseAdmin instance.
Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored.
Example
>>> my_new_db_admin = my_astra_db_admin.create_database(
...     "new_database",
...     cloud_provider="aws",
...     region="ap-south-1",
... )
>>> my_new_db = my_new_db_admin.get_database()
>>> my_coll = my_new_db.create_collection(
...     "movies",
...     definition=(
...         CollectionDefinition.builder()
...         .set_vector_dimension(2)
...         .build()
...     )
... )
>>> my_coll.insert_one({"title": "The Title", "$vector": [0.1, 0.2]})
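If blocking for several minutes is undesirable, one can pass wait_until_active=False and check readiness later through database_info. A minimal sketch (the 'PENDING' status shown is illustrative of a database still being provisioned):
>>> new_db_admin = my_astra_db_admin.create_database(
...     "new_database",
...     cloud_provider="aws",
...     region="ap-south-1",
...     wait_until_active=False,
... )
>>> # to be repeated until the status reads 'ACTIVE':
>>> my_astra_db_admin.database_info(new_db_admin.id).status
'PENDING'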
Expand source code
def create_database( self, name: str, *, cloud_provider: str, region: str, keyspace: str | None = None, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AstraDBDatabaseAdmin: """ Create a database as requested, optionally waiting for it to be ready. Args: name: the desired name for the database. cloud_provider: one of 'aws', 'gcp' or 'azure'. region: any of the available cloud regions. keyspace: name for the one keyspace the database starts with. If omitted, DevOps API will use its default. wait_until_active: if True (default), the method returns only after the newly-created database is in ACTIVE state (a few minutes, usually). If False, it will return right after issuing the creation request to the DevOps API, and it will be responsibility of the caller to check the database status before working with it. database_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-created database. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `database_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. token: if supplied, is passed to the returned Database instead of the one set for this object. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: An AstraDBDatabaseAdmin instance. Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored. Example: >>> my_new_db_admin = my_astra_db_admin.create_database( ... "new_database", ... cloud_provider="aws", ... region="ap-south-1", ... ) >>> my_new_db = my_new_db_admin.get_database() >>> my_coll = my_new_db.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ) ... 
) >>> my_coll.insert_one({"title": "The Title", "$vector": [0.1, 0.2]}) """ _database_admin_timeout_ms, _da_label = _first_valid_timeout( (database_admin_timeout_ms, "database_admin_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.database_admin_timeout_ms, "database_admin_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) cd_payload = { k: v for k, v in { "name": name, "tier": "serverless", "cloudProvider": cloud_provider, "region": region, "capacityUnits": 1, "dbType": "vector", "keyspace": keyspace, }.items() if v is not None } timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_database_admin_timeout_ms, dev_ops_api=True, timeout_label=_da_label, ) logger.info( f"creating database {name}/({cloud_provider}, {region}) (DevOps API)" ) cd_raw_response = self._dev_ops_api_commander.raw_request( http_method=HttpMethod.POST, payload=cd_payload, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if cd_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_CREATED: raise DevOpsAPIException( f"DB creation ('{name}') failed: API returned HTTP " f"{cd_raw_response.status_code} instead of " f"{DEV_OPS_RESPONSE_HTTP_CREATED} - Created." ) new_database_id = cd_raw_response.headers["Location"] logger.info( "DevOps API returned from creating database " f"{name}/({cloud_provider}, {region})" ) if wait_until_active: last_status_seen = DEV_OPS_DATABASE_STATUS_PENDING while last_status_seen in { DEV_OPS_DATABASE_STATUS_PENDING, DEV_OPS_DATABASE_STATUS_INITIALIZING, }: logger.info(f"sleeping to poll for status of '{new_database_id}'") time.sleep(DEV_OPS_DATABASE_POLL_INTERVAL_S) last_db_info = self._database_info_ctx( id=new_database_id, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) last_status_seen = last_db_info.status if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE: raise DevOpsAPIException( f"Database {name} entered unexpected status {last_status_seen} after PENDING" ) # return the database instance logger.info( f"finished creating database '{new_database_id}' = " f"{name}/({cloud_provider}, {region}) (DevOps API)" ) _final_api_options = self.api_options.with_override( spawn_api_options ).with_override(APIOptions(token=token)) return AstraDBDatabaseAdmin.from_astra_db_admin( api_endpoint=build_api_endpoint( environment=self.api_options.environment, database_id=new_database_id, region=region, ), astra_db_admin=self, spawn_api_options=_final_api_options, )
def database_info(self, id: str, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AstraDBAdminDatabaseInfo
-
Get the full information on a given database, through a request to the DevOps API.
Args
id
- the ID of the target database, e.g. "01234567-89ab-cdef-0123-456789abcdef".
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
An AstraDBAdminDatabaseInfo object.
Example
>>> details_of_my_db = my_astra_db_admin.database_info("01234567-...")
>>> details_of_my_db.id
'01234567-...'
>>> details_of_my_db.status
'ACTIVE'
>>> details_of_my_db.info.region
'eu-west-1'
Expand source code
def database_info( self, id: str, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBAdminDatabaseInfo: """ Get the full information on a given database, through a request to the DevOps API. Args: id: the ID of the target database, e. g. "01234567-89ab-cdef-0123-456789abcdef". database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: An AstraDBAdminDatabaseInfo object. Example: >>> details_of_my_db = my_astra_db_admin.database_info("01234567-...") >>> details_of_my_db.id '01234567-...' >>> details_of_my_db.status 'ACTIVE' >>> details_of_my_db.info.region 'eu-west-1' """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return self._database_info_ctx( id=id, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), )
def drop_database(self, id: str, *, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop a database, i.e. delete it completely and permanently with all its data.
Args
id
- The ID of the database to drop, e.g. "01234567-89ab-cdef-0123-456789abcdef".
wait_until_active
- if True (default), the method returns only after the database has actually been deleted (generally a few minutes). If False, it will return right after issuing the drop request to the DevOps API, and it will be the responsibility of the caller to check the database status/availability after that, if desired.
database_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-deleted database.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and database_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored.
Example
>>> database_list_pre = my_astra_db_admin.list_databases()
>>> len(database_list_pre)
3
>>> my_astra_db_admin.drop_database("01234567-...")
>>> database_list_post = my_astra_db_admin.list_databases()
>>> len(database_list_post)
2
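Since terminated databases are excluded from the default listing, passing include="all" can serve as a post-drop check. A sketch under the assumption that dropped databases are reported with a 'TERMINATED' status:
>>> all_databases = my_astra_db_admin.list_databases(include="all")
>>> [db.status for db in all_databases if db.id == "01234567-..."]
['TERMINATED']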
Expand source code
def drop_database( self, id: str, *, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop a database, i.e. delete it completely and permanently with all its data. Args: id: The ID of the database to drop, e. g. "01234567-89ab-cdef-0123-456789abcdef". wait_until_active: if True (default), the method returns only after the database has actually been deleted (generally a few minutes). If False, it will return right after issuing the drop request to the DevOps API, and it will be responsibility of the caller to check the database status/availability after that, if desired. database_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-deleted database. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `database_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> database_list_pre = my_astra_db_admin.list_databases() >>> len(database_list_pre) 3 >>> my_astra_db_admin.drop_database("01234567-...") >>> database_list_post = my_astra_db_admin.list_databases() >>> len(database_list_post) 2 """ _database_admin_timeout_ms, _da_label = _first_valid_timeout( (database_admin_timeout_ms, "database_admin_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.database_admin_timeout_ms, "database_admin_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_database_admin_timeout_ms, dev_ops_api=True, timeout_label=_da_label, ) logger.info(f"dropping database '{id}' (DevOps API)") te_raw_response = self._dev_ops_api_commander.raw_request( http_method=HttpMethod.POST, additional_path=f"{id}/terminate", timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if te_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_ACCEPTED: raise DevOpsAPIException( f"DB deletion ('{id}') failed: API returned HTTP " f"{te_raw_response.status_code} instead of " f"{DEV_OPS_RESPONSE_HTTP_ACCEPTED} - Created" ) logger.info(f"DevOps API returned from dropping database '{id}'") if wait_until_active: last_status_seen: str | None = DEV_OPS_DATABASE_STATUS_TERMINATING _db_name: str | None = None while last_status_seen == DEV_OPS_DATABASE_STATUS_TERMINATING: logger.info(f"sleeping to poll for status of '{id}'") time.sleep(DEV_OPS_DATABASE_POLL_INTERVAL_S) # detected_databases = [ a_db_info for a_db_info in self._list_databases_ctx( include=None, provider=None, page_size=None, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if a_db_info.id == id ] if detected_databases: last_status_seen = detected_databases[0].status _db_name = detected_databases[0].name else: last_status_seen = None if last_status_seen is not None: 
_name_desc = f" ({_db_name})" if _db_name else "" raise DevOpsAPIException( f"Database {id}{_name_desc} entered unexpected status " f"{last_status_seen} after PENDING" ) logger.info(f"finished dropping database '{id}' (DevOps API)")
def get_async_database(self, api_endpoint_or_id: str | None = None, *, api_endpoint: str | None = None, keyspace: str | None = None, id: str | None = None, region: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncDatabase
-
Create an AsyncDatabase instance for a specific database, to be used when doing data-level work (such as creating/managing collections).
Args
api_endpoint_or_id
- positional parameter that can stand for both api_endpoint and id. Passing them together is an error.
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance.
keyspace
- used to specify a certain keyspace the resulting AsyncDatabase will primarily work on. If not specified, an additional DevOps API call reveals the default keyspace for the target database.
id
- the target database ID. This is an alternative to using the API Endpoint.
region
- the region to use for connecting to the database. The database must be located in that region. This parameter can be used only if the database is specified by its ID (instead of API Endpoint). If this parameter is not passed, and cannot be inferred from the API endpoint, an additional DevOps API request is made to determine the default region and use it subsequently.
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
token
- if supplied, is passed to the Database instead of the one set for this object. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
An AsyncDatabase object ready to be used.
Example
>>> async def create_use_collection(
...     admin: AstraDBAdmin,
...     api_endpoint: str,
...     keyspace: str,
... ) -> None:
...     my_async_db = admin.get_async_database(
...         api_endpoint,
...         keyspace=keyspace,
...     )
...     a_coll = await my_async_db.create_collection(
...         "movies",
...         definition=(
...             CollectionDefinition.builder()
...             .set_vector_dimension(2)
...             .build()
...         )
...     )
...     await a_coll.insert_one(
...         {"title": "The Title", "$vector": [0.3, 0.4]}
...     )
...
>>> asyncio.run(create_use_collection(
...     my_admin,
...     "https://<ID>-<REGION>.apps.astra.datastax.com",
...     "default_keyspace",
... ))
>>>
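The database can equivalently be referenced by its ID; supplying region as well spares the extra DevOps API lookup described above. A minimal sketch:
>>> my_async_db = my_astra_db_admin.get_async_database(
...     id="01234567-...",
...     region="us-east-1",  # if omitted, an extra DevOps API request finds it
...     keyspace="default_keyspace",
... )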
Expand source code
def get_async_database( self, api_endpoint_or_id: str | None = None, *, api_endpoint: str | None = None, keyspace: str | None = None, id: str | None = None, region: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create an AsyncDatabase instance for a specific database, to be used when doing data-level work (such as creating/managing collections). Args: api_endpoint_or_id: positional parameter that can stand for both `api_endpoint` and `id`. Passing them together is an error. api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. keyspace: used to specify a certain keyspace the resulting AsyncDatabase will primarily work on. If not specified, an additional DevOps API call reveals the default keyspace for the target database. id: the target database ID. This is alternative to using the API Endpoint. region: the region to use for connecting to the database. The database must be located in that region. This parameter can be used only if the database is specified by its ID (instead of API Endpoint). If this parameter is not passed, and cannot be inferred from the API endpoint, an additional DevOps API request is made to determine the default region and use it subsequently. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. token: if supplied, is passed to the Database instead of the one set for this object. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: An AsyncDatabase object ready to be used. Example: >>> async def create_use_collection( ... admin: AstraDBAdmin, ... api_endpoint: str, ... keyspace: str, ... ) -> None: ... my_async_db = admin.get_async_database( ... api_endpoint, ... keyspace=keyspace, ... ) ... a_coll = await my_async_db.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ) ... ) ... await a_coll.insert_one( ... {"title": "The Title", "$vector": [0.3, 0.4]} ... ) ... >>> asyncio.run(create_use_collection( ... my_admin, ... "https://<ID>-<REGION>.apps.astra.datastax.com", ... "default_keyspace", ... )) >>> """ return self.get_database( api_endpoint_or_id=api_endpoint_or_id, api_endpoint=api_endpoint, token=token, keyspace=keyspace, id=id, region=region, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, spawn_api_options=spawn_api_options, ).to_async()
def get_database(self, api_endpoint_or_id: str | None = None, *, api_endpoint: str | None = None, keyspace: str | None = None, id: str | None = None, region: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Database
-
Create a Database instance for a specific database, to be used when doing data-level work (such as creating/managing collections).
Args
api_endpoint_or_id
- positional parameter that can stand for both api_endpoint and id. Passing them together is an error.
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance.
keyspace
- used to specify a certain keyspace the resulting Database will primarily work on. If not specified, an additional DevOps API call reveals the default keyspace for the target database.
id
- the target database ID. This is an alternative to using the API Endpoint.
region
- the region to use for connecting to the database. The database must be located in that region. This parameter can be used only if the database is specified by its ID (instead of API Endpoint). If this parameter is not passed, and cannot be inferred from the API endpoint, an additional DevOps API request is made to determine the default region and use it subsequently.
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
token
- if supplied, is passed to the Database instead of the one set for this object. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
A Database object ready to be used.
Example
>>> my_db = my_astra_db_admin.get_database(
...     "https://<ID>-<REGION>.apps.astra.datastax.com",
...     keyspace="my_prod_keyspace",
... )
>>> my_coll = my_db.create_collection(
...     "movies",
...     definition=(
...         CollectionDefinition.builder()
...         .set_vector_dimension(2)
...         .build()
...     )
... )
>>> my_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]})
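As described for the token parameter above, the spawned Database can be given its own credential, e.g. a narrower data-only token. A minimal sketch:
>>> my_db = my_astra_db_admin.get_database(
...     "https://<ID>-<REGION>.apps.astra.datastax.com",
...     keyspace="my_prod_keyspace",
...     token="AstraCS:weaker...",  # used by the Database instead of the admin's token
... )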
Expand source code
def get_database( self, api_endpoint_or_id: str | None = None, *, api_endpoint: str | None = None, keyspace: str | None = None, id: str | None = None, region: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a Database instance for a specific database, to be used when doing data-level work (such as creating/managing collections). Args: api_endpoint_or_id: positional parameter that can stand for both `api_endpoint` and `id`. Passing them together is an error. api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. keyspace: used to specify a certain keyspace the resulting Database will primarily work on. If not specified, an additional DevOps API call reveals the default keyspace for the target database. id: the target database ID. This is alternative to using the API Endpoint. region: the region to use for connecting to the database. The database must be located in that region. This parameter can be used only if the database is specified by its ID (instead of API Endpoint). If this parameter is not passed, and cannot be inferred from the API endpoint, an additional DevOps API request is made to determine the default region and use it subsequently. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. token: if supplied, is passed to the Database instead of the one set for this object. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: A Database object ready to be used. Example: >>> my_db = my_astra_db_admin.get_database( ... "https://<ID>-<REGION>.apps.astra.datastax.com", ... keyspace="my_prod_keyspace", ... ) >>> coll = my_db.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ) ... 
) >>> my_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]}) """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _api_endpoint_p, _id_p = check_id_endpoint_parg_kwargs( p_arg=api_endpoint_or_id, api_endpoint=api_endpoint, id=id ) # lazy importing here to avoid circular dependency from astrapy import Database _final_api_options = self.api_options.with_override( spawn_api_options ).with_override( APIOptions(token=token), ) _keyspace: str | None # handle the "endpoint passed as id" case first: if _api_endpoint_p is not None: if region is not None: raise ValueError( "Parameter `region` not supported with an API endpoint." ) if keyspace: _keyspace = keyspace else: parsed_api_endpoint = parse_api_endpoint(_api_endpoint_p) if parsed_api_endpoint is None: msg = api_endpoint_parsing_error_message(_api_endpoint_p) raise ValueError(msg) _keyspace = ( ( self.database_info( parsed_api_endpoint.database_id, timeout_ms=_database_admin_timeout_ms, ).raw or {} ).get("info") or {} ).get("keyspace", DEFAULT_ASTRA_DB_KEYSPACE) return Database( api_endpoint=_api_endpoint_p, keyspace=_keyspace, api_options=_final_api_options, ) else: # the case where an ID is passed: if _id_p is None: raise ValueError("Either `api_endpoint` or `id` must be supplied.") _region = normalize_region_for_id( database_id=_id_p, environment=self.api_options.environment, region_param=region, request_timeout_ms=_database_admin_timeout_ms, api_options=self.api_options, ) if keyspace: _keyspace = keyspace else: _keyspace = ( ( self.database_info( _id_p, timeout_ms=_database_admin_timeout_ms ).raw or {} ).get("info") or {} ).get("keyspace", DEFAULT_ASTRA_DB_KEYSPACE) return Database( api_endpoint=build_api_endpoint( environment=self.api_options.environment, database_id=_id_p, region=_region, ), keyspace=_keyspace, api_options=_final_api_options, )
def get_database_admin(self, api_endpoint_or_id: str | None = None, *, api_endpoint: str | None = None, id: str | None = None, region: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AstraDBDatabaseAdmin
-
Create an AstraDBDatabaseAdmin object for admin work within a certain database.
Args
api_endpoint_or_id
- positional parameter that can stand for both api_endpoint and id. Passing them together is an error.
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance.
id
- the target database ID. This is an alternative to using the API Endpoint.
region
- the region to use for connecting to the database. The database must be located in that region. This parameter can be used only if the database is specified by its ID (instead of API Endpoint). If this parameter is not passed, and cannot be inferred from the API endpoint, an additional DevOps API request is made to determine the default region and use it subsequently.
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
token
- if supplied, is passed to the Database instead of the one set for this object. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
An AstraDBDatabaseAdmin instance representing the requested database.
Example
>>> my_db_admin = my_astra_db_admin.get_database_admin("01234567-...")
>>> my_db_admin.list_keyspaces()
['default_keyspace']
>>> my_db_admin.create_keyspace("that_other_one")
>>> my_db_admin.list_keyspaces()
['default_keyspace', 'that_other_one']
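As described for spawn_api_options above, the spawned database admin can receive overridden API Options. A minimal sketch using a different token (an APIOptions instance accepts further settings, e.g. timeouts, in the same way):
>>> from astrapy.api_options import APIOptions
>>> my_db_admin = my_astra_db_admin.get_database_admin(
...     "01234567-...",
...     spawn_api_options=APIOptions(token="AstraCS:other..."),
... )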
Note
This method does not perform any admin-level operation through the DevOps API. For actual creation of a database, see the create_database method.
Expand source code
def get_database_admin( self, api_endpoint_or_id: str | None = None, *, api_endpoint: str | None = None, id: str | None = None, region: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AstraDBDatabaseAdmin: """ Create an AstraDBDatabaseAdmin object for admin work within a certain database. Args: api_endpoint_or_id: positional parameter that can stand for both `api_endpoint` and `id`. Passing them together is an error. api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. id: the target database ID. This is alternative to using the API Endpoint. region: the region to use for connecting to the database. The database must be located in that region. This parameter can be used only if the database is specified by its ID (instead of API Endpoint). If this parameter is not passed, and cannot be inferred from the API endpoint, an additional DevOps API request is made to determine the default region and use it subsequently. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. token: if supplied, is passed to the Database instead of the one set for this object. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: An AstraDBDatabaseAdmin instance representing the requested database. Example: >>> my_db_admin = my_astra_db_admin.get_database_admin("01234567-...") >>> my_db_admin.list_keyspaces() ['default_keyspace'] >>> my_db_admin.create_keyspace("that_other_one") >>> my_db_admin.list_keyspaces() ['default_keyspace', 'that_other_one'] Note: This method does not perform any admin-level operation through the DevOps API. For actual creation of a database, see the `create_database` method. """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _api_endpoint_p, _id_p = check_id_endpoint_parg_kwargs( p_arg=api_endpoint_or_id, api_endpoint=api_endpoint, id=id ) _final_api_options = self.api_options.with_override( spawn_api_options ).with_override(APIOptions(token=token)) # handle the "endpoint passed as id" case first: if _api_endpoint_p is not None: if region is not None: raise ValueError( "Parameter `region` not supported with an API endpoint." 
) return AstraDBDatabaseAdmin.from_astra_db_admin( api_endpoint=_api_endpoint_p, astra_db_admin=self, spawn_api_options=_final_api_options, ) else: if _id_p is None: raise ValueError("Either `api_endpoint` or `id` must be supplied.") _region = normalize_region_for_id( database_id=_id_p, environment=self.api_options.environment, region_param=region, request_timeout_ms=_database_admin_timeout_ms, api_options=self.api_options, ) return AstraDBDatabaseAdmin.from_astra_db_admin( api_endpoint=build_api_endpoint( environment=self.api_options.environment, database_id=_id_p, region=_region, ), astra_db_admin=self, spawn_api_options=_final_api_options, )
def list_databases(self, *, include: str | None = None, provider: str | None = None, page_size: int | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[AstraDBAdminDatabaseInfo]
-
Get the list of databases, as obtained with a request to the DevOps API.
Args
include
- a filter on what databases are to be returned. As per DevOps API, defaults to "nonterminated". Pass "all" to include the already terminated databases.
provider
- a filter on the cloud provider for the databases. As per DevOps API, defaults to "ALL". Pass e.g. "AWS" to restrict the results.
page_size
- number of results per page from the DevOps API.
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (While in the case of very many databases this method may entail multiple DevOps API requests, it is assumed here that this method amounts almost always to one single request: the only timeout imposed on this method execution is one acting on each individual request, with no checks on its overall completion time.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
A list of AstraDBAdminDatabaseInfo objects.
Example
>>> database_list = my_astra_db_admin.list_databases()
>>> len(database_list)
3
>>> database_list[2].id
'01234567-...'
>>> database_list[2].status
'ACTIVE'
>>> database_list[2].info.region
'eu-west-1'
Expand source code
def list_databases( self, *, include: str | None = None, provider: str | None = None, page_size: int | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[AstraDBAdminDatabaseInfo]: """ Get the list of databases, as obtained with a request to the DevOps API. Args: include: a filter on what databases are to be returned. As per DevOps API, defaults to "nonterminated". Pass "all" to include the already terminated databases. provider: a filter on the cloud provider for the databases. As per DevOps API, defaults to "ALL". Pass e.g. "AWS" to restrict the results. page_size: number of results per page from the DevOps API. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (While in the case of very many databases this method may entail multiple DevOps API requests, it is assumed here that this method amounts almost always to one single request: the only timeout imposed on this method execution is one acting on each individual request, with no checks on its overall completion time.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A list of AstraDBAdminDatabaseInfo objects. Example: >>> database_list = my_astra_db_admin.list_databases() >>> len(database_list) 3 >>> database_list[2].id '01234567-...' >>> database_list[2].status 'ACTIVE' >>> database_list[2].info.region 'eu-west-1' """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return self._list_databases_ctx( include=include, provider=provider, page_size=page_size, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), )
def with_options(self, *, token: str | TokenProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> AstraDBAdmin
-
Create a clone of this AstraDBAdmin with some changed attributes.
Args
token
- an Access Token to the database. Example: "AstraCS:xyz...". This can be either a literal token string or a subclass of TokenProvider.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new AstraDBAdmin instance.
Example
>>> different_auth_astra_db_admin = my_astra_db_admin.with_options(
...     token="AstraCS:xyz...",
... )
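Equivalently, the clone's settings can be supplied through an APIOptions instance; as noted above, a named parameter passed alongside it takes precedence. A minimal sketch:
>>> from astrapy.api_options import APIOptions
>>> cloned_admin = my_astra_db_admin.with_options(
...     api_options=APIOptions(token="AstraCS:xyz..."),
... )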
Expand source code
def with_options( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AstraDBAdmin: """ Create a clone of this AstraDBAdmin with some changed attributes. Args: token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new AstraDBAdmin instance. Example: >>> different_auth_astra_db_admin = my_astra_db_admin.with_options( ... token="AstraCS:xyz...", ... ) """ return self._copy( token=token, api_options=api_options, )
class AstraDBDatabaseAdmin (*, api_endpoint: str, api_options: FullAPIOptions, spawner_database: Database | AsyncDatabase | None = None, spawner_astra_db_admin: AstraDBAdmin | None = None)
-
An "admin" object, able to perform administrative tasks at the keyspaces level (i.e. within a certain database), such as creating/listing/dropping keyspaces.
This is one layer below the AstraDBAdmin concept, in that it is tied to a single database and enables admin work within it.
This class is not meant for direct instantiation by the user; rather, it is obtained by invoking methods such as get_database_admin of AstraDBAdmin.
Args
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance.
api_options
- a complete specification of the API Options for this instance.
spawner_database
- either a Database or an AsyncDatabase instance. This represents the database class which spawns this admin object, so that, if required, a keyspace creation can retroactively "use" the new keyspace in the spawner. Used to enable the Async/Database.get_admin_database().create_keyspace() pattern.
spawner_astra_db_admin
- an AstraDBAdmin instance. This, if provided, is the instance that spawned this Database Admin and is used to delegate operations such as drop, get_database and so on. If not passed, a new one is created automatically.
Example
>>> from astrapy import DataAPIClient
>>> my_client = DataAPIClient("AstraCS:...")
>>> admin_for_my_db = my_client.get_admin().get_database_admin(
...     "https://<ID>-<REGION>.apps.astra.datastax.com"
... )
>>> admin_for_my_db.list_keyspaces()
['default_keyspace', 'staging_keyspace']
>>> admin_for_my_db.info().status
'ACTIVE'
Note
creating an instance of AstraDBDatabaseAdmin does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class.
Note
a more powerful token may be required than the one sufficient for working in the Database, Collection and Table classes. Check the provided token if "Unauthorized" errors are encountered.
Expand source code
class AstraDBDatabaseAdmin(DatabaseAdmin): """ An "admin" object, able to perform administrative tasks at the keyspaces level (i.e. within a certain database), such as creating/listing/dropping keyspaces. This is one layer below the AstraDBAdmin concept, in that it is tied to a single database and enables admin work within it. This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as `get_database_admin` of AstraDBAdmin. Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. api_options: a complete specification of the API Options for this instance. spawner_database: either a Database or an AsyncDatabase instance. This represents the database class which spawns this admin object, so that, if required, a keyspace creation can retroactively "use" the new keyspace in the spawner. Used to enable the Async/Database.get_admin_database().create_keyspace() pattern. spawner_astra_db_admin: an AstraDBAdmin instance. This, if provided, is the instance that spawned this Database Admin and is used to delegate operations such as drop, get_database and so on. If not passed, a new one is created automatically. Example: >>> from astrapy import DataAPIClient >>> my_client = DataAPIClient("AstraCS:...") >>> admin_for_my_db = my_client.get_admin().get_database_admin( ... "https://<ID>-<REGION>.apps.astra.datastax.com" ... ) >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'staging_keyspace'] >>> admin_for_my_db.info().status 'ACTIVE' Note: creating an instance of AstraDBDatabaseAdmin does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class. Note: a more powerful token may be required than the one sufficient for working in the Database, Collection and Table classes. Check the provided token if "Unauthorized" errors are encountered. """ def __init__( self, *, api_endpoint: str, api_options: FullAPIOptions, spawner_database: Database | AsyncDatabase | None = None, spawner_astra_db_admin: AstraDBAdmin | None = None, ) -> None: # lazy import here to avoid circular dependency from astrapy.database import Database if api_options.environment not in Environment.astra_db_values: raise InvalidEnvironmentException( "Environments outside of Astra DB are not supported." ) self.api_options = api_options self.api_endpoint = api_endpoint parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is None: msg = api_endpoint_parsing_error_message(self.api_endpoint) raise ValueError(msg) self._database_id = parsed_api_endpoint.database_id self._region = parsed_api_endpoint.region if parsed_api_endpoint.environment != self.api_options.environment: raise InvalidEnvironmentException( "Environment mismatch between client and provided " "API endpoint. You can try adding " f'`environment="{parsed_api_endpoint.environment}"` ' "to the class constructor." 
) if spawner_database is not None: self.spawner_database = spawner_database else: # leaving the keyspace to its per-environment default # (a task for the Database) self.spawner_database = Database( api_endpoint=self.api_endpoint, keyspace=None, api_options=self.api_options, ) # API-commander-specific init (for the vectorizeOps invocations) # even if Data API, this is admin and must use the Admin additional headers: self._commander_headers = { DEFAULT_DATA_API_AUTH_HEADER: self.api_options.token.get_token(), **self.api_options.admin_additional_headers, } self._api_commander = self._get_api_commander() # DevOps-API-commander specific init (keyspace CRUD, etc) self._dev_ops_commander_headers: dict[str, str | None] if self.api_options.token: _token = self.api_options.token.get_token() self._dev_ops_commander_headers = { DEFAULT_DEV_OPS_AUTH_HEADER: f"{DEFAULT_DEV_OPS_AUTH_PREFIX}{_token}", **self.api_options.admin_additional_headers, } else: self._dev_ops_commander_headers = { **self.api_options.admin_additional_headers, } self._dev_ops_api_commander = self._get_dev_ops_api_commander() # this class keeps a reference to the AstraDBAdmin associated to this org: if spawner_astra_db_admin is None: self._astra_db_admin = AstraDBAdmin(api_options=self.api_options) else: self._astra_db_admin = spawner_astra_db_admin def __repr__(self) -> str: parts = [ f'api_endpoint="{self.api_endpoint}"', f"api_options={self.api_options}", ] return f"{self.__class__.__name__}({', '.join(parts)})" def __eq__(self, other: Any) -> bool: if isinstance(other, AstraDBDatabaseAdmin): return all( [ self.api_endpoint == other.api_endpoint, self.api_options == other.api_options, ] ) else: return False def _get_api_commander(self) -> APICommander: """Instantiate a new APICommander for Data API calls.""" base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.data_api_url_options.api_path, self.api_options.data_api_url_options.api_version, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" api_commander = APICommander( api_endpoint=self.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, ) return api_commander def _get_dev_ops_api_commander(self) -> APICommander: """Instantiate a new APICommander for DevOps calls.""" base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.dev_ops_api_url_options.dev_ops_api_version, "databases", self._database_id, ) if ncomp is not None ) if comp != "" ] dev_ops_base_path = "/".join(base_path_components) dev_ops_commander = APICommander( api_endpoint=self.api_options.dev_ops_api_url_options.dev_ops_url, path=dev_ops_base_path, headers=self._dev_ops_commander_headers, callers=self.api_options.callers, dev_ops_api=True, redacted_header_names=self.api_options.redacted_header_names, ) return dev_ops_commander def _copy( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AstraDBDatabaseAdmin: arg_api_options = APIOptions( token=token, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AstraDBDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=final_api_options, spawner_database=self.spawner_database, spawner_astra_db_admin=self._astra_db_admin, ) def with_options( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = 
_UNSET, ) -> AstraDBDatabaseAdmin: """ Create a clone of this AstraDBDatabaseAdmin with some changed attributes. Args: token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new AstraDBDatabaseAdmin instance. Example: >>> admin_for_my_other_db = admin_for_my_db.with_options( ... token="AstraCS:xyz...", ... ) """ return self._copy( token=token, api_options=api_options, ) @property def id(self) -> str: """ The ID of this database admin. Example: >>> my_db_admin.id '01234567-89ab-cdef-0123-456789abcdef' """ return self._database_id @property def region(self) -> str: """ The region for this database admin. Example: >>> my_db_admin.region 'us-east-1' """ return self._region @staticmethod def from_astra_db_admin( api_endpoint: str, *, astra_db_admin: AstraDBAdmin, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AstraDBDatabaseAdmin: """ Create an AstraDBDatabaseAdmin from an AstraDBAdmin and an API Endpoint. Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. astra_db_admin: an AstraDBAdmin object that has visibility over the target database. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database, e.g. concerning timeouts. Returns: An AstraDBDatabaseAdmin object, for admin work within the database. Example: >>> from astrapy import DataAPIClient, AstraDBDatabaseAdmin >>> admin_for_my_db = AstraDBDatabaseAdmin.from_astra_db_admin( ... "https://<ID>-<REGION>.apps.astra.datastax.com", ... astra_db_admin=DataAPIClient("AstraCS:...").get_admin(), ... ) >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'staging_keyspace'] >>> admin_for_my_db.info().status 'ACTIVE' Note: Creating an instance of AstraDBDatabaseAdmin does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class. """ return AstraDBDatabaseAdmin( api_endpoint=api_endpoint, api_options=astra_db_admin.api_options.with_override(spawn_api_options), spawner_astra_db_admin=astra_db_admin, ) def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBAdminDatabaseInfo: """ Query the DevOps API for the full info on this database. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: An AstraDBAdminDatabaseInfo object.
Example: >>> my_db_info = admin_for_my_db.info() >>> my_db_info.status 'ACTIVE' >>> my_db_info.info.region 'us-east1' """ logger.info(f"getting info ('{self._database_id}')") req_response = self._astra_db_admin.database_info( id=self._database_id, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished getting info ('{self._database_id}')") return req_response async def async_info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBAdminDatabaseInfo: """ Query the DevOps API for the full info on this database. Async version of the method, for use in an asyncio context. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: An AstraDBAdminDatabaseInfo object. Example: >>> async def wait_until_active(db_admin: AstraDBDatabaseAdmin) -> None: ... while True: ... info = await db_admin.async_info() ... if info.status == "ACTIVE": ... return ... >>> asyncio.run(wait_until_active(admin_for_my_db)) """ logger.info(f"getting info ('{self._database_id}'), async") req_response = await self._astra_db_admin.async_database_info( id=self._database_id, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished getting info ('{self._database_id}'), async") return req_response def list_keyspaces( self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ Query the DevOps API for a list of the keyspaces in the database. Args: keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Returns: A list of the keyspaces, each a string, in no particular order. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'staging_keyspace'] """ logger.info(f"getting keyspaces ('{self._database_id}')") info = self.info( database_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished getting keyspaces ('{self._database_id}')") if info.raw is None: raise DevOpsAPIException("Could not get the keyspace list.") else: return info.raw.get("info", {}).get("keyspaces") or [] async def async_list_keyspaces( self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ Query the DevOps API for a list of the keyspaces in the database. Async version of the method, for use in an asyncio context. Args: keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. 
timeout_ms: an alias for `keyspace_admin_timeout_ms`. Returns: A list of the keyspaces, each a string, in no particular order. Example: >>> async def check_if_ks_exists( ... db_admin: AstraDBDatabaseAdmin, keyspace: str ... ) -> bool: ... ks_list = await db_admin.async_list_keyspaces() ... return keyspace in ks_list ... >>> asyncio.run(check_if_ks_exists(admin_for_my_db, "dragons")) False >>> asyncio.run(check_if_ks_exists(admin_for_my_db, "app_keyspace")) True """ logger.info(f"getting keyspaces ('{self._database_id}'), async") info = await self.async_info( database_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished getting keyspaces ('{self._database_id}'), async") if info.raw is None: raise DevOpsAPIException("Could not get the keyspace list.") else: return info.raw.get("info", {}).get("keyspaces") or [] def create_keyspace( self, name: str, *, wait_until_active: bool = True, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any, ) -> None: """ Create a keyspace in this database as requested, optionally waiting for it to be ready. Args: name: the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op. wait_until_active: if True (default), the method returns only after the target database is in ACTIVE state again (a few seconds, usually). If False, it will return right after issuing the creation request to the DevOps API, and it will be responsibility of the caller to check the database status/keyspace availability before working with it. update_db_keyspace: if True, the `Database` or `AsyncDatabase` class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns. keyspace_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the database during keyspace creation. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `keyspace_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored.
Example: >>> my_db_admin.list_keyspaces() ['default_keyspace'] >>> my_db_admin.create_keyspace("that_other_one") >>> my_db_admin.list_keyspaces() ['default_keyspace', 'that_other_one'] """ _keyspace_admin_timeout_ms, _ka_label = _first_valid_timeout( (keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_keyspace_admin_timeout_ms, dev_ops_api=True, timeout_label=_ka_label, ) logger.info( f"creating keyspace '{name}' on " f"'{self._database_id}' (DevOps API)" ) cn_raw_response = self._dev_ops_api_commander.raw_request( http_method=HttpMethod.POST, additional_path=f"keyspaces/{name}", timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if cn_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_CREATED: raise DevOpsAPIException( f"keyspace creation ('{name}') failed: API returned HTTP " f"{cn_raw_response.status_code} instead of " f"{DEV_OPS_RESPONSE_HTTP_CREATED} - Created." ) logger.info( "DevOps API returned from creating keyspace " f"'{name}' on '{self._database_id}'" ) if wait_until_active: last_status_seen = DEV_OPS_DATABASE_STATUS_MAINTENANCE while last_status_seen == DEV_OPS_DATABASE_STATUS_MAINTENANCE: logger.info(f"sleeping to poll for status of '{self._database_id}'") time.sleep(DEV_OPS_KEYSPACE_POLL_INTERVAL_S) last_status_seen = self._astra_db_admin._database_info_ctx( id=self._database_id, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ).status if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE: raise DevOpsAPIException( f"Database entered unexpected status {last_status_seen} after MAINTENANCE." ) # is the keyspace found? if name not in self.list_keyspaces(): raise DevOpsAPIException("Could not create the keyspace.") logger.info( f"finished creating keyspace '{name}' on " f"'{self._database_id}' (DevOps API)" ) if update_db_keyspace: self.spawner_database.use_keyspace(name) async def async_create_keyspace( self, name: str, *, wait_until_active: bool = True, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any, ) -> None: """ Create a keyspace in this database as requested, optionally waiting for it to be ready. Async version of the method, for use in an asyncio context. Args: name: the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op. wait_until_active: if True (default), the method returns only after the target database is in ACTIVE state again (a few seconds, usually). If False, it will return right after issuing the creation request to the DevOps API, and it will be responsibility of the caller to check the database status/keyspace availability before working with it. update_db_keyspace: if True, the `Database` or `AsyncDatabase` class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns.
keyspace_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the database during keyspace creation. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `keyspace_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored. Example: >>> asyncio.run( ... my_db_admin.async_create_keyspace("app_keyspace") ... ) """ _keyspace_admin_timeout_ms, _ka_label = _first_valid_timeout( (keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_keyspace_admin_timeout_ms, dev_ops_api=True, timeout_label=_ka_label, ) logger.info( f"creating keyspace '{name}' on " f"'{self._database_id}' (DevOps API), async" ) cn_raw_response = await self._dev_ops_api_commander.async_raw_request( http_method=HttpMethod.POST, additional_path=f"keyspaces/{name}", timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if cn_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_CREATED: raise DevOpsAPIException( f"keyspace creation ('{name}') failed: API returned HTTP " f"{cn_raw_response.status_code} instead of " f"{DEV_OPS_RESPONSE_HTTP_CREATED} - Created." ) logger.info( f"DevOps API returned from creating keyspace " f"'{name}' on '{self._database_id}', async" ) if wait_until_active: last_status_seen = DEV_OPS_DATABASE_STATUS_MAINTENANCE while last_status_seen == DEV_OPS_DATABASE_STATUS_MAINTENANCE: logger.info( f"sleeping to poll for status of '{self._database_id}', async" ) await asyncio.sleep(DEV_OPS_KEYSPACE_POLL_INTERVAL_S) last_db_info = await self._astra_db_admin._async_database_info_ctx( id=self._database_id, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) last_status_seen = last_db_info.status if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE: raise DevOpsAPIException( f"Database entered unexpected status {last_status_seen} after MAINTENANCE." ) # is the keyspace found? if name not in await self.async_list_keyspaces(): raise DevOpsAPIException("Could not create the keyspace.") logger.info( f"finished creating keyspace '{name}' on " f"'{self._database_id}' (DevOps API), async" ) if update_db_keyspace: self.spawner_database.use_keyspace(name) def drop_keyspace( self, name: str, *, wait_until_active: bool = True, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Delete a keyspace from the database, optionally waiting for the database to become active again. Args: name: the keyspace to delete. If it does not exist in this database, an error is raised. 
wait_until_active: if True (default), the method returns only after the target database is in ACTIVE state again (a few seconds, usually). If False, it will return right after issuing the deletion request to the DevOps API, and it will be responsibility of the caller to check the database status/keyspace availability before working with it. keyspace_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the database during keyspace deletion. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `keyspace_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> my_db_admin.list_keyspaces() ['default_keyspace', 'that_other_one'] >>> my_db_admin.drop_keyspace("that_other_one") >>> my_db_admin.list_keyspaces() ['default_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _first_valid_timeout( (keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_keyspace_admin_timeout_ms, dev_ops_api=True, timeout_label=_ka_label, ) logger.info( f"dropping keyspace '{name}' on " f"'{self._database_id}' (DevOps API)" ) dk_raw_response = self._dev_ops_api_commander.raw_request( http_method=HttpMethod.DELETE, additional_path=f"keyspaces/{name}", timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if dk_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_ACCEPTED: raise DevOpsAPIException( f"keyspace deletion ('{name}') failed: API returned HTTP " f"{dk_raw_response.status_code} instead of " f"{DEV_OPS_RESPONSE_HTTP_ACCEPTED} - Accepted." ) logger.info( "DevOps API returned from dropping keyspace " f"'{name}' on '{self._database_id}'" ) if wait_until_active: last_status_seen = DEV_OPS_DATABASE_STATUS_MAINTENANCE while last_status_seen == DEV_OPS_DATABASE_STATUS_MAINTENANCE: logger.info(f"sleeping to poll for status of '{self._database_id}'") time.sleep(DEV_OPS_KEYSPACE_POLL_INTERVAL_S) last_status_seen = self._astra_db_admin._database_info_ctx( id=self._database_id, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ).status if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE: raise DevOpsAPIException( f"Database entered unexpected status {last_status_seen} after MAINTENANCE." ) # is the keyspace found?
if name in self.list_keyspaces(): raise DevOpsAPIException("Could not drop the keyspace.") logger.info( f"finished dropping keyspace '{name}' on " f"'{self._database_id}' (DevOps API)" ) async def async_drop_keyspace( self, name: str, *, wait_until_active: bool = True, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Delete a keyspace from the database, optionally waiting for the database to become active again. Async version of the method, for use in an asyncio context. Args: name: the keyspace to delete. If it does not exist in this database, an error is raised. wait_until_active: if True (default), the method returns only after the target database is in ACTIVE state again (a few seconds, usually). If False, it will return right after issuing the deletion request to the DevOps API, and it will be responsibility of the caller to check the database status/keyspace availability before working with it. keyspace_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the database during keyspace deletion. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `keyspace_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> asyncio.run( ... my_db_admin.async_drop_keyspace("app_keyspace") ... 
) """ _keyspace_admin_timeout_ms, _ka_label = _first_valid_timeout( (keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_keyspace_admin_timeout_ms, dev_ops_api=True, timeout_label=_ka_label, ) logger.info( f"dropping keyspace '{name}' on " f"'{self._database_id}' (DevOps API), async" ) dk_raw_response = await self._dev_ops_api_commander.async_raw_request( http_method=HttpMethod.DELETE, additional_path=f"keyspaces/{name}", timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) if dk_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_ACCEPTED: raise DevOpsAPIException( f"keyspace deletion ('{id}') failed: API returned HTTP " f"{dk_raw_response.status_code} instead of " f"{DEV_OPS_RESPONSE_HTTP_ACCEPTED} - Created" ) logger.info( f"DevOps API returned from dropping keyspace " f"'{name}' on '{self._database_id}', async" ) if wait_until_active: last_status_seen = DEV_OPS_DATABASE_STATUS_MAINTENANCE while last_status_seen == DEV_OPS_DATABASE_STATUS_MAINTENANCE: logger.info( f"sleeping to poll for status of '{self._database_id}', async" ) await asyncio.sleep(DEV_OPS_KEYSPACE_POLL_INTERVAL_S) last_db_info = await self._astra_db_admin._async_database_info_ctx( id=self._database_id, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) last_status_seen = last_db_info.status if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE: raise DevOpsAPIException( f"Database entered unexpected status {last_status_seen} after MAINTENANCE." ) # is the keyspace found? if name in await self.async_list_keyspaces(): raise DevOpsAPIException("Could not drop the keyspace.") logger.info( f"finished dropping keyspace '{name}' on " f"'{self._database_id}' (DevOps API), async" ) def drop( self, *, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop this database, i.e. delete it completely and permanently with all its data. This method wraps the `drop_database` method of the AstraDBAdmin class, where more information may be found. Args: wait_until_active: if True (default), the method returns only after the database has actually been deleted (generally a few minutes). If False, it will return right after issuing the drop request to the DevOps API, and it will be responsibility of the caller to check the database status/availability after that, if desired. database_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-deleted database. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `database_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. 
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> my_db_admin.list_keyspaces() ['default_keyspace', 'that_other_one'] >>> my_db_admin.drop() >>> my_db_admin.list_keyspaces() # raises a 404 Not Found http error Note: Once the method succeeds, methods on this object -- such as `info()`, or `list_keyspaces()` -- can still be invoked: however, this hardly makes sense as the underlying actual database is no more. It is responsibility of the developer to design a correct flow which avoids using a deceased database any further. """ logger.info(f"dropping this database ('{self._database_id}')") self._astra_db_admin.drop_database( id=self._database_id, wait_until_active=wait_until_active, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished dropping this database ('{self._database_id}')") async def async_drop( self, *, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop this database, i.e. delete it completely and permanently with all its data. Async version of the method, for use in an asyncio context. This method wraps the `drop_database` method of the AstraDBAdmin class, where more information may be found. Args: wait_until_active: if True (default), the method returns only after the database has actually been deleted (generally a few minutes). If False, it will return right after issuing the drop request to the DevOps API, and it will be responsibility of the caller to check the database status/availability after that, if desired. database_admin_timeout_ms: a timeout, in milliseconds, for the whole requested operation to complete. This is used only if `wait_until_active` is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-deleted database. request_timeout_ms: a timeout, in milliseconds, for each underlying DevOps API HTTP request. timeout_ms: an alias for *both* the `request_timeout_ms` and `database_admin_timeout_ms` timeout parameters. In practice, regardless of `wait_until_active`, this parameter dictates an overall timeout on this method call. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> asyncio.run(my_db_admin.async_drop()) Note: Once the method succeeds, methods on this object -- such as `info()`, or `list_keyspaces()` -- can still be invoked: however, this hardly makes sense as the underlying actual database is no more. It is responsibility of the developer to design a correct flow which avoids using a deceased database any further.
""" logger.info(f"dropping this database ('{self._database_id}'), async") return await self._astra_db_admin.async_drop_database( id=self._database_id, wait_until_active=wait_until_active, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished dropping this database ('{self._database_id}'), async") def get_database( self, *, keyspace: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a Database instance from this database admin, for data-related tasks. Args: keyspace: an optional keyspace to set in the resulting Database. The same default logic as for `AstraDBAdmin.get_database` applies. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. token: if supplied, is passed to the Database instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: A Database object, ready to be used for working with data and collections. Example: >>> my_db = my_db_admin.get_database() >>> my_db.list_collection_names() ['movies', 'another_collection'] Note: creating an instance of Database does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class. """ return self._astra_db_admin.get_database( api_endpoint=self.api_endpoint, token=token, keyspace=keyspace, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, spawn_api_options=spawn_api_options, ) def get_async_database( self, *, keyspace: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create an AsyncDatabase instance from this database admin, for data-related tasks. Args: keyspace: an optional keyspace to set in the resulting AsyncDatabase. The same default logic as for `AstraDBAdmin.get_database` applies. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. 
token: if supplied, is passed to the AsyncDatabase instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: An AsyncDatabase object, ready to be used for working with data and collections. """ return self._astra_db_admin.get_database( api_endpoint=self.api_endpoint, token=token, keyspace=keyspace, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, spawn_api_options=spawn_api_options, ).to_async() def find_embedding_providers( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> FindEmbeddingProvidersResult: """ Query the API for the full information on available embedding providers. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers Example (output abridged and indented for clarity): >>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... } """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("findEmbeddingProviders") fe_response = self._api_commander.request( payload={"findEmbeddingProviders": {}}, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), ) if "embeddingProviders" not in fe_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findEmbeddingProviders API command.", raw_response=fe_response, ) else: logger.info("finished findEmbeddingProviders") return FindEmbeddingProvidersResult._from_dict(fe_response["status"]) async def async_find_embedding_providers( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> FindEmbeddingProvidersResult: """ Query the API for the full information on available embedding providers. Async version of the method, for use in an asyncio context. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. 
timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers Example (output abridged and indented for clarity): >>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... } """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("findEmbeddingProviders, async") fe_response = await self._api_commander.async_request( payload={"findEmbeddingProviders": {}}, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), ) if "embeddingProviders" not in fe_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findEmbeddingProviders API command.", raw_response=fe_response, ) else: logger.info("finished findEmbeddingProviders, async") return FindEmbeddingProvidersResult._from_dict(fe_response["status"])
Ancestors
- DatabaseAdmin
- abc.ABC
Static methods
def from_astra_db_admin(api_endpoint: str, *, astra_db_admin: AstraDBAdmin, spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AstraDBDatabaseAdmin
-
Create an AstraDBDatabaseAdmin from an AstraDBAdmin and an API Endpoint.
Args
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance.
astra_db_admin
- an AstraDBAdmin object that has visibility over the target database.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the AstraDBAdmin. This allows for a deeper configuration of the database, e.g. concerning timeouts.
Returns
An AstraDBDatabaseAdmin object, for admin work within the database.
Example
>>> from astrapy import DataAPIClient, AstraDBDatabaseAdmin
>>> admin_for_my_db = AstraDBDatabaseAdmin.from_astra_db_admin(
...     "https://<ID>-<REGION>.apps.astra.datastax.com",
...     astra_db_admin=DataAPIClient("AstraCS:...").get_admin(),
... )
>>> admin_for_my_db.list_keyspaces()
['default_keyspace', 'staging_keyspace']
>>> admin_for_my_db.info().status
'ACTIVE'
Note
Creating an instance of AstraDBDatabaseAdmin does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class.
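As a hedged sketch of spawn_api_options (that APIOptions accepts a token override is visible in this class's _copy implementation; the endpoint and tokens below are placeholders):
>>> from astrapy.api_options import APIOptions
>>> admin_for_my_db = AstraDBDatabaseAdmin.from_astra_db_admin(
...     "https://<ID>-<REGION>.apps.astra.datastax.com",
...     astra_db_admin=DataAPIClient("AstraCS:...").get_admin(),
...     spawn_api_options=APIOptions(token="AstraCS:stronger..."),
... )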
Expand source code
@staticmethod
def from_astra_db_admin(
    api_endpoint: str,
    *,
    astra_db_admin: AstraDBAdmin,
    spawn_api_options: APIOptions | UnsetType = _UNSET,
) -> AstraDBDatabaseAdmin:
    """
    Create an AstraDBDatabaseAdmin from an AstraDBAdmin and an API Endpoint.

    Args:
        api_endpoint: the API Endpoint for the target database
            (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`).
            The database must exist already for the resulting object
            to be effectively used; in other words, this invocation
            does not create the database, just the object instance.
        astra_db_admin: an AstraDBAdmin object that has visibility over
            the target database.
        spawn_api_options: a specification - complete or partial - of the
            API Options to override the defaults inherited from the
            AstraDBAdmin. This allows for a deeper configuration of the
            database, e.g. concerning timeouts.

    Returns:
        An AstraDBDatabaseAdmin object, for admin work within the database.

    Example:
        >>> from astrapy import DataAPIClient, AstraDBDatabaseAdmin
        >>> admin_for_my_db = AstraDBDatabaseAdmin.from_astra_db_admin(
        ...     "https://<ID>-<REGION>.apps.astra.datastax.com",
        ...     astra_db_admin=DataAPIClient("AstraCS:...").get_admin(),
        ... )
        >>> admin_for_my_db.list_keyspaces()
        ['default_keyspace', 'staging_keyspace']
        >>> admin_for_my_db.info().status
        'ACTIVE'

    Note:
        Creating an instance of AstraDBDatabaseAdmin does not trigger
        actual creation of the database itself, which should exist
        beforehand. To create databases, see the AstraDBAdmin class.
    """
    return AstraDBDatabaseAdmin(
        api_endpoint=api_endpoint,
        api_options=astra_db_admin.api_options.with_override(spawn_api_options),
        spawner_astra_db_admin=astra_db_admin,
    )
Instance variables
var id : str
-
The ID of this database admin.
Example
>>> my_db_admin.id
'01234567-89ab-cdef-0123-456789abcdef'
Expand source code
@property
def id(self) -> str:
    """
    The ID of this database admin.

    Example:
        >>> my_db_admin.id
        '01234567-89ab-cdef-0123-456789abcdef'
    """
    return self._database_id
var region : str
-
The region for this database admin.
Example
>>> my_db_admin.region
'us-east-1'
Expand source code
@property
def region(self) -> str:
    """
    The region for this database admin.

    Example:
        >>> my_db_admin.region
        'us-east-1'
    """
    return self._region
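Taken together, the two properties reflect the endpoint this admin was built from; an illustrative sketch (assuming a standard production Astra DB endpoint of the form shown elsewhere on this page):
>>> f"https://{my_db_admin.id}-{my_db_admin.region}.apps.astra.datastax.com"
'https://01234567-89ab-cdef-0123-456789abcdef-us-east-1.apps.astra.datastax.com'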
Methods
async def async_create_keyspace(self, name: str, *, wait_until_active: bool = True, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any) ‑> None
-
Create a keyspace in this database as requested, optionally waiting for it to be ready. Async version of the method, for use in an asyncio context.
Args
name
- the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op.
wait_until_active
- if True (default), the method returns only after the target database is in ACTIVE state again (a few seconds, usually). If False, it will return right after issuing the creation request to the DevOps API, and it will be responsibility of the caller to check the database status/keyspace availability before working with it.
update_db_keyspace
- if True, the Database or AsyncDatabase class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns.
keyspace_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the database during keyspace creation.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and keyspace_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored.
Example
>>> asyncio.run(
...     my_db_admin.async_create_keyspace("app_keyspace")
... )
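The update_db_keyspace parameter enables the spawner-database pattern described in the class docstring; a hedged sketch (assuming async_db is the AsyncDatabase that spawns the admin via its get_database_admin method):
>>> async def create_and_use(async_db) -> None:
...     db_admin = async_db.get_database_admin()
...     # after this call returns, async_db targets the new keyspace
...     await db_admin.async_create_keyspace(
...         "app_keyspace",
...         update_db_keyspace=True,
...     )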
Expand source code
async def async_create_keyspace(
    self,
    name: str,
    *,
    wait_until_active: bool = True,
    update_db_keyspace: bool | None = None,
    keyspace_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
    **kwargs: Any,
) -> None:
    """
    Create a keyspace in this database as requested,
    optionally waiting for it to be ready.
    Async version of the method, for use in an asyncio context.

    Args:
        name: the keyspace name. If supplying a keyspace that exists
            already, the method call proceeds as usual, no errors are
            raised, and the whole invocation is a no-op.
        wait_until_active: if True (default), the method returns only after
            the target database is in ACTIVE state again (a few
            seconds, usually). If False, it will return right after issuing
            the creation request to the DevOps API, and it will be
            responsibility of the caller to check the database
            status/keyspace availability before working with it.
        update_db_keyspace: if True, the `Database` or `AsyncDatabase` class
            that spawned this DatabaseAdmin, if any, gets updated to work on
            the newly-created keyspace starting when this method returns.
        keyspace_admin_timeout_ms: a timeout, in milliseconds, for the whole
            requested operation to complete. This is used only
            if `wait_until_active` is true, i.e. if the method call must
            wait and keep querying the DevOps API for the status of the
            database during keyspace creation.
        request_timeout_ms: a timeout, in milliseconds, for each underlying
            DevOps API HTTP request.
        timeout_ms: an alias for *both* the `request_timeout_ms` and
            `keyspace_admin_timeout_ms` timeout parameters. In practice,
            regardless of `wait_until_active`, this parameter dictates an
            overall timeout on this method call.

    Note: a timeout event is no guarantee at all that the
    creation request has not reached the API server and is not going
    to be, in fact, honored.

    Example:
        >>> asyncio.run(
        ...     my_db_admin.async_create_keyspace("app_keyspace")
        ... )
    """
    _keyspace_admin_timeout_ms, _ka_label = _first_valid_timeout(
        (keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (
            self.api_options.timeout_options.keyspace_admin_timeout_ms,
            "keyspace_admin_timeout_ms",
        ),
    )
    _request_timeout_ms, _rt_label = _first_valid_timeout(
        (request_timeout_ms, "request_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
    )
    timeout_manager = MultiCallTimeoutManager(
        overall_timeout_ms=_keyspace_admin_timeout_ms,
        dev_ops_api=True,
        timeout_label=_ka_label,
    )
    logger.info(
        f"creating keyspace '{name}' on "
        f"'{self._database_id}' (DevOps API), async"
    )
    cn_raw_response = await self._dev_ops_api_commander.async_raw_request(
        http_method=HttpMethod.POST,
        additional_path=f"keyspaces/{name}",
        timeout_context=timeout_manager.remaining_timeout(
            cap_time_ms=_request_timeout_ms,
            cap_timeout_label=_rt_label,
        ),
    )
    if cn_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_CREATED:
        raise DevOpsAPIException(
            f"keyspace creation ('{name}') failed: API returned HTTP "
            f"{cn_raw_response.status_code} instead of "
            f"{DEV_OPS_RESPONSE_HTTP_CREATED} - Created."
        )
    logger.info(
        f"DevOps API returned from creating keyspace "
        f"'{name}' on '{self._database_id}', async"
    )
    if wait_until_active:
        last_status_seen = DEV_OPS_DATABASE_STATUS_MAINTENANCE
        while last_status_seen == DEV_OPS_DATABASE_STATUS_MAINTENANCE:
            logger.info(
                f"sleeping to poll for status of '{self._database_id}', async"
            )
            await asyncio.sleep(DEV_OPS_KEYSPACE_POLL_INTERVAL_S)
            last_db_info = await self._astra_db_admin._async_database_info_ctx(
                id=self._database_id,
                timeout_context=timeout_manager.remaining_timeout(
                    cap_time_ms=_request_timeout_ms,
                    cap_timeout_label=_rt_label,
                ),
            )
            last_status_seen = last_db_info.status
        if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE:
            raise DevOpsAPIException(
                f"Database entered unexpected status {last_status_seen} after MAINTENANCE."
            )
        # is the keyspace found?
        if name not in await self.async_list_keyspaces():
            raise DevOpsAPIException("Could not create the keyspace.")
    logger.info(
        f"finished creating keyspace '{name}' on "
        f"'{self._database_id}' (DevOps API), async"
    )
    if update_db_keyspace:
        self.spawner_database.use_keyspace(name)
async def async_drop(self, *, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop this database, i.e. delete it completely and permanently with all its data. Async version of the method, for use in an asyncio context.
This method wraps the drop_database method of the AstraDBAdmin class, where more information may be found.
Args
wait_until_active
- if True (default), the method returns only after the database has actually been deleted (generally a few minutes). If False, it will return right after issuing the drop request to the DevOps API, and it will be responsibility of the caller to check the database status/availability after that, if desired.
database_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-deleted database.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and database_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored.
Example
>>> asyncio.run(my_db_admin.async_drop())
Note
Once the method succeeds, methods on this object -- such as info(), or list_keyspaces() -- can still be invoked: however, this hardly makes sense as the underlying actual database is no more. It is responsibility of the developer to design a correct flow which avoids using a deceased database any further.
Expand source code
async def async_drop(
    self,
    *,
    wait_until_active: bool = True,
    database_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Drop this database, i.e. delete it completely and permanently with all its data.
    Async version of the method, for use in an asyncio context.

    This method wraps the `drop_database` method of the AstraDBAdmin class,
    where more information may be found.

    Args:
        wait_until_active: if True (default), the method returns only after
            the database has actually been deleted (generally a few minutes).
            If False, it will return right after issuing the
            drop request to the DevOps API, and it will be responsibility
            of the caller to check the database status/availability
            after that, if desired.
        database_admin_timeout_ms: a timeout, in milliseconds, for the whole
            requested operation to complete. This is used only
            if `wait_until_active` is true, i.e. if the method call must
            wait and keep querying the DevOps API for the status of the
            newly-deleted database.
        request_timeout_ms: a timeout, in milliseconds, for each underlying
            DevOps API HTTP request.
        timeout_ms: an alias for *both* the `request_timeout_ms` and
            `database_admin_timeout_ms` timeout parameters. In practice,
            regardless of `wait_until_active`, this parameter dictates an
            overall timeout on this method call.

    Note: a timeout event is no guarantee at all that the
    deletion request has not reached the API server and is not going
    to be, in fact, honored.

    Example:
        >>> asyncio.run(my_db_admin.async_drop())

    Note:
        Once the method succeeds, methods on this object -- such as `info()`,
        or `list_keyspaces()` -- can still be invoked: however, this hardly
        makes sense as the underlying actual database is no more.
        It is responsibility of the developer to design a correct flow
        which avoids using a deceased database any further.
    """
    logger.info(f"dropping this database ('{self._database_id}'), async")
    # await the deletion (the wrapped call returns None), then emit
    # the closing log line once it completes:
    await self._astra_db_admin.async_drop_database(
        id=self._database_id,
        wait_until_active=wait_until_active,
        database_admin_timeout_ms=database_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    logger.info(f"finished dropping this database ('{self._database_id}'), async")
async def async_drop_keyspace(self, name: str, *, wait_until_active: bool = True, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Delete a keyspace from the database, optionally waiting for the database to become active again. Async version of the method, for use in an asyncio context.
Args
name
- the keyspace to delete. If it does not exist in this database, an error is raised.
wait_until_active
- if True (default), the method returns only after the target database is in ACTIVE state again (a few seconds, usually). If False, it will return right after issuing the deletion request to the DevOps API, and it will be responsibility of the caller to check the database status/keyspace availability before working with it.
keyspace_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the database during keyspace deletion.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and keyspace_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored.
Example
>>> asyncio.run(
...     my_db_admin.async_drop_keyspace("app_keyspace")
... )
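With wait_until_active=False, polling becomes the caller's job; a minimal sketch built on async_info (the two-second sleep is an arbitrary choice, not a library constant):
>>> async def drop_without_blocking(db_admin, keyspace: str) -> None:
...     await db_admin.async_drop_keyspace(keyspace, wait_until_active=False)
...     while (await db_admin.async_info()).status != "ACTIVE":
...         await asyncio.sleep(2)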
Expand source code
async def async_drop_keyspace(
    self,
    name: str,
    *,
    wait_until_active: bool = True,
    keyspace_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Delete a keyspace from the database, optionally waiting for the database
    to become active again.
    Async version of the method, for use in an asyncio context.

    Args:
        name: the keyspace to delete. If it does not exist in this database,
            an error is raised.
        wait_until_active: if True (default), the method returns only after
            the target database is in ACTIVE state again (a few seconds,
            usually). If False, it will return right after issuing the
            deletion request to the DevOps API, and it will be responsibility
            of the caller to check the database status/keyspace availability
            before working with it.
        keyspace_admin_timeout_ms: a timeout, in milliseconds, for the whole
            requested operation to complete. This is used only
            if `wait_until_active` is true, i.e. if the method call must
            wait and keep querying the DevOps API for the status of the
            database during keyspace deletion.
        request_timeout_ms: a timeout, in milliseconds, for each underlying
            DevOps API HTTP request.
        timeout_ms: an alias for *both* the `request_timeout_ms` and
            `keyspace_admin_timeout_ms` timeout parameters. In practice,
            regardless of `wait_until_active`, this parameter dictates an
            overall timeout on this method call.

    Note: a timeout event is no guarantee at all that the
    deletion request has not reached the API server and is not going
    to be, in fact, honored.

    Example:
        >>> asyncio.run(
        ...     my_db_admin.async_drop_keyspace("app_keyspace")
        ... )
    """

    _keyspace_admin_timeout_ms, _ka_label = _first_valid_timeout(
        (keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (
            self.api_options.timeout_options.keyspace_admin_timeout_ms,
            "keyspace_admin_timeout_ms",
        ),
    )
    _request_timeout_ms, _rt_label = _first_valid_timeout(
        (request_timeout_ms, "request_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
    )
    timeout_manager = MultiCallTimeoutManager(
        overall_timeout_ms=_keyspace_admin_timeout_ms,
        dev_ops_api=True,
        timeout_label=_ka_label,
    )
    logger.info(
        f"dropping keyspace '{name}' on "
        f"'{self._database_id}' (DevOps API), async"
    )
    dk_raw_response = await self._dev_ops_api_commander.async_raw_request(
        http_method=HttpMethod.DELETE,
        additional_path=f"keyspaces/{name}",
        timeout_context=timeout_manager.remaining_timeout(
            cap_time_ms=_request_timeout_ms,
            cap_timeout_label=_rt_label,
        ),
    )
    if dk_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_ACCEPTED:
        raise DevOpsAPIException(
            f"keyspace deletion ('{name}') failed: API returned HTTP "
            f"{dk_raw_response.status_code} instead of "
            f"{DEV_OPS_RESPONSE_HTTP_ACCEPTED} - Accepted"
        )
    logger.info(
        f"DevOps API returned from dropping keyspace "
        f"'{name}' on '{self._database_id}', async"
    )
    if wait_until_active:
        last_status_seen = DEV_OPS_DATABASE_STATUS_MAINTENANCE
        while last_status_seen == DEV_OPS_DATABASE_STATUS_MAINTENANCE:
            logger.info(
                f"sleeping to poll for status of '{self._database_id}', async"
            )
            await asyncio.sleep(DEV_OPS_KEYSPACE_POLL_INTERVAL_S)
            last_db_info = await self._astra_db_admin._async_database_info_ctx(
                id=self._database_id,
                timeout_context=timeout_manager.remaining_timeout(
                    cap_time_ms=_request_timeout_ms,
                    cap_timeout_label=_rt_label,
                ),
            )
            last_status_seen = last_db_info.status
        if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE:
            raise DevOpsAPIException(
                f"Database entered unexpected status "
                f"{last_status_seen} after MAINTENANCE."
            )
        # is the keyspace found?
        if name in await self.async_list_keyspaces():
            raise DevOpsAPIException("Could not drop the keyspace.")
    logger.info(
        f"finished dropping keyspace '{name}' on "
        f"'{self._database_id}' (DevOps API), async"
    )
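For reference, a minimal sketch of the non-blocking variant of this call, assuming an AstraDBDatabaseAdmin bound to the name my_db_admin and an existing keyspace "app_keyspace" (the polling interval is an arbitrary choice here):
>>> import asyncio
>>> async def drop_and_poll(db_admin: AstraDBDatabaseAdmin) -> None:
...     # return as soon as the DevOps API has accepted the deletion request
...     await db_admin.async_drop_keyspace(
...         "app_keyspace",
...         wait_until_active=False,
...     )
...     # then poll manually until the database leaves the MAINTENANCE state
...     while (await db_admin.async_info()).status != "ACTIVE":
...         await asyncio.sleep(2.0)
...
>>> asyncio.run(drop_and_poll(my_db_admin))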
async def async_find_embedding_providers(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> FindEmbeddingProvidersResult
-
Query the API for the full information on available embedding providers. Async version of the method, for use in an asyncio context.
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
A FindEmbeddingProvidersResult object with the complete information returned by the API about available embedding providers.
Example (output abridged and indented for clarity):
>>> admin_for_my_db.find_embedding_providers()
FindEmbeddingProvidersResult(embedding_providers=..., openai, ...)
>>> admin_for_my_db.find_embedding_providers().embedding_providers
{
    'openai': EmbeddingProvider(
        display_name='OpenAI',
        models=[
            EmbeddingProviderModel(name='text-embedding-3-small'),
            ...
        ]
    ),
    ...
}
Expand source code
async def async_find_embedding_providers( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> FindEmbeddingProvidersResult: """ Query the API for the full information on available embedding providers. Async version of the method, for use in an asyncio context. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers Example (output abridged and indented for clarity): >>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... } """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("findEmbeddingProviders, async") fe_response = await self._api_commander.async_request( payload={"findEmbeddingProviders": {}}, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), ) if "embeddingProviders" not in fe_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findEmbeddingProviders API command.", raw_response=fe_response, ) else: logger.info("finished findEmbeddingProviders, async") return FindEmbeddingProvidersResult._from_dict(fe_response["status"])
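As an illustration of consuming the result, here is a sketch that flattens the providers map into "provider/model" strings (assuming the admin object is named admin_for_my_db, as in the examples above):
>>> import asyncio
>>> async def list_vectorize_models(db_admin: AstraDBDatabaseAdmin) -> list[str]:
...     result = await db_admin.async_find_embedding_providers()
...     # result.embedding_providers maps provider names to EmbeddingProvider objects
...     return [
...         f"{provider_name}/{model.name}"
...         for provider_name, provider in result.embedding_providers.items()
...         for model in provider.models
...     ]
...
>>> asyncio.run(list_vectorize_models(admin_for_my_db))
['openai/text-embedding-3-small', ...]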
async def async_info(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AstraDBAdminDatabaseInfo
-
Query the DevOps API for the full info on this database. Async version of the method, for use in an asyncio context.
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
An AstraDBAdminDatabaseInfo object.
Example
>>> async def wait_until_active(db_admin: AstraDBDatabaseAdmin) -> None:
...     while True:
...         info = await db_admin.async_info()
...         if info.status == "ACTIVE":
...             return
...
>>> asyncio.run(wait_until_active(admin_for_my_db))
Expand source code
async def async_info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBAdminDatabaseInfo: """ Query the DevOps API for the full info on this database. Async version of the method, for use in an asyncio context. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: An AstraDBAdminDatabaseInfo object. Example: >>> async def wait_until_active(db_admin: AstraDBDatabaseAdmin) -> None: ... while True: ... info = await db_admin.async_info() ... if info.status == "ACTIVE": ... return ... >>> asyncio.run(wait_until_active(admin_for_my_db)) """ logger.info(f"getting info ('{self._database_id}'), async") req_response = await self._astra_db_admin.async_database_info( id=self._database_id, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished getting info ('{self._database_id}'), async") return req_response
async def async_list_keyspaces(self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
Query the DevOps API for a list of the keyspaces in the database. Async version of the method, for use in an asyncio context.
Args
keyspace_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for keyspace_admin_timeout_ms.
timeout_ms
- an alias for keyspace_admin_timeout_ms.
Returns
A list of the keyspaces, each a string, in no particular order.
Example
>>> async def check_if_ks_exists(
...     db_admin: AstraDBDatabaseAdmin, keyspace: str
... ) -> bool:
...     ks_list = await db_admin.async_list_keyspaces()
...     return keyspace in ks_list
...
>>> asyncio.run(check_if_ks_exists(admin_for_my_db, "dragons"))
False
>>> asyncio.run(check_if_ks_exists(admin_for_my_db, "app_keyspace"))
True
Expand source code
async def async_list_keyspaces(
    self,
    *,
    keyspace_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> list[str]:
    """
    Query the DevOps API for a list of the keyspaces in the database.
    Async version of the method, for use in an asyncio context.

    Args:
        keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request,
            hence all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `keyspace_admin_timeout_ms`.
        timeout_ms: an alias for `keyspace_admin_timeout_ms`.

    Returns:
        A list of the keyspaces, each a string, in no particular order.

    Example:
        >>> async def check_if_ks_exists(
        ...     db_admin: AstraDBDatabaseAdmin, keyspace: str
        ... ) -> bool:
        ...     ks_list = await db_admin.async_list_keyspaces()
        ...     return keyspace in ks_list
        ...
        >>> asyncio.run(check_if_ks_exists(admin_for_my_db, "dragons"))
        False
        >>> asyncio.run(check_if_ks_exists(admin_for_my_db, "app_keyspace"))
        True
    """

    logger.info(f"getting keyspaces ('{self._database_id}'), async")
    info = await self.async_info(
        database_admin_timeout_ms=keyspace_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    logger.info(f"finished getting keyspaces ('{self._database_id}'), async")
    if info.raw is None:
        raise DevOpsAPIException("Could not get the keyspace list.")
    else:
        return info.raw.get("info", {}).get("keyspaces") or []
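Building on this method, one can write a small "ensure the keyspace exists" helper; a sketch (create_keyspace is documented as a no-op for a pre-existing keyspace, so the membership check mainly avoids the slower creation round-trip):
>>> import asyncio
>>> async def ensure_keyspace(db_admin: AstraDBDatabaseAdmin, keyspace: str) -> None:
...     # listing keyspaces is a single, cheap DevOps API call
...     if keyspace not in await db_admin.async_list_keyspaces():
...         await db_admin.async_create_keyspace(keyspace)
...
>>> asyncio.run(ensure_keyspace(admin_for_my_db, "app_keyspace"))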
def create_keyspace(self, name: str, *, wait_until_active: bool = True, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any) ‑> None
-
Create a keyspace in this database as requested, optionally waiting for it to be ready.
Args
name
- the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op.
wait_until_active
- if True (default), the method returns only after the target database is in ACTIVE state again (a few seconds, usually). If False, it will return right after issuing the creation request to the DevOps API, and it will be responsibility of the caller to check the database status/keyspace availability before working with it.
update_db_keyspace
- if True, the Database or AsyncDatabase class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns.
keyspace_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the database during keyspace creation.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and keyspace_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored.
Example
>>> my_db_admin.list_keyspaces()
['default_keyspace']
>>> my_db_admin.create_keyspace("that_other_one")
>>> my_db_admin.list_keyspaces()
['default_keyspace', 'that_other_one']
Expand source code
def create_keyspace(
    self,
    name: str,
    *,
    wait_until_active: bool = True,
    update_db_keyspace: bool | None = None,
    keyspace_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
    **kwargs: Any,
) -> None:
    """
    Create a keyspace in this database as requested,
    optionally waiting for it to be ready.

    Args:
        name: the keyspace name. If supplying a keyspace that exists
            already, the method call proceeds as usual, no errors are
            raised, and the whole invocation is a no-op.
        wait_until_active: if True (default), the method returns only after
            the target database is in ACTIVE state again (a few seconds,
            usually). If False, it will return right after issuing the
            creation request to the DevOps API, and it will be responsibility
            of the caller to check the database status/keyspace availability
            before working with it.
        update_db_keyspace: if True, the `Database` or `AsyncDatabase` class
            that spawned this DatabaseAdmin, if any, gets updated to work on
            the newly-created keyspace starting when this method returns.
        keyspace_admin_timeout_ms: a timeout, in milliseconds, for the whole
            requested operation to complete. This is used only
            if `wait_until_active` is true, i.e. if the method call must
            wait and keep querying the DevOps API for the status of the
            database during keyspace creation.
        request_timeout_ms: a timeout, in milliseconds, for each underlying
            DevOps API HTTP request.
        timeout_ms: an alias for *both* the `request_timeout_ms` and
            `keyspace_admin_timeout_ms` timeout parameters. In practice,
            regardless of `wait_until_active`, this parameter dictates an
            overall timeout on this method call.

    Note: a timeout event is no guarantee at all that the
    creation request has not reached the API server and is not going
    to be, in fact, honored.

    Example:
        >>> my_db_admin.list_keyspaces()
        ['default_keyspace']
        >>> my_db_admin.create_keyspace("that_other_one")
        >>> my_db_admin.list_keyspaces()
        ['default_keyspace', 'that_other_one']
    """

    _keyspace_admin_timeout_ms, _ka_label = _first_valid_timeout(
        (keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (
            self.api_options.timeout_options.keyspace_admin_timeout_ms,
            "keyspace_admin_timeout_ms",
        ),
    )
    _request_timeout_ms, _rt_label = _first_valid_timeout(
        (request_timeout_ms, "request_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
    )
    timeout_manager = MultiCallTimeoutManager(
        overall_timeout_ms=_keyspace_admin_timeout_ms,
        dev_ops_api=True,
        timeout_label=_ka_label,
    )
    logger.info(
        f"creating keyspace '{name}' on "
        f"'{self._database_id}' (DevOps API)"
    )
    cn_raw_response = self._dev_ops_api_commander.raw_request(
        http_method=HttpMethod.POST,
        additional_path=f"keyspaces/{name}",
        timeout_context=timeout_manager.remaining_timeout(
            cap_time_ms=_request_timeout_ms,
            cap_timeout_label=_rt_label,
        ),
    )
    if cn_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_CREATED:
        raise DevOpsAPIException(
            f"keyspace creation ('{name}') failed: API returned HTTP "
            f"{cn_raw_response.status_code} instead of "
            f"{DEV_OPS_RESPONSE_HTTP_CREATED} - Created."
        )
    logger.info(
        "DevOps API returned from creating keyspace "
        f"'{name}' on '{self._database_id}'"
    )
    if wait_until_active:
        last_status_seen = DEV_OPS_DATABASE_STATUS_MAINTENANCE
        while last_status_seen == DEV_OPS_DATABASE_STATUS_MAINTENANCE:
            logger.info(f"sleeping to poll for status of '{self._database_id}'")
            time.sleep(DEV_OPS_KEYSPACE_POLL_INTERVAL_S)
            last_status_seen = self._astra_db_admin._database_info_ctx(
                id=self._database_id,
                timeout_context=timeout_manager.remaining_timeout(
                    cap_time_ms=_request_timeout_ms,
                    cap_timeout_label=_rt_label,
                ),
            ).status
        if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE:
            raise DevOpsAPIException(
                f"Database entered unexpected status "
                f"{last_status_seen} after MAINTENANCE."
            )
        # is the keyspace found?
        if name not in self.list_keyspaces():
            raise DevOpsAPIException("Could not create the keyspace.")
    logger.info(
        f"finished creating keyspace '{name}' on "
        f"'{self._database_id}' (DevOps API)"
    )
    if update_db_keyspace:
        self.spawner_database.use_keyspace(name)
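A sketch of the update_db_keyspace mechanics, assuming a DataAPIClient named my_client and that this admin was spawned from a Database through its get_database_admin method (so that the spawning Database is switched over automatically):
>>> my_db = my_client.get_database("https://01234567-...")
>>> my_db_admin = my_db.get_database_admin()
>>> my_db.keyspace
'default_keyspace'
>>> my_db_admin.create_keyspace("analytics", update_db_keyspace=True)
>>> my_db.keyspace
'analytics'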
def drop(self, *, wait_until_active: bool = True, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop this database, i.e. delete it completely and permanently with all its data.
This method wraps the drop_database method of the AstraDBAdmin class, where more information may be found.
Args
wait_until_active
- if True (default), the method returns only after the database has actually been deleted (generally a few minutes). If False, it will return right after issuing the drop request to the DevOps API, and it will be responsibility of the caller to check the database status/availability after that, if desired.
database_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the newly-deleted database.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and database_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored.
Example
>>> my_db_admin.list_keyspaces()
['default_keyspace', 'that_other_one']
>>> my_db_admin.drop()
>>> my_db_admin.list_keyspaces()  # raises a 404 Not Found http error
Note
Once the method succeeds, methods on this object, such as info() or list_keyspaces(), can still be invoked: however, this hardly makes sense as the underlying actual database is no more. It is responsibility of the developer to design a correct flow which avoids using a deceased database any further.
Expand source code
def drop(
    self,
    *,
    wait_until_active: bool = True,
    database_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Drop this database, i.e. delete it completely and permanently with all its data.
    This method wraps the `drop_database` method of the AstraDBAdmin class,
    where more information may be found.

    Args:
        wait_until_active: if True (default), the method returns only after
            the database has actually been deleted (generally a few minutes).
            If False, it will return right after issuing the drop request
            to the DevOps API, and it will be responsibility of the caller
            to check the database status/availability after that, if desired.
        database_admin_timeout_ms: a timeout, in milliseconds, for the whole
            requested operation to complete. This is used only
            if `wait_until_active` is true, i.e. if the method call must
            wait and keep querying the DevOps API for the status of the
            newly-deleted database.
        request_timeout_ms: a timeout, in milliseconds, for each underlying
            DevOps API HTTP request.
        timeout_ms: an alias for *both* the `request_timeout_ms` and
            `database_admin_timeout_ms` timeout parameters. In practice,
            regardless of `wait_until_active`, this parameter dictates an
            overall timeout on this method call.

    Note: a timeout event is no guarantee at all that the
    deletion request has not reached the API server and is not going
    to be, in fact, honored.

    Example:
        >>> my_db_admin.list_keyspaces()
        ['default_keyspace', 'that_other_one']
        >>> my_db_admin.drop()
        >>> my_db_admin.list_keyspaces()  # raises a 404 Not Found http error

    Note:
        Once the method succeeds, methods on this object -- such as `info()`,
        or `list_keyspaces()` -- can still be invoked: however, this hardly
        makes sense as the underlying actual database is no more.
        It is responsibility of the developer to design a correct flow
        which avoids using a deceased database any further.
    """

    logger.info(f"dropping this database ('{self._database_id}')")
    drop_result = self._astra_db_admin.drop_database(
        id=self._database_id,
        wait_until_active=wait_until_active,
        database_admin_timeout_ms=database_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    logger.info(f"finished dropping this database ('{self._database_id}')")
    return drop_result
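Given that the deletion is irreversible, a guarded wrapper may be prudent; a plain-Python sketch (the confirmation scheme is illustrative, not part of astrapy):
>>> def confirmed_drop(db_admin: AstraDBDatabaseAdmin) -> None:
...     # require an explicit interactive confirmation before the permanent drop
...     if input("Type DROP to delete the database: ") == "DROP":
...         db_admin.drop()
...     else:
...         print("aborted.")
...
>>> confirmed_drop(my_db_admin)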
def drop_keyspace(self, name: str, *, wait_until_active: bool = True, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Delete a keyspace from the database, optionally waiting for the database to become active again.
Args
name
- the keyspace to delete. If it does not exist in this database, an error is raised.
wait_until_active
- if True (default), the method returns only after the target database is in ACTIVE state again (a few seconds, usually). If False, it will return right after issuing the deletion request to the DevOps API, and it will be responsibility of the caller to check the database status/keyspace availability before working with it.
keyspace_admin_timeout_ms
- a timeout, in milliseconds, for the whole requested operation to complete. This is used only if wait_until_active is true, i.e. if the method call must wait and keep querying the DevOps API for the status of the database during keyspace deletion.
request_timeout_ms
- a timeout, in milliseconds, for each underlying DevOps API HTTP request.
timeout_ms
- an alias for both the request_timeout_ms and keyspace_admin_timeout_ms timeout parameters. In practice, regardless of wait_until_active, this parameter dictates an overall timeout on this method call.
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored.
Example
>>> my_db_admin.list_keyspaces()
['default_keyspace', 'that_other_one']
>>> my_db_admin.drop_keyspace("that_other_one")
>>> my_db_admin.list_keyspaces()
['default_keyspace']
Expand source code
def drop_keyspace(
    self,
    name: str,
    *,
    wait_until_active: bool = True,
    keyspace_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Delete a keyspace from the database, optionally waiting for the database
    to become active again.

    Args:
        name: the keyspace to delete. If it does not exist in this database,
            an error is raised.
        wait_until_active: if True (default), the method returns only after
            the target database is in ACTIVE state again (a few seconds,
            usually). If False, it will return right after issuing the
            deletion request to the DevOps API, and it will be responsibility
            of the caller to check the database status/keyspace availability
            before working with it.
        keyspace_admin_timeout_ms: a timeout, in milliseconds, for the whole
            requested operation to complete. This is used only
            if `wait_until_active` is true, i.e. if the method call must
            wait and keep querying the DevOps API for the status of the
            database during keyspace deletion.
        request_timeout_ms: a timeout, in milliseconds, for each underlying
            DevOps API HTTP request.
        timeout_ms: an alias for *both* the `request_timeout_ms` and
            `keyspace_admin_timeout_ms` timeout parameters. In practice,
            regardless of `wait_until_active`, this parameter dictates an
            overall timeout on this method call.

    Note: a timeout event is no guarantee at all that the
    deletion request has not reached the API server and is not going
    to be, in fact, honored.

    Example:
        >>> my_db_admin.list_keyspaces()
        ['default_keyspace', 'that_other_one']
        >>> my_db_admin.drop_keyspace("that_other_one")
        >>> my_db_admin.list_keyspaces()
        ['default_keyspace']
    """

    _keyspace_admin_timeout_ms, _ka_label = _first_valid_timeout(
        (keyspace_admin_timeout_ms, "keyspace_admin_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (
            self.api_options.timeout_options.keyspace_admin_timeout_ms,
            "keyspace_admin_timeout_ms",
        ),
    )
    _request_timeout_ms, _rt_label = _first_valid_timeout(
        (request_timeout_ms, "request_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
    )
    timeout_manager = MultiCallTimeoutManager(
        overall_timeout_ms=_keyspace_admin_timeout_ms,
        dev_ops_api=True,
        timeout_label=_ka_label,
    )
    logger.info(
        f"dropping keyspace '{name}' on "
        f"'{self._database_id}' (DevOps API)"
    )
    dk_raw_response = self._dev_ops_api_commander.raw_request(
        http_method=HttpMethod.DELETE,
        additional_path=f"keyspaces/{name}",
        timeout_context=timeout_manager.remaining_timeout(
            cap_time_ms=_request_timeout_ms,
            cap_timeout_label=_rt_label,
        ),
    )
    if dk_raw_response.status_code != DEV_OPS_RESPONSE_HTTP_ACCEPTED:
        raise DevOpsAPIException(
            f"keyspace deletion ('{name}') failed: API returned HTTP "
            f"{dk_raw_response.status_code} instead of "
            f"{DEV_OPS_RESPONSE_HTTP_ACCEPTED} - Accepted"
        )
    logger.info(
        "DevOps API returned from dropping keyspace "
        f"'{name}' on '{self._database_id}'"
    )
    if wait_until_active:
        last_status_seen = DEV_OPS_DATABASE_STATUS_MAINTENANCE
        while last_status_seen == DEV_OPS_DATABASE_STATUS_MAINTENANCE:
            logger.info(f"sleeping to poll for status of '{self._database_id}'")
            time.sleep(DEV_OPS_KEYSPACE_POLL_INTERVAL_S)
            last_status_seen = self._astra_db_admin._database_info_ctx(
                id=self._database_id,
                timeout_context=timeout_manager.remaining_timeout(
                    cap_time_ms=_request_timeout_ms,
                    cap_timeout_label=_rt_label,
                ),
            ).status
        if last_status_seen != DEV_OPS_DATABASE_STATUS_ACTIVE:
            raise DevOpsAPIException(
                f"Database entered unexpected status "
                f"{last_status_seen} after MAINTENANCE."
            )
        # is the keyspace found?
        if name in self.list_keyspaces():
            raise DevOpsAPIException("Could not drop the keyspace.")
    logger.info(
        f"finished dropping keyspace '{name}' on "
        f"'{self._database_id}' (DevOps API)"
    )
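Since the method raises an error for a non-existent keyspace, an idempotent teardown can consult list_keyspaces first; a minimal sketch:
>>> def drop_keyspace_if_exists(db_admin: AstraDBDatabaseAdmin, keyspace: str) -> bool:
...     # returns True only if a deletion was actually issued
...     if keyspace in db_admin.list_keyspaces():
...         db_admin.drop_keyspace(keyspace)
...         return True
...     return False
...
>>> drop_keyspace_if_exists(my_db_admin, "that_other_one")
True
>>> drop_keyspace_if_exists(my_db_admin, "that_other_one")
False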
def find_embedding_providers(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> FindEmbeddingProvidersResult
-
Query the API for the full information on available embedding providers.
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
A FindEmbeddingProvidersResult object with the complete information returned by the API about available embedding providers.
Example (output abridged and indented for clarity):
>>> admin_for_my_db.find_embedding_providers()
FindEmbeddingProvidersResult(embedding_providers=..., openai, ...)
>>> admin_for_my_db.find_embedding_providers().embedding_providers
{
    'openai': EmbeddingProvider(
        display_name='OpenAI',
        models=[
            EmbeddingProviderModel(name='text-embedding-3-small'),
            ...
        ]
    ),
    ...
}
Expand source code
def find_embedding_providers( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> FindEmbeddingProvidersResult: """ Query the API for the full information on available embedding providers. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers Example (output abridged and indented for clarity): >>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... } """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("findEmbeddingProviders") fe_response = self._api_commander.request( payload={"findEmbeddingProviders": {}}, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), ) if "embeddingProviders" not in fe_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findEmbeddingProviders API command.", raw_response=fe_response, ) else: logger.info("finished findEmbeddingProviders") return FindEmbeddingProvidersResult._from_dict(fe_response["status"])
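For example, to find out which providers offer a given model, one can scan the returned structure; a sketch using the documented attributes:
>>> def providers_with_model(db_admin: AstraDBDatabaseAdmin, model_name: str) -> list[str]:
...     providers = db_admin.find_embedding_providers().embedding_providers
...     # keep the providers whose model list mentions the requested name
...     return [
...         provider_name
...         for provider_name, provider in providers.items()
...         if any(model.name == model_name for model in provider.models)
...     ]
...
>>> providers_with_model(admin_for_my_db, "text-embedding-3-small")
['openai']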
def get_async_database(self, *, keyspace: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncDatabase
-
Create an AsyncDatabase instance from this database admin, for data-related tasks.
Args
keyspace
- an optional keyspace to set in the resulting AsyncDatabase. The same default logic as for AstraDBAdmin.get_database() applies.
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
token
- if supplied, is passed to the AsyncDatabase instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
An AsyncDatabase object, ready to be used for working with data and collections.
Expand source code
def get_async_database( self, *, keyspace: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create an AsyncDatabase instance from this database admin, for data-related tasks. Args: keyspace: an optional keyspace to set in the resulting AsyncDatabase. The same default logic as for `AstraDBAdmin.get_database` applies. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. token: if supplied, is passed to the AsyncDatabase instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: An AsyncDatabase object, ready to be used for working with data and collections. """ return self._astra_db_admin.get_database( api_endpoint=self.api_endpoint, token=token, keyspace=keyspace, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, spawn_api_options=spawn_api_options, ).to_async()
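A usage sketch for this method (the keyspace and the least-privilege token are illustrative values):
>>> import asyncio
>>> async def count_collections(db_admin: AstraDBDatabaseAdmin) -> int:
...     async_db = db_admin.get_async_database(
...         keyspace="default_keyspace",
...         token="AstraCS:data-only...",  # narrower, non-admin token
...     )
...     return len(await async_db.list_collection_names())
...
>>> asyncio.run(count_collections(admin_for_my_db))
2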
def get_database(self, *, keyspace: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Database
-
Create a Database instance from this database admin, for data-related tasks.
Args
keyspace
- an optional keyspace to set in the resulting Database. The same default logic as for AstraDBAdmin.get_database() applies.
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
token
- if supplied, is passed to the Database instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
A Database object, ready to be used for working with data and collections.
Example
>>> my_db = my_db_admin.get_database()
>>> my_db.list_collection_names()
['movies', 'another_collection']
Note
creating an instance of Database does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class.
Expand source code
def get_database( self, *, keyspace: str | None = None, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a Database instance from this database admin, for data-related tasks. Args: keyspace: an optional keyspace to set in the resulting Database. The same default logic as for `AstraDBAdmin.get_database` applies. database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request for 'region', should it be necessary. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. token: if supplied, is passed to the Database instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: A Database object, ready to be used for working with data and collections. Example: >>> my_db = my_db_admin.get_database() >>> my_db.list_collection_names() ['movies', 'another_collection'] Note: creating an instance of Database does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class. """ return self._astra_db_admin.get_database( api_endpoint=self.api_endpoint, token=token, keyspace=keyspace, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, spawn_api_options=spawn_api_options, )
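A sketch of the least-privilege pattern mentioned in the token description, spawning a Database on a narrower token (the token value is illustrative):
>>> data_db = my_db_admin.get_database(
...     keyspace="default_keyspace",
...     token="AstraCS:data-only...",
... )
>>> data_db.list_collection_names()
['movies', 'another_collection']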
def info(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AstraDBAdminDatabaseInfo
-
Query the DevOps API for the full info on this database.
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
An AstraDBAdminDatabaseInfo object.
Example
>>> my_db_info = admin_for_my_db.info()
>>> my_db_info.status
'ACTIVE'
>>> my_db_info.info.region
'us-east1'
Expand source code
def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBAdminDatabaseInfo: """ Query the DevOps API for the full info on this database. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: An AstraDBAdminDatabaseInfo object. Example: >>> my_db_info = admin_for_my_db.info() >>> my_db_info.status 'ACTIVE' >>> my_db_info.info.region 'us-east1' """ logger.info(f"getting info ('{self._database_id}')") req_response = self._astra_db_admin.database_info( id=self._database_id, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished getting info ('{self._database_id}')") return req_response
def list_keyspaces(self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
Query the DevOps API for a list of the keyspaces in the database.
Args
keyspace_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for keyspace_admin_timeout_ms.
timeout_ms
- an alias for keyspace_admin_timeout_ms.
Returns
A list of the keyspaces, each a string, in no particular order.
Example
>>> admin_for_my_db.list_keyspaces()
['default_keyspace', 'staging_keyspace']
Expand source code
def list_keyspaces( self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ Query the DevOps API for a list of the keyspaces in the database. Args: keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Returns: A list of the keyspaces, each a string, in no particular order. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'staging_keyspace'] """ logger.info(f"getting keyspaces ('{self._database_id}')") info = self.info( database_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished getting keyspaces ('{self._database_id}')") if info.raw is None: raise DevOpsAPIException("Could not get the keyspace list.") else: return info.raw.get("info", {}).get("keyspaces") or []
def with_options(self, *, token: str | TokenProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> AstraDBDatabaseAdmin
-
Create a clone of this AstraDBDatabaseAdmin with some changed attributes.
Args
token
- an Access Token to the database. Example: "AstraCS:xyz...". This can be either a literal token string or a subclass of TokenProvider.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new AstraDBDatabaseAdmin instance.
Example
>>> admin_for_my_other_db = admin_for_my_db.with_options(
...     token="AstraCS:xyz...",
... )
Expand source code
def with_options(
    self,
    *,
    token: str | TokenProvider | UnsetType = _UNSET,
    api_options: APIOptions | UnsetType = _UNSET,
) -> AstraDBDatabaseAdmin:
    """
    Create a clone of this AstraDBDatabaseAdmin with some changed attributes.

    Args:
        token: an Access Token to the database. Example: `"AstraCS:xyz..."`.
            This can be either a literal token string or a subclass of
            `astrapy.authentication.TokenProvider`.
        api_options: any additional options to set for the clone, in the form
            of an APIOptions instance (where one can set just the needed
            attributes). In case the same setting is also provided as
            named parameter, the latter takes precedence.

    Returns:
        a new AstraDBDatabaseAdmin instance.

    Example:
        >>> admin_for_my_other_db = admin_for_my_db.with_options(
        ...     token="AstraCS:xyz...",
        ... )
    """

    return self._copy(
        token=token,
        api_options=api_options,
    )
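A concrete sketch of cloning this admin onto a different token and verifying that the clone operates independently (the token value is illustrative):
>>> other_admin = admin_for_my_db.with_options(
...     token="AstraCS:another-token...",
... )
>>> other_admin.list_keyspaces()
['default_keyspace', 'staging_keyspace']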
class AsyncCollection (*, database: AsyncDatabase, name: str, keyspace: str | None, api_options: FullAPIOptions)
-
A Data API collection, the object to interact with the Data API for unstructured (schemaless) data, especially for DDL operations. This class has an asynchronous interface for use with asyncio.
This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as get_collection of AsyncDatabase, wherefrom the AsyncCollection inherits its API options such as authentication token and API endpoint.
Args
database
- a Database object, instantiated earlier. This represents the database the collection belongs to.
name
- the collection name. This parameter should match an existing collection on the database.
keyspace
- this is the keyspace to which the collection belongs. If nothing is specified, the database's working keyspace is used.
api_options
- a complete specification of the API Options for this instance.
Examples
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> from astrapy import DataAPIClient
>>> client = DataAPIClient()
>>> async_database = client.get_async_database(
...     "https://01234567-....apps.astra.datastax.com",
...     token="AstraCS:..."
... )
>>> # Create a collection using the fluent syntax for its definition
>>> from astrapy.constants import VectorMetric
>>> from astrapy.info import CollectionDefinition
>>>
>>> collection_definition = (
...     CollectionDefinition.builder()
...     .set_vector_dimension(3)
...     .set_vector_metric(VectorMetric.DOT_PRODUCT)
...     .set_indexing("deny", ["annotations", "logs"])
...     .build()
... )
>>> my_collection = await async_database.create_collection(
...     "my_events",
...     definition=collection_definition,
... )
>>> # Create a collection with the definition as object
>>> from astrapy.info import CollectionVectorOptions
>>>
>>> collection_definition_1 = CollectionDefinition(
...     vector=CollectionVectorOptions(
...         dimension=3,
...         metric=VectorMetric.DOT_PRODUCT,
...     ),
...     indexing={"deny": ["annotations", "logs"]},
... )
>>> my_collection_1 = await async_database.create_collection(
...     "my_events",
...     definition=collection_definition_1,
... )
>>> # Create a collection with the definition as plain dictionary
>>> collection_definition_2 = {
...     "indexing": {"deny": ["annotations", "logs"]},
...     "vector": {
...         "dimension": 3,
...         "metric": VectorMetric.DOT_PRODUCT,
...     },
... }
>>> my_collection_2 = await async_database.create_collection(
...     "my_events",
...     definition=collection_definition_2,
... )
>>> # Get a reference to an existing collection
>>> # (no checks are performed on DB)
>>> my_collection_3a = async_database.get_collection("my_events")
>>> my_collection_3b = async_database.my_events
>>> my_collection_3c = async_database["my_events"]
Note
creating an instance of AsyncCollection does not trigger actual creation of the collection on the database. The latter should have been created beforehand, e.g. through the create_collection method of an AsyncDatabase.
Expand source code
class AsyncCollection(Generic[DOC]): """ A Data API collection, the object to interact with the Data API for unstructured (schemaless) data, especially for DDL operations. This class has an asynchronous interface for use with asyncio. This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as `get_collection` of AsyncDatabase, wherefrom the AsyncCollection inherits its API options such as authentication token and API endpoint. Args: database: a Database object, instantiated earlier. This represents the database the collection belongs to. name: the collection name. This parameter should match an existing collection on the database. keyspace: this is the keyspace to which the collection belongs. If nothing is specified, the database's working keyspace is used. api_options: a complete specification of the API Options for this instance. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy import DataAPIClient >>> client = DataAPIClient() >>> async_database = client.get_async_database( ... "https://01234567-....apps.astra.datastax.com", ... token="AstraCS:..." ... ) >>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = await async_database.create_collection( ... "my_events", ... definition=collection_definition, ... ) >>> >>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ... ) >>> my_collection_1 = await async_database.create_collection( ... "my_events", ... definition=collection_definition_1, ... ) >>> >>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = await async_database.create_collection( ... "my_events", ... definition=collection_definition_2, ... ) >>> # Get a reference to an existing collection >>> # (no checks are performed on DB) >>> my_collection_3a = async_database.get_collection("my_events") >>> my_collection_3b = async_database.my_events >>> my_collection_3c = async_database["my_events"] Note: creating an instance of AsyncCollection does not trigger actual creation of the collection on the database. The latter should have been created beforehand, e.g. through the `create_collection` method of an AsyncDatabase. 
""" def __init__( self, *, database: AsyncDatabase, name: str, keyspace: str | None, api_options: FullAPIOptions, ) -> None: self.api_options = api_options self._name = name _keyspace = keyspace if keyspace is not None else database.keyspace if _keyspace is None: raise ValueError("Attempted to create Collection with 'keyspace' unset.") self._database = database._copy( keyspace=_keyspace, api_options=self.api_options ) self._commander_headers = { **{DEFAULT_DATA_API_AUTH_HEADER: self.api_options.token.get_token()}, **self.api_options.embedding_api_key.get_headers(), **self.api_options.database_additional_headers, } self._api_commander = self._get_api_commander() def __repr__(self) -> str: _db_desc = f'database.api_endpoint="{self.database.api_endpoint}"' return ( f'{self.__class__.__name__}(name="{self.name}", ' f'keyspace="{self.keyspace}", {_db_desc}, ' f"api_options={self.api_options})" ) def __eq__(self, other: Any) -> bool: if isinstance(other, AsyncCollection): return all( [ self._name == other._name, self._database == other._database, self.api_options == other.api_options, ] ) else: return False def __call__(self, *pargs: Any, **kwargs: Any) -> None: raise TypeError( f"'{self.__class__.__name__}' object is not callable. If you " f"meant to call the '{self.name}' method on a " f"'{self.database.__class__.__name__}' object " "it is failing because no such method exists." ) def _get_api_commander(self) -> APICommander: """Instantiate a new APICommander based on the properties of this class.""" if self._database.keyspace is None: raise ValueError( "No keyspace specified. AsyncCollection requires a keyspace to " "be set, e.g. through the `keyspace` constructor parameter." ) base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self._database.api_options.data_api_url_options.api_path, self._database.api_options.data_api_url_options.api_version, self._database.keyspace, self._name, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" api_commander = APICommander( api_endpoint=self._database.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, handle_decimals_writes=( self.api_options.serdes_options.use_decimals_in_collections ), handle_decimals_reads=( self.api_options.serdes_options.use_decimals_in_collections ), ) return api_commander async def __aenter__(self: AsyncCollection[DOC]) -> AsyncCollection[DOC]: return self async def __aexit__( self, exc_type: type[BaseException] | None = None, exc_value: BaseException | None = None, traceback: TracebackType | None = None, ) -> None: if self._api_commander is not None: await self._api_commander.__aexit__( exc_type=exc_type, exc_value=exc_value, traceback=traceback, ) async def _converted_request( self, *, http_method: str = HttpMethod.POST, payload: dict[str, Any] | None = None, additional_path: str | None = None, request_params: dict[str, Any] = {}, raise_api_errors: bool = True, timeout_context: _TimeoutContext, ) -> dict[str, Any]: converted_payload = preprocess_collection_payload( payload, options=self.api_options.serdes_options ) raw_response_json = await self._api_commander.async_request( http_method=http_method, payload=converted_payload, additional_path=additional_path, request_params=request_params, raise_api_errors=raise_api_errors, timeout_context=timeout_context, ) response_json = postprocess_collection_response( raw_response_json, 
options=self.api_options.serdes_options ) return response_json def _copy( self: AsyncCollection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AsyncCollection( database=self.database, name=self.name, keyspace=self.keyspace, api_options=final_api_options, ) def with_options( self: AsyncCollection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: """ Create a clone of this collection with some changed attributes. Args: embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new AsyncCollection instance. Example: >>> collection_with_api_key_configured = my_async_collection.with_options( ... embedding_api_key="secret-key-0123abcd...", ... ) """ return self._copy( embedding_api_key=embedding_api_key, api_options=api_options, ) def to_sync( self: AsyncCollection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: """ Create a Collection from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this collection in the copy (the database is converted into a sync object). Args: embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, a Collection instance. Example: >>> # NOTE: may require slight adaptation to an async context. 
>>> >>> my_async_coll.to_sync().count_documents({}, upper_bound=100) 77 """ arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Collection( database=self.database.to_sync(), name=self.name, keyspace=self.keyspace, api_options=final_api_options, ) async def options( self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDefinition: """ Get the collection options, i.e. its configuration as read from the database. The method issues a request to the Data API each time is invoked, without caching mechanisms: this ensures up-to-date information for usages such as real-time collection validation by the application. Args: collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a CollectionDefinition instance describing the collection. (See also the database `list_collections` method.) Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.options()) CollectionDefinition(vector=CollectionVectorOptions(dimension=3, metric='cosine')) """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"getting collections in search of '{self.name}'") self_descriptors = [ coll_desc for coll_desc in await self.database._list_collections_ctx( keyspace=None, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label, ), ) if coll_desc.name == self.name ] logger.info(f"finished getting collections in search of '{self.name}'") if self_descriptors: return self_descriptors[0].definition else: raise ValueError( f"Collection {self.keyspace}.{self.name} not found.", ) async def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInfo: """ Information on the collection (name, location, database), in the form of a CollectionInfo object. Not to be confused with the collection `options` method (related to the collection internal configuration). Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.info()).database_info.region 'us-east1' >>> asyncio.run(my_async_coll.info()).full_name 'default_keyspace.my_v_collection' Note: the returned CollectionInfo wraps, among other things, the database information: as such, calling this method triggers the same-named method of a Database object (which, in turn, performs a HTTP request to the DevOps API). See the documentation for `Database.info()` for more details. 
""" db_info = await self.database.info( database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return CollectionInfo( database_info=db_info, keyspace=self.keyspace, name=self.name, full_name=self.full_name, ) @property def database(self) -> AsyncDatabase: """ a Database object, the database this collection belongs to. Example: >>> my_async_coll.database.name 'the_db' """ return self._database @property def keyspace(self) -> str: """ The keyspace this collection is in. Example: >>> my_async_coll.keyspace 'default_keyspace' """ _keyspace = self.database.keyspace if _keyspace is None: raise ValueError("The collection's DB is set with keyspace=None") return _keyspace @property def name(self) -> str: """ The name of this collection. Example: >>> my_async_coll.name 'my_v_collection' """ return self._name @property def full_name(self) -> str: """ The fully-qualified collection name within the database, in the form "keyspace.collection_name". Example: >>> my_async_coll.full_name 'default_keyspace.my_v_collection' """ return f"{self.keyspace}.{self.name}" async def insert_one( self, document: DOC, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInsertOneResult: """ Insert a single document in the collection in an atomic operation. Args: document: the dictionary expressing the document to insert. The `_id` field of the document can be left out, in which case it will be created automatically. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionInsertOneResult object. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def write_and_count(acol: AsyncCollection) -> None: ... count0 = await acol.count_documents({}, upper_bound=10) ... print("count0", count0) ... await acol.insert_one( ... { ... "age": 30, ... "name": "Smith", ... "food": ["pear", "peach"], ... "likes_fruit": True, ... }, ... ) ... await acol.insert_one({"_id": "user-123", "age": 50, "name": "Maccio"}) ... count1 = await acol.count_documents({}, upper_bound=10) ... print("count1", count1) ... >>> asyncio.run(write_and_count(my_async_coll)) count0 0 count1 2 >>> asyncio.run(my_async_coll.insert_one({"tag": v", "$vector": [10, 11]})) CollectionInsertOneResult(...) Note: If an `_id` is explicitly provided, which corresponds to a document that exists already in the collection, an error is raised and the insertion fails. 
""" _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) io_payload = {"insertOne": {"document": document}} logger.info(f"insertOne on '{self.name}'") io_response = await self._converted_request( payload=io_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished insertOne on '{self.name}'") if "insertedIds" in io_response.get("status", {}): if io_response["status"]["insertedIds"]: inserted_id = io_response["status"]["insertedIds"][0] return CollectionInsertOneResult( raw_results=[io_response], inserted_id=inserted_id, ) else: raise ValueError( "Could not complete a insert_one operation. " f"(gotten '${json.dumps(io_response)}')" ) else: raise ValueError( "Could not complete a insert_one operation. " f"(gotten '${json.dumps(io_response)}')" ) async def insert_many( self, documents: Iterable[DOC], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInsertManyResult: """ Insert a list of documents into the collection. This is not an atomic operation. Args: documents: an iterable of dictionaries, each a document to insert. Documents may specify their `_id` field or leave it out, in which case it will be added automatically. ordered: if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster. chunk_size: how many documents to include in a single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default. concurrency: maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionInsertManyResult object. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def write_and_count(acol: AsyncCollection) -> None: ... count0 = await acol.count_documents({}, upper_bound=10) ... print("count0", count0) ... im_result1 = await acol.insert_many( ... [ ... {"a": 10}, ... {"a": 5}, ... {"b": [True, False, False]}, ... ], ... ordered=True, ... ) ... print("inserted1", im_result1.inserted_ids) ... count1 = await acol.count_documents({}, upper_bound=100) ... print("count1", count1) ... await acol.insert_many( ... [{"seq": i} for i in range(50)], ... concurrency=5, ... ) ... count2 = await acol.count_documents({}, upper_bound=100) ... print("count2", count2) ... >>> asyncio.run(write_and_count(my_async_coll)) count0 0 inserted1 ['e3c2a684-...', '1de4949f-...', '167dacc3-...'] count1 3 count2 53 >>> asyncio.run(my_async_coll.insert_many( ... [ ... {"tag": "a", "$vector": [1, 2]}, ... {"tag": "b", "$vector": [3, 4]}, ... ] ... )) CollectionInsertManyResult(...) 
Note: Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the document sequence is important. Note: A failure mode for this command is related to certain faulty documents found among those to insert: a document may have an `_id` already present on the collection, or its vector dimension may not match the collection setting. For an ordered insertion, the method will raise an exception at the first such faulty document -- nevertheless, all documents processed until then will end up being written to the database. For unordered insertions, if the error stems from faulty documents the insertion proceeds until exhausting the input documents: then, an exception is raised -- and all insertable documents will have been written to the database, including those "after" the troublesome ones. If, on the other hand, there are errors not related to individual documents (such as a network connectivity error), the whole `insert_many` operation will stop midway, an exception will be raised, and only some of the input documents will have made their way to the database. """ _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) if concurrency is None: if ordered: _concurrency = 1 else: _concurrency = DEFAULT_INSERT_MANY_CONCURRENCY else: _concurrency = concurrency if _concurrency > 1 and ordered: raise ValueError("Cannot run ordered insert_many concurrently.") if chunk_size is None: _chunk_size = DEFAULT_INSERT_MANY_CHUNK_SIZE else: _chunk_size = chunk_size _documents = list(documents) logger.info(f"inserting {len(_documents)} documents in '{self.name}'") raw_results: list[dict[str, Any]] = [] timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) if ordered: options = {"ordered": True} inserted_ids: list[Any] = [] for i in range(0, len(_documents), _chunk_size): im_payload = { "insertMany": { "documents": _documents[i : i + _chunk_size], "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") chunk_response = await self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") # accumulate the results in this call chunk_inserted_ids = (chunk_response.get("status") or {}).get( "insertedIds", [] ) inserted_ids += chunk_inserted_ids raw_results += [chunk_response] # if errors, quit early if chunk_response.get("errors", []): partial_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) raise CollectionInsertManyException.from_response( command=None, raw_response=chunk_response, partial_result=partial_result, ) # return full_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) logger.info( f"finished inserting {len(_documents)} documents in '{self.name}'" ) return full_result else: # unordered: concurrent or not, do all of them and parse the results options = {"ordered": False} sem = asyncio.Semaphore(_concurrency)
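# The semaphore gates the chunk-insertion coroutines defined below, so that
# at most `_concurrency` insertMany requests are in flight at any one time.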
async def concurrent_insert_chunk( document_chunk: list[DOC], ) -> dict[str, Any]: async with sem: im_payload = { "insertMany": { "documents": document_chunk, "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") im_response = await self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") return im_response if _concurrency > 1: tasks = [ asyncio.create_task( concurrent_insert_chunk(_documents[i : i + _chunk_size]) ) for i in range(0, len(_documents), _chunk_size) ] raw_results = await asyncio.gather(*tasks) else: raw_results = [ await concurrent_insert_chunk(_documents[i : i + _chunk_size]) for i in range(0, len(_documents), _chunk_size) ] # recast raw_results inserted_ids = [ inserted_id for chunk_response in raw_results for inserted_id in (chunk_response.get("status") or {}).get( "insertedIds", [] ) ] # check-raise if any( [chunk_response.get("errors", []) for chunk_response in raw_results] ): partial_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) raise CollectionInsertManyException.from_responses( commands=[None for _ in raw_results], raw_responses=raw_results, partial_result=partial_result, ) # return full_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) logger.info( f"finished inserting {len(_documents)} documents in '{self.name}'" ) return full_result @overload def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncCollectionFindCursor[DOC, DOC]: ... @overload def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: type[DOC2], skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncCollectionFindCursor[DOC, DOC2]: ... def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: type[DOC2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncCollectionFindCursor[DOC, DOC2]: """ Find documents on the collection, matching a certain provided filter. The method returns a Cursor that can then be iterated over. Depending on the method call pattern, the iteration over all documents can reflect collection mutations that occurred since the `find` method was called, or not. In cases where the cursor reflects mutations in real-time, it will iterate over documents in an approximate way (i.e. exhibiting occasional skipped or duplicate documents). This happens when making use of the `sort` option in a non-vector-search manner. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax.
Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. document_type: this parameter acts as a formal specifier for the type checker. If omitted, the resulting cursor is implicitly an `AsyncCollectionFindCursor[DOC, DOC]`, i.e. maintains the same type for the items it returns as that for the documents in the collection. Strictly typed code may want to specify this parameter especially when a projection is given. skip: with this integer parameter, the first `skip` documents that the query would otherwise return are discarded, and the results start from the (skip+1)-th document. This parameter can be used only in conjunction with an explicit `sort` criterion of the ascending/descending type (i.e. it cannot be used when not sorting, nor with vector-based ANN search). limit: this (integer) parameter sets a limit over how many documents are returned. Once `limit` is reached (or the cursor is exhausted for lack of matching documents), nothing more is returned. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned document. Can only be used for vector ANN search, i.e. when the `sort` parameter has the shape {"$vector": ...} (or {"$vectorize": ...}). include_sort_vector: a boolean to request the search query vector. If set to True (and if the invocation is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search. sort: with this dictionary parameter one can control the order the documents are returned. See the Note about sorting, as well as the one about upper bounds, for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. request_timeout_ms: a timeout, in milliseconds, for each single one of the underlying HTTP requests used to fetch documents as the cursor is iterated over. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `request_timeout_ms`. Returns: an AsyncCollectionFindCursor object representing iterations over the matching documents (see the AsyncCollectionFindCursor object for how to use it. The simplest thing is to run a for loop: `for document in collection.find(...):`). Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def run_finds(acol: AsyncCollection) -> None: ... filter = {"seq": {"$exists": True}} ... print("find results 1:") ...
async for doc in acol.find(filter, projection={"seq": True}, limit=5): ... print(doc["seq"]) ... async_cursor1 = acol.find( ... {}, ... limit=4, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) ... ids = [doc["_id"] async for doc in async_cursor1] ... print("find results 2:", ids) ... async_cursor2 = acol.find({}, limit=3) ... seqs = await async_cursor2.distinct("seq") ... print("distinct results 3:", seqs) ... >>> asyncio.run(run_finds(my_async_coll)) find results 1: 48 35 7 11 13 find results 2: ['d656cd9d-...', '479c7ce8-...', '96dc87fd-...', '83f0a21f-...'] distinct results 3: [48, 35, 7] >>> async def run_vector_finds(acol: AsyncCollection) -> None: ... await acol.insert_many([ ... {"tag": "A", "$vector": [4, 5]}, ... {"tag": "B", "$vector": [3, 4]}, ... {"tag": "C", "$vector": [3, 2]}, ... {"tag": "D", "$vector": [4, 1]}, ... {"tag": "E", "$vector": [2, 5]}, ... ]) ... ann_tags = [ ... document["tag"] ... async for document in acol.find( ... {}, ... sort={"$vector": [3, 3]}, ... limit=3, ... ) ... ] ... return ann_tags ... >>> asyncio.run(run_vector_finds(my_async_coll)) ['A', 'B', 'C'] >>> # (assuming the collection has metric VectorMetric.COSINE) >>> async_cursor = my_async_coll.find( ... sort={"$vector": [3, 3]}, ... limit=3, ... include_sort_vector=True, ... ) >>> asyncio.run(async_cursor.get_sort_vector()) [3.0, 3.0] >>> asyncio.run(async_cursor.__anext__()) {'_id': 'b13ce177-738e-47ec-bce1-77738ee7ec93', 'tag': 'A'} >>> asyncio.run(async_cursor.get_sort_vector()) [3.0, 3.0] Note: The following are example values for the `sort` parameter. When no particular order is required: sort={} When sorting by a certain value in ascending/descending order: sort={"field": SortMode.ASCENDING} sort={"field": SortMode.DESCENDING} When sorting first by "field" and then by "subfield" (while modern Python versions preserve the order of dictionaries, it is suggested for clarity to employ a `collections.OrderedDict` in these cases): sort={ "field": SortMode.ASCENDING, "subfield": SortMode.ASCENDING, } When running a vector similarity (ANN) search: sort={"$vector": [0.4, 0.15, -0.5]} Note: Some combinations of arguments impose an implicit upper bound on the number of documents that are returned by the Data API. More specifically: (a) Vector ANN searches cannot return more than a number of documents that at the time of writing is set to 1000 items. (b) When using a sort criterion of the ascending/descending type, the Data API will return a smaller number of documents, set to 20 at the time of writing, and stop there. The returned documents are the top results across the whole collection according to the requested criterion. These provisions should be kept in mind even when subsequently running a command such as `.distinct()` on a cursor. Note: When not specifying sorting criteria at all (by vector or otherwise), the cursor can scroll through an arbitrary number of documents as the Data API and the client periodically exchange new chunks of documents. It should be noted that the behavior of the cursor in the case documents have been added/removed after the `find` was started depends on database internals and it is not guaranteed, nor excluded, that such "real-time" changes in the data would be picked up by the cursor. 
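A closing hedged sketch (a hypothetical snippet; document contents are assumed) of the skip/limit pattern which, per the notes above, requires an explicit ascending/descending sort criterion:
>>> async def paged_find(acol: AsyncCollection) -> None:
...     cursor = acol.find(
...         {"seq": {"$exists": True}},
...         sort={"seq": astrapy.constants.SortMode.ASCENDING},
...         skip=2,
...         limit=3,
...     )
...     print([doc["seq"] async for doc in cursor])
...
>>> asyncio.run(paged_find(my_async_coll))
[2, 3, 4]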
""" # lazy-import here to avoid circular import issues from astrapy.cursors import AsyncCollectionFindCursor _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) return ( AsyncCollectionFindCursor( collection=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=None, request_timeout_label=_rt_label, ) .filter(filter) .project(projection) .skip(skip) .limit(limit) .sort(sort) .include_similarity(include_similarity) .include_sort_vector(include_sort_vector) ) async def find_one( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Run a search, returning the first document in the collection that matches provided filters, if any is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": ...}. sort: with this dictionary parameter one can control the order the documents are returned. See the Note about sorting for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary expressing the required document, otherwise None. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def demo_find_one(acol: AsyncCollection) -> None: .... print("Count:", await acol.count_documents({}, upper_bound=100)) ... result0 = await acol.find_one({}) ... print("result0", result0) ... result1 = await acol.find_one({"seq": 10}) ... 
print("result1", result1) ... result2 = await acol.find_one({"seq": 1011}) ... print("result2", result2) ... result3 = await acol.find_one({}, projection={"seq": False}) ... print("result3", result3) ... result4 = await acol.find_one( ... {}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) ... print("result4", result4) ... >>> >>> asyncio.run(demo_find_one(my_async_coll)) Count: 50 result0 {'_id': '479c7ce8-...', 'seq': 48} result1 {'_id': '93e992c4-...', 'seq': 10} result2 None result3 {'_id': '479c7ce8-...'} result4 {'_id': 'd656cd9d-...', 'seq': 49} >>> asyncio.run(my_async_coll.find_one( ... {}, ... sort={"$vector": [1, 0]}, ... projection={"*": True}, ... )) {'_id': '...', 'tag': 'D', '$vector': [4.0, 1.0]} Note: See the `find` method for more details on the accepted parameters (whereas `skip` and `limit` are not valid parameters for `find_one`). """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) fo_options = ( None if include_similarity is None else {"includeSimilarity": include_similarity} ) fo_payload = { "findOne": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "options": fo_options, "sort": sort, }.items() if v is not None } } fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) if "document" not in (fo_response.get("data") or {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findOne API command.", raw_response=fo_response, ) doc_response = fo_response["data"]["document"] if doc_response is None: return None return fo_response["data"]["document"] # type: ignore[no-any-return] async def distinct( self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[Any]: """ Return a list of the unique values of `key` across the documents in the collection that match the provided filter. Args: key: the name of the field whose value is inspected across documents. Keys can use dot-notation to descend to deeper document levels. Example of acceptable `key` values: "field" "field.subfield" "field.3" "field.3.subfield" If lists are encountered and no numeric index is specified, all items in the list are visited. filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method, being based on `find` (see) may entail successive HTTP API requests, depending on the amount of involved documents. request_timeout_ms: a timeout, in milliseconds, for each API request. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a list of all different values for `key` found across the documents that match the filter. The result list has no repeated items. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def run_distinct(acol: AsyncCollection) -> None: ... await acol.insert_many( ... [ ... 
{"name": "Marco", "food": ["apple", "orange"], "city": "Helsinki"}, ... {"name": "Emma", "food": {"likes_fruit": True, "allergies": []}}, ... ] ... ) ... distinct0 = await acol.distinct("name") ... print("distinct('name')", distinct0) ... distinct1 = await acol.distinct("city") ... print("distinct('city')", distinct1) ... distinct2 = await acol.distinct("food") ... print("distinct('food')", distinct2) ... distinct3 = await acol.distinct("food.1") ... print("distinct('food.1')", distinct3) ... distinct4 = await acol.distinct("food.allergies") ... print("distinct('food.allergies')", distinct4) ... distinct5 = await acol.distinct("food.likes_fruit") ... print("distinct('food.likes_fruit')", distinct5) ... >>> asyncio.run(run_distinct(my_async_coll)) distinct('name') ['Emma', 'Marco'] distinct('city') ['Helsinki'] distinct('food') [{'likes_fruit': True, 'allergies': []}, 'apple', 'orange'] distinct('food.1') ['orange'] distinct('food.allergies') [] distinct('food.likes_fruit') [True] Note: It must be kept in mind that `distinct` is a client-side operation, which effectively browses all required documents using the logic of the `find` method and collects the unique values found for `key`. As such, there may be performance, latency and ultimately billing implications if the amount of matching documents is large. Note: For details on the behaviour of "distinct" in conjunction with real-time changes in the collection contents, see the Note of the `find` command. """ # lazy-import here to avoid circular import issues from astrapy.cursors import AsyncCollectionFindCursor _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) # preparing cursor: _extractor = _create_document_key_extractor(key) _key = _reduce_distinct_key_to_safe(key) if _key == "": raise ValueError( "The 'key' parameter for distinct cannot be empty " "or start with a list index." ) # relaxing the type hint (limited to within this method body) f_cursor: AsyncCollectionFindCursor[dict[str, Any], dict[str, Any]] = ( AsyncCollectionFindCursor( collection=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=_general_method_timeout_ms, request_timeout_label=_rt_label, overall_timeout_label=_gmt_label, ) # type: ignore[assignment] .filter(filter) .project({_key: True}) ) # consuming it: _item_hashes = set() distinct_items: list[Any] = [] logger.info(f"running distinct() on '{self.name}'") async for document in f_cursor: for item in _extractor(document): _item_hash = _hash_document( item, options=self.api_options.serdes_options ) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) logger.info(f"finished running distinct() on '{self.name}'") return distinct_items async def count_documents( self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Count the documents in the collection matching the specified filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. 
Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. upper_bound: a required ceiling on the result of the count operation. If the actual number of documents exceeds this value, an exception will be raised. Furthermore, if the actual number of documents exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: the exact count of matching documents. Example: >>> async def do_count_docs(acol: AsyncCollection) -> None: ... await acol.insert_many([{"seq": i} for i in range(20)]) ... count0 = await acol.count_documents({}, upper_bound=100) ... print("count0", count0) ... count1 = await acol.count_documents( ... {"seq":{"$gt": 15}}, upper_bound=100 ... ) ... print("count1", count1) ... count2 = await acol.count_documents({}, upper_bound=10) ... print("count2", count2) ... >>> asyncio.run(do_count_docs(my_async_coll)) count0 20 count1 4 Traceback (most recent call last): ... ... astrapy.exceptions.TooManyDocumentsToCountException Note: Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller expectations. Moreover, indiscriminate usage of count operations for sizeable numbers of documents (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the number of documents it will count, and that an exception will be thrown by this method if this limit is encountered. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) cd_payload = {"countDocuments": {"filter": filter}} logger.info(f"countDocuments on '{self.name}'") cd_response = await self._converted_request( payload=cd_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished countDocuments on '{self.name}'") if "count" in cd_response.get("status", {}): count: int = cd_response["status"]["count"] if cd_response["status"].get("moreData", False): raise TooManyDocumentsToCountException( text=f"Document count exceeds {count}, the maximum allowed by the server", server_max_count_exceeded=True, ) else: if count > upper_bound: raise TooManyDocumentsToCountException( text="Document count exceeds required upper bound", server_max_count_exceeded=False, ) else: return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from countDocuments API command.", raw_response=cd_response, ) async def estimated_document_count( self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Query the API server for an estimate of the document count in the collection. Contrary to `count_documents`, this method has no filtering parameters.
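An illustrative sketch of the contrast (a hypothetical snippet; the figures are assumed, and the estimate need not coincide with the exact count):
>>> asyncio.run(my_async_coll.count_documents({}, upper_bound=1000))
320
>>> asyncio.run(my_async_coll.estimated_document_count())
300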
Args: general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a server-provided estimate count of the documents in the collection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.estimated_document_count()) 35700 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) ed_payload: dict[str, Any] = {"estimatedDocumentCount": {}} logger.info(f"estimatedDocumentCount on '{self.name}'") ed_response = await self._converted_request( payload=ed_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished estimatedDocumentCount on '{self.name}'") if "count" in ed_response.get("status", {}): count: int = ed_response["status"]["count"] return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from estimatedDocumentCount API command.", raw_response=ed_response, ) async def find_one_and_replace( self, filter: FilterType, replacement: DOC, *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = ReturnDocument.BEFORE, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document on the collection and replace it entirely with a new one, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. replacement: the new document to write into the collection. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. 
If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. return_document: a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on the database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before". general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: A document, either the one before the replace operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no replacement was inserted (depending on the `return_document` parameter). Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_find_one_and_replace( ... acol: AsyncCollection ... ) -> None: ... await acol.insert_one( ... {"_id": "rule1", "text": "all animals are equal"} ... ) ... result0 = await acol.find_one_and_replace( ... {"_id": "rule1"}, ... {"text": "some animals are more equal!"}, ... ) ... print("result0", result0) ... result1 = await acol.find_one_and_replace( ... {"text": "some animals are more equal!"}, ... {"text": "and the pigs are the rulers"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result1", result1) ... result2 = await acol.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma^2"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result2", result2) ... result3 = await acol.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma"}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... projection={"_id": False}, ... ) ... print("result3", result3) ...
>>> asyncio.run(do_find_one_and_replace(my_async_coll)) result0 {'_id': 'rule1', 'text': 'all animals are equal'} result1 {'_id': 'rule1', 'text': 'and the pigs are the rulers'} result2 None result3 {'text': 'F=ma'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "returnDocument": return_document, "upsert": upsert, } fo_payload = { "findOneAndReplace": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "replacement": replacement, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"findOneAndReplace on '{self.name}'") fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndReplace on '{self.name}'") if "document" in fo_response.get("data", {}): ret_document = fo_response.get("data", {}).get("document") if ret_document is None: return None else: return ret_document # type: ignore[no-any-return] else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_replace API command.", raw_response=fo_response, ) async def replace_one( self, filter: FilterType, replacement: DOC, *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Replace a single document on the collection with a new one, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. replacement: the new document to write into the collection. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the replace operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_replace_one(acol: AsyncCollection) -> None: ... await acol.insert_one({"Marco": "Polo"}) ... result0 = await acol.replace_one( ... {"Marco": {"$exists": True}}, ... {"Buda": "Pest"}, ... ) ... print("result0.update_info", result0.update_info) ... doc1 = await acol.find_one({"Buda": "Pest"}) ... print("doc1", doc1) ... result1 = await acol.replace_one( ... {"Mirco": {"$exists": True}}, ... 
{"Oh": "yeah?"}, ... ) ... print("result1.update_info", result1.update_info) ... result2 = await acol.replace_one( ... {"Mirco": {"$exists": True}}, ... {"Oh": "yeah?"}, ... upsert=True, ... ) ... print("result2.update_info", result2.update_info) ... >>> asyncio.run(do_replace_one(my_async_coll)) result0.update_info {'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1} doc1 {'_id': '6e669a5a-...', 'Buda': 'Pest'} result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0} result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '30e34e00-...'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "upsert": upsert, } fo_payload = { "findOneAndReplace": { k: v for k, v in { "filter": filter, "replacement": replacement, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"findOneAndReplace on '{self.name}'") fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndReplace on '{self.name}'") if "document" in fo_response.get("data", {}): fo_status = fo_response.get("status") or {} _update_info = _prepare_update_info([fo_status]) return CollectionUpdateResult( raw_results=[fo_response], update_info=_update_info, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_replace API command.", raw_response=fo_response, ) async def find_one_and_update( self, filter: FilterType, update: dict[str, Any], *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = ReturnDocument.BEFORE, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document on the collection and update it as requested, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. 
The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the updated one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. return_document: a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on the database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before". general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: A document (or a projection thereof, as required), either the one before the update operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no update was applied (depending on the `return_document` parameter). Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_find_one_and_update(acol: AsyncCollection) -> None: ... await acol.insert_one({"Marco": "Polo"}) ... result0 = await acol.find_one_and_update( ... {"Marco": {"$exists": True}}, ... {"$set": {"title": "Mr."}}, ... ) ... print("result0", result0) ... result1 = await acol.find_one_and_update( ... {"title": "Mr."}, ... {"$inc": {"rank": 3}}, ... projection=["title", "rank"], ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result1", result1) ... result2 = await acol.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result2", result2) ... result3 = await acol.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result3", result3) ...
>>> asyncio.run(do_find_one_and_update(my_async_coll)) result0 {'_id': 'f7c936d3-b0a0-45eb-a676-e2829662a57c', 'Marco': 'Polo'} result1 {'_id': 'f7c936d3-b0a0-45eb-a676-e2829662a57c', 'title': 'Mr.', 'rank': 3} result2 None result3 {'_id': 'db3d678d-14d4-4caa-82d2-d5fb77dab7ec', 'name': 'Johnny', 'rank': 0} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "returnDocument": return_document, "upsert": upsert, } fo_payload = { "findOneAndUpdate": { k: v for k, v in { "filter": filter, "update": update, "options": options, "sort": sort, "projection": normalize_optional_projection(projection), }.items() if v is not None } } logger.info(f"findOneAndUpdate on '{self.name}'") fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndUpdate on '{self.name}'") if "document" in fo_response.get("data", {}): ret_document = fo_response.get("data", {}).get("document") if ret_document is None: return None else: return ret_document # type: ignore[no-any-return] else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_update API command.", raw_response=fo_response, ) async def update_one( self, filter: FilterType, update: dict[str, Any], *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Update a single document on the collection as requested, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value"}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the updated one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the update operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_update_one(acol: AsyncCollection) -> None: ...
await acol.insert_one({"Marco": "Polo"}) ... result0 = await acol.update_one( ... {"Marco": {"$exists": True}}, ... {"$inc": {"rank": 3}}, ... ) ... print("result0.update_info", result0.update_info) ... result1 = await acol.update_one( ... {"Mirko": {"$exists": True}}, ... {"$inc": {"rank": 3}}, ... ) ... print("result1.update_info", result1.update_info) ... result2 = await acol.update_one( ... {"Mirko": {"$exists": True}}, ... {"$inc": {"rank": 3}}, ... upsert=True, ... ) ... print("result2.update_info", result2.update_info) ... >>> asyncio.run(do_update_one(my_async_coll)) result0.update_info {'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}) result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '75748092-...'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "upsert": upsert, } uo_payload = { "updateOne": { k: v for k, v in { "filter": filter, "update": update, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"updateOne on '{self.name}'") uo_response = await self._converted_request( payload=uo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished updateOne on '{self.name}'") if "status" in uo_response: uo_status = uo_response["status"] _update_info = _prepare_update_info([uo_status]) return CollectionUpdateResult( raw_results=[uo_response], update_info=_update_info, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from updateOne API command.", raw_response=uo_response, ) async def update_many( self, filter: FilterType, update: dict[str, Any], *, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Apply an update operation to all documents matching a condition, optionally inserting one documents in absence of matches. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the documents, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. upsert: this parameter controls the behavior in absence of matches. If True, a single new document (resulting from applying `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. 
Returns: a CollectionUpdateResult object summarizing the outcome of the update operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_update_many(acol: AsyncCollection) -> None: ... await acol.insert_many([{"c": "red"}, {"c": "green"}, {"c": "blue"}]) ... result0 = await acol.update_many( ... {"c": {"$ne": "green"}}, ... {"$set": {"nongreen": True}}, ... ) ... print("result0.update_info", result0.update_info) ... result1 = await acol.update_many( ... {"c": "orange"}, ... {"$set": {"is_also_fruit": True}}, ... ) ... print("result1.update_info", result1.update_info) ... result2 = await acol.update_many( ... {"c": "orange"}, ... {"$set": {"is_also_fruit": True}}, ... upsert=True, ... ) ... print("result2.update_info", result2.update_info) ... >>> asyncio.run(do_update_many(my_async_coll)) result0.update_info {'n': 2, 'updatedExisting': True, 'ok': 1.0, 'nModified': 2} result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0} result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '79ffd5a3-ab99-4dff-a2a5-4aaa0e59e854'} Note: Similarly to the case of `find` (see its docstring for more details), running this command while, at the same time, another process is inserting new documents which match the filter of the `update_many` can result in an unpredictable fraction of these documents being updated. In other words, it cannot be easily predicted whether a given newly-inserted document will be picked up by the update_many command or not. """ _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) api_options = { "upsert": upsert, } page_state_options: dict[str, str] = {} um_responses: list[dict[str, Any]] = [] um_statuses: list[dict[str, Any]] = [] must_proceed = True logger.info(f"starting update_many on '{self.name}'") timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) while must_proceed: options = {**api_options, **page_state_options} this_um_payload = { "updateMany": { k: v for k, v in { "filter": filter, "update": update, "options": options, }.items() if v is not None } } logger.info(f"updateMany on '{self.name}'") this_um_response = await self._converted_request( payload=this_um_payload, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished updateMany on '{self.name}'") this_um_status = this_um_response.get("status") or {} # # if errors, quit early if this_um_response.get("errors", []): partial_update_info = _prepare_update_info(um_statuses) partial_result = CollectionUpdateResult( raw_results=um_responses, update_info=partial_update_info, ) all_um_responses = um_responses + [this_um_response] raise CollectionUpdateManyException.from_responses( commands=[None for _ in all_um_responses], raw_responses=all_um_responses, partial_result=partial_result, ) else: if "status" not in this_um_response: raise UnexpectedDataAPIResponseException( text="Faulty response from update_many API command.", raw_response=this_um_response, ) um_responses.append(this_um_response) 
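# each page's response and status are accumulated here: update_many may
# paginate through several updateMany API calls, and the final update_info
# is assembled from all the page statuses collected along the way.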
um_statuses.append(this_um_status) next_page_state = this_um_status.get("nextPageState") if next_page_state is not None: must_proceed = True page_state_options = {"pageState": next_page_state} else: must_proceed = False page_state_options = {} update_info = _prepare_update_info(um_statuses) logger.info(f"finished update_many on '{self.name}'") return CollectionUpdateResult( raw_results=um_responses, update_info=update_info, ) async def find_one_and_delete( self, filter: FilterType, *, projection: ProjectionType | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document in the collection and delete it. The deleted document, however, is the return value of the method. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: Either the document (or a projection thereof, as requested), or None if no matches were found in the first place. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_find_one_and_delete(acol: AsyncCollection) -> None: ... await acol.insert_many( ... [ ... {"species": "swan", "class": "Aves"}, ... {"species": "frog", "class": "Amphibia"}, ... ], ... ) ... delete_result0 = await acol.find_one_and_delete( ... {"species": {"$ne": "frog"}}, ... projection=["species"], ... ) ... print("delete_result0", delete_result0) ... delete_result1 = await acol.find_one_and_delete( ... {"species": {"$ne": "frog"}}, ... ) ... print("delete_result1", delete_result1) ... 
>>> asyncio.run(do_find_one_and_delete(my_async_coll)) delete_result0 {'_id': 'f335cd0f-...', 'species': 'swan'} delete_result1 None """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _projection = normalize_optional_projection(projection) fo_payload = { "findOneAndDelete": { k: v for k, v in { "filter": filter, "sort": sort, "projection": _projection, }.items() if v is not None } } logger.info(f"findOneAndDelete on '{self.name}'") fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndDelete on '{self.name}'") if "document" in fo_response.get("data", {}): document = fo_response["data"]["document"] return document # type: ignore[no-any-return] else: deleted_count = fo_response.get("status", {}).get("deletedCount") if deleted_count == 0: return None else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_delete API command.", raw_response=fo_response, ) async def delete_one( self, filter: FilterType, *, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDeleteResult: """ Delete one document matching a provided filter. This method never deletes more than a single document, regardless of the number of matches to the provided filters. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionDeleteResult object summarizing the outcome of the delete operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.insert_many( ... [{"seq": 1}, {"seq": 0}, {"seq": 2}] ... )) CollectionInsertManyResult(...) >>> asyncio.run(my_async_coll.delete_one({"seq": 1})) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> asyncio.run(my_async_coll.distinct("seq")) [0, 2] >>> asyncio.run(my_async_coll.delete_one( ... {"seq": {"$exists": True}}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... 
)) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> asyncio.run(my_async_coll.distinct("seq")) [0] >>> asyncio.run(my_async_coll.delete_one({"seq": 2})) CollectionDeleteResult(raw_results=..., deleted_count=0) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) do_payload = { "deleteOne": { k: v for k, v in { "filter": filter, "sort": sort, }.items() if v is not None } } logger.info(f"deleteOne on '{self.name}'") do_response = await self._converted_request( payload=do_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteOne on '{self.name}'") if "deletedCount" in do_response.get("status", {}): deleted_count = do_response["status"]["deletedCount"] return CollectionDeleteResult( deleted_count=deleted_count, raw_results=[do_response], ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from delete_one API command.", raw_response=do_response, ) async def delete_many( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDeleteResult: """ Delete all documents matching a provided filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. Passing an empty filter, `{}`, completely erases all contents of the collection. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionDeleteResult object summarizing the outcome of the delete operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_delete_many(acol: AsyncCollection) -> None: ... await acol.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) ... delete_result0 = await acol.delete_many({"seq": {"$lte": 1}}) ... print("delete_result0.deleted_count", delete_result0.deleted_count) ... distinct1 = await acol.distinct("seq") ... print("distinct1", distinct1) ... delete_result2 = await acol.delete_many({"seq": {"$lte": 1}}) ... print("delete_result2.deleted_count", delete_result2.deleted_count) ... >>> asyncio.run(do_delete_many(my_async_coll)) delete_result0.deleted_count 2 distinct1 [2] delete_result2.deleted_count 0 Note: This operation is in general not atomic. Depending on the amount of matching documents, it can keep running (in a blocking way) for a macroscopic time. In that case, new documents that are meanwhile inserted (e.g. from another process/application) will be deleted during the execution of this method call until the collection is devoid of matches. An exception is the `filter={}` case, whereby the operation is atomic. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) dm_responses: list[dict[str, Any]] = [] deleted_count = 0 must_proceed = True timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) this_dm_payload = {"deleteMany": {"filter": filter}} logger.info(f"starting delete_many on '{self.name}'") while must_proceed: logger.info(f"deleteMany on '{self.name}'") this_dm_response = await self._converted_request( payload=this_dm_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished deleteMany on '{self.name}'") # if errors, quit early if this_dm_response.get("errors", []): partial_result = CollectionDeleteResult( deleted_count=deleted_count, raw_results=dm_responses, ) all_dm_responses = dm_responses + [this_dm_response] raise CollectionDeleteManyException.from_responses( commands=[None for _ in all_dm_responses], raw_responses=all_dm_responses, partial_result=partial_result, ) else: this_dc = this_dm_response.get("status", {}).get("deletedCount") if this_dc is None: raise UnexpectedDataAPIResponseException( text="Faulty response from delete_many API command.", raw_response=this_dm_response, ) dm_responses.append(this_dm_response) deleted_count += this_dc must_proceed = this_dm_response.get("status", {}).get("moreData", False) logger.info(f"finished delete_many on '{self.name}'") return CollectionDeleteResult( deleted_count=deleted_count, raw_results=dm_responses, ) async def drop( self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop the collection, i.e. delete it from the database along with all the documents it contains. Args: collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def drop_and_check(acol: AsyncCollection) -> None: ... doc0 = await acol.find_one({}) ... print("doc0", doc0) ... await acol.drop() ... doc1 = await acol.find_one({}) ... >>> asyncio.run(drop_and_check(my_async_coll)) doc0 {'_id': '...', 'z': -10} Traceback (most recent call last): ... ... astrapy.exceptions.DataAPIResponseException: Collection does not exist, ... Note: Use with caution. Note: Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense as the underlying actual collection is no more. It is responsibility of the developer to design a correct flow which avoids using a deceased collection any further. 
""" logger.info(f"dropping collection '{self.name}' (self)") await self.database.drop_collection( self.name, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished dropping collection '{self.name}' (self)") async def command( self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this collection with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.await(my_async_coll.command({"countDocuments": {}})) {'status': {'count': 123}} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _cmd_desc: str if body: _cmd_desc = ",".join(sorted(body.keys())) else: _cmd_desc = "(none)" logger.info(f"command={_cmd_desc} on '{self.name}'") command_result = await self._api_commander.async_request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on '{self.name}'") return command_result
Ancestors
- typing.Generic
Instance variables
var database : AsyncDatabase
-
a Database object, the database this collection belongs to.
Example
>>> my_async_coll.database.name 'the_db'
Expand source code
@property def database(self) -> AsyncDatabase: """ a Database object, the database this collection belongs to. Example: >>> my_async_coll.database.name 'the_db' """ return self._database
var full_name : str
-
The fully-qualified collection name within the database, in the form "keyspace.collection_name".
Example
>>> my_async_coll.full_name 'default_keyspace.my_v_collection'
Expand source code
@property def full_name(self) -> str: """ The fully-qualified collection name within the database, in the form "keyspace.collection_name". Example: >>> my_async_coll.full_name 'default_keyspace.my_v_collection' """ return f"{self.keyspace}.{self.name}"
var keyspace : str
-
The keyspace this collection is in.
Example
>>> my_async_coll.keyspace 'default_keyspace'
Expand source code
@property def keyspace(self) -> str: """ The keyspace this collection is in. Example: >>> my_async_coll.keyspace 'default_keyspace' """ _keyspace = self.database.keyspace if _keyspace is None: raise ValueError("The collection's DB is set with keyspace=None") return _keyspace
var name : str
-
The name of this collection.
Example
>>> my_async_coll.name 'my_v_collection'
Expand source code
@property def name(self) -> str: """ The name of this collection. Example: >>> my_async_coll.name 'my_v_collection' """ return self._name
Methods
async def command(self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Send a POST request to the Data API for this collection with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload.
Args
body
- a JSON-serializable dictionary, the payload of the request.
raise_api_errors
- if True, responses with a nonempty 'errors' field result in an astrapy exception being raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a dictionary with the response of the HTTP request.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.command({"countDocuments": {}})) {'status': {'count': 123}}
Expand source code
async def command( self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this collection with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.command({"countDocuments": {}})) {'status': {'count': 123}} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _cmd_desc: str if body: _cmd_desc = ",".join(sorted(body.keys())) else: _cmd_desc = "(none)" logger.info(f"command={_cmd_desc} on '{self.name}'") command_result = await self._api_commander.async_request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on '{self.name}'") return command_result
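Because `command` forwards the payload verbatim, it can be used to issue any Data API command, including ones not otherwise wrapped by this class. A hedged sketch follows: the "findOne" payload shape is standard Data API syntax, and `my_async_coll` is assumed to exist.

import asyncio

async def raw_find_one() -> None:
    response = await my_async_coll.command(
        {"findOne": {"filter": {"seq": 0}, "projection": {"_id": True}}}
    )
    # the raw response dictionary carries 'data'/'status'/'errors' as returned
    print(response.get("data", {}).get("document"))

# asyncio.run(raw_find_one())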
async def count_documents(self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> int
-
Count the documents in the collection matching the specified filter.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
upper_bound
- a required ceiling on the result of the count operation. If the actual number of documents exceeds this value, an exception will be raised. Furthermore, if the actual number of documents exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
the exact count of matching documents.
Example
>>> async def do_count_docs(acol: AsyncCollection) -> None: ... await acol.insert_many([{"seq": i} for i in range(20)]) ... count0 = await acol.count_documents({}, upper_bound=100) ... print("count0", count0) ... count1 = await acol.count_documents( ... {"seq":{"$gt": 15}}, upper_bound=100 ... ) ... print("count1", count1) ... count2 = await acol.count_documents({}, upper_bound=10) ... print("count2", count2) ... >>> asyncio.run(do_count_docs(my_async_coll)) count0 20 count1 4 Traceback (most recent call last): ... ... astrapy.exceptions.TooManyDocumentsToCountException
Note
Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller expectations. Moreover, indiscriminate usage of count operations for sizeable amounts of documents (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the amount of documents it will count, and that an exception will be thrown by this method if this limit is encountered.
Expand source code
async def count_documents( self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Count the documents in the collection matching the specified filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. upper_bound: a required ceiling on the result of the count operation. If the actual number of documents exceeds this value, an exception will be raised. Furthermore, if the actual number of documents exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: the exact count of matching documents. Example: >>> async def do_count_docs(acol: AsyncCollection) -> None: ... await acol.insert_many([{"seq": i} for i in range(20)]) ... count0 = await acol.count_documents({}, upper_bound=100) ... print("count0", count0) ... count1 = await acol.count_documents( ... {"seq":{"$gt": 15}}, upper_bound=100 ... ) ... print("count1", count1) ... count2 = await acol.count_documents({}, upper_bound=10) ... print("count2", count2) ... >>> asyncio.run(do_count_docs(my_async_coll)) count0 20 count1 4 Traceback (most recent call last): ... ... astrapy.exceptions.TooManyDocumentsToCountException Note: Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller expectations. Moreover, indiscriminate usage of count operations for sizeable amounts of documents (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the amount of documents it will count, and that an exception will be thrown by this method if this limit is encountered. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) cd_payload = {"countDocuments": {"filter": filter}} logger.info(f"countDocuments on '{self.name}'") cd_response = await self._converted_request( payload=cd_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished countDocuments on '{self.name}'") if "count" in cd_response.get("status", {}): count: int = cd_response["status"]["count"] if cd_response["status"].get("moreData", False): raise TooManyDocumentsToCountException( text=f"Document count exceeds {count}, the maximum allowed by the server", server_max_count_exceeded=True, ) else: if count > upper_bound: raise TooManyDocumentsToCountException( text="Document count exceeds required upper bound", server_max_count_exceeded=False, ) else: return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from countDocuments API command.", raw_response=cd_response, )
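As the Note above suggests, a reasonable `upper_bound` keeps counts cheap, and the resulting exception can be caught to fall back to an estimate. A sketch, assuming `my_async_coll` exists (the bound of 1000 is an arbitrary choice):

import asyncio

from astrapy.exceptions import TooManyDocumentsToCountException

async def safe_count() -> int:
    try:
        return await my_async_coll.count_documents({}, upper_bound=1000)
    except TooManyDocumentsToCountException:
        # too many to count exactly within the bound: use the server estimate
        return await my_async_coll.estimated_document_count()

# asyncio.run(safe_count())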
async def delete_many(self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionDeleteResult
-
Delete all documents matching a provided filter.
Args
filter
- a predicate expressed as a dictionary according to the
Data API filter syntax. Examples are:
{}
{"name": "John"}
{"price": {"$lt": 100}}
{"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}
See the Data API documentation for the full set of operators.
Passing an empty filter, `{}`, completely erases all contents of the collection.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead.
request_timeout_ms
- a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a CollectionDeleteResult object summarizing the outcome of the delete operation.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_delete_many(acol: AsyncCollection) -> None: ... await acol.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) ... delete_result0 = await acol.delete_many({"seq": {"$lte": 1}}) ... print("delete_result0.deleted_count", delete_result0.deleted_count) ... distinct1 = await acol.distinct("seq") ... print("distinct1", distinct1) ... delete_result2 = await acol.delete_many({"seq": {"$lte": 1}}) ... print("delete_result2.deleted_count", delete_result2.deleted_count) ... >>> asyncio.run(do_delete_many(my_async_coll)) delete_result0.deleted_count 2 distinct1 [2] delete_result2.deleted_count 0
Note
This operation is in general not atomic. Depending on the amount of matching documents, it can keep running (in a blocking way) for a macroscopic time. In that case, new documents that are meanwhile inserted (e.g. from another process/application) will be deleted during the execution of this method call until the collection is devoid of matches. An exception is the `filter={}` case, whereby the operation is atomic.
Expand source code
async def delete_many( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDeleteResult: """ Delete all documents matching a provided filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. Passing an empty filter, `{}`, completely erases all contents of the collection. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionDeleteResult object summarizing the outcome of the delete operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_delete_many(acol: AsyncCollection) -> None: ... await acol.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) ... delete_result0 = await acol.delete_many({"seq": {"$lte": 1}}) ... print("delete_result0.deleted_count", delete_result0.deleted_count) ... distinct1 = await acol.distinct("seq") ... print("distinct1", distinct1) ... delete_result2 = await acol.delete_many({"seq": {"$lte": 1}}) ... print("delete_result2.deleted_count", delete_result2.deleted_count) ... >>> asyncio.run(do_delete_many(my_async_coll)) delete_result0.deleted_count 2 distinct1 [2] delete_result2.deleted_count 0 Note: This operation is in general not atomic. Depending on the amount of matching documents, it can keep running (in a blocking way) for a macroscopic time. In that case, new documents that are meanwhile inserted (e.g. from another process/application) will be deleted during the execution of this method call until the collection is devoid of matches. An exception is the `filter={}` case, whereby the operation is atomic. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) dm_responses: list[dict[str, Any]] = [] deleted_count = 0 must_proceed = True timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) this_dm_payload = {"deleteMany": {"filter": filter}} logger.info(f"starting delete_many on '{self.name}'") while must_proceed: logger.info(f"deleteMany on '{self.name}'") this_dm_response = await self._converted_request( payload=this_dm_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished deleteMany on '{self.name}'") # if errors, quit early if this_dm_response.get("errors", []): partial_result = CollectionDeleteResult( deleted_count=deleted_count, raw_results=dm_responses, ) all_dm_responses = dm_responses + [this_dm_response] raise CollectionDeleteManyException.from_responses( commands=[None for _ in all_dm_responses], raw_responses=all_dm_responses, partial_result=partial_result, ) else: this_dc = this_dm_response.get("status", {}).get("deletedCount") if this_dc is None: raise UnexpectedDataAPIResponseException( text="Faulty response from delete_many API command.", raw_response=this_dm_response, ) dm_responses.append(this_dm_response) deleted_count += this_dc must_proceed = this_dm_response.get("status", {}).get("moreData", False) logger.info(f"finished delete_many on '{self.name}'") return CollectionDeleteResult( deleted_count=deleted_count, raw_results=dm_responses, )
async def delete_one(self, filter: FilterType, *, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionDeleteResult
-
Delete one document matching a provided filter. This method never deletes more than a single document, regardless of the number of matches to the provided filters.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the deleted one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a CollectionDeleteResult object summarizing the outcome of the delete operation.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.insert_many( ... [{"seq": 1}, {"seq": 0}, {"seq": 2}] ... )) CollectionInsertManyResult(...) >>> asyncio.run(my_async_coll.delete_one({"seq": 1})) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> asyncio.run(my_async_coll.distinct("seq")) [0, 2] >>> asyncio.run(my_async_coll.delete_one( ... {"seq": {"$exists": True}}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... )) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> asyncio.run(my_async_coll.distinct("seq")) [0] >>> asyncio.run(my_async_coll.delete_one({"seq": 2})) CollectionDeleteResult(raw_results=..., deleted_count=0)
Expand source code
async def delete_one( self, filter: FilterType, *, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDeleteResult: """ Delete one document matching a provided filter. This method never deletes more than a single document, regardless of the number of matches to the provided filters. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionDeleteResult object summarizing the outcome of the delete operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.insert_many( ... [{"seq": 1}, {"seq": 0}, {"seq": 2}] ... )) CollectionInsertManyResult(...) >>> asyncio.run(my_async_coll.delete_one({"seq": 1})) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> asyncio.run(my_async_coll.distinct("seq")) [0, 2] >>> asyncio.run(my_async_coll.delete_one( ... {"seq": {"$exists": True}}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... )) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> asyncio.run(my_async_coll.distinct("seq")) [0] >>> asyncio.run(my_async_coll.delete_one({"seq": 2})) CollectionDeleteResult(raw_results=..., deleted_count=0) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) do_payload = { "deleteOne": { k: v for k, v in { "filter": filter, "sort": sort, }.items() if v is not None } } logger.info(f"deleteOne on '{self.name}'") do_response = await self._converted_request( payload=do_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteOne on '{self.name}'") if "deletedCount" in do_response.get("status", {}): deleted_count = do_response["status"]["deletedCount"] return CollectionDeleteResult( deleted_count=deleted_count, raw_results=[do_response], ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from delete_one API command.", raw_response=do_response, )
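Since at most one document is ever removed, `deleted_count` doubles as a found/not-found flag, and a `sort` makes the choice of document deterministic. A sketch; the collection and field names are assumptions:

import asyncio

from astrapy.constants import SortMode

async def pop_highest_priority() -> bool:
    result = await my_async_coll.delete_one(
        {"status": "pending"},
        sort={"priority": SortMode.DESCENDING},
    )
    # True if a pending document existed and was removed
    return result.deleted_count == 1

# asyncio.run(pop_highest_priority())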
async def distinct(self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[typing.Any]
-
Return a list of the unique values of `key` across the documents in the collection that match the provided filter.
Args
key
- the name of the field whose value is inspected across documents.
Keys can use dot-notation to descend to deeper document levels.
Example of acceptable `key` values: "field" "field.subfield" "field.3" "field.3.subfield" If lists are encountered and no numeric index is specified, all items in the list are visited.
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole
requested operation (which may involve multiple API requests).
This method, being based on `find` (see), may entail successive HTTP API requests, depending on the amount of involved documents.
request_timeout_ms
- a timeout, in milliseconds, for each API request.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a list of all different values for `key` found across the documents that match the filter. The result list has no repeated items.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> async def run_distinct(acol: AsyncCollection) -> None: ... await acol.insert_many( ... [ ... {"name": "Marco", "food": ["apple", "orange"], "city": "Helsinki"}, ... {"name": "Emma", "food": {"likes_fruit": True, "allergies": []}}, ... ] ... ) ... distinct0 = await acol.distinct("name") ... print("distinct('name')", distinct0) ... distinct1 = await acol.distinct("city") ... print("distinct('city')", distinct1) ... distinct2 = await acol.distinct("food") ... print("distinct('food')", distinct2) ... distinct3 = await acol.distinct("food.1") ... print("distinct('food.1')", distinct3) ... distinct4 = await acol.distinct("food.allergies") ... print("distinct('food.allergies')", distinct4) ... distinct5 = await acol.distinct("food.likes_fruit") ... print("distinct('food.likes_fruit')", distinct5) ... >>> asyncio.run(run_distinct(my_async_coll)) distinct('name') ['Emma', 'Marco'] distinct('city') ['Helsinki'] distinct('food') [{'likes_fruit': True, 'allergies': []}, 'apple', 'orange'] distinct('food.1') ['orange'] distinct('food.allergies') [] distinct('food.likes_fruit') [True]
Note
It must be kept in mind that `distinct` is a client-side operation, which effectively browses all required documents using the logic of the `find` method and collects the unique values found for `key`. As such, there may be performance, latency and ultimately billing implications if the amount of matching documents is large.
Note
For details on the behaviour of "distinct" in conjunction with real-time changes in the collection contents, see the Note of the `find` command.
Expand source code
async def distinct( self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[Any]: """ Return a list of the unique values of `key` across the documents in the collection that match the provided filter. Args: key: the name of the field whose value is inspected across documents. Keys can use dot-notation to descend to deeper document levels. Example of acceptable `key` values: "field" "field.subfield" "field.3" "field.3.subfield" If lists are encountered and no numeric index is specified, all items in the list are visited. filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method, being based on `find` (see) may entail successive HTTP API requests, depending on the amount of involved documents. request_timeout_ms: a timeout, in milliseconds, for each API request. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a list of all different values for `key` found across the documents that match the filter. The result list has no repeated items. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def run_distinct(acol: AsyncCollection) -> None: ... await acol.insert_many( ... [ ... {"name": "Marco", "food": ["apple", "orange"], "city": "Helsinki"}, ... {"name": "Emma", "food": {"likes_fruit": True, "allergies": []}}, ... ] ... ) ... distinct0 = await acol.distinct("name") ... print("distinct('name')", distinct0) ... distinct1 = await acol.distinct("city") ... print("distinct('city')", distinct1) ... distinct2 = await acol.distinct("food") ... print("distinct('food')", distinct2) ... distinct3 = await acol.distinct("food.1") ... print("distinct('food.1')", distinct3) ... distinct4 = await acol.distinct("food.allergies") ... print("distinct('food.allergies')", distinct4) ... distinct5 = await acol.distinct("food.likes_fruit") ... print("distinct('food.likes_fruit')", distinct5) ... >>> asyncio.run(run_distinct(my_async_coll)) distinct('name') ['Emma', 'Marco'] distinct('city') ['Helsinki'] distinct('food') [{'likes_fruit': True, 'allergies': []}, 'apple', 'orange'] distinct('food.1') ['orange'] distinct('food.allergies') [] distinct('food.likes_fruit') [True] Note: It must be kept in mind that `distinct` is a client-side operation, which effectively browses all required documents using the logic of the `find` method and collects the unique values found for `key`. As such, there may be performance, latency and ultimately billing implications if the amount of matching documents is large. Note: For details on the behaviour of "distinct" in conjunction with real-time changes in the collection contents, see the Note of the `find` command. 
""" # lazy-import here to avoid circular import issues from astrapy.cursors import AsyncCollectionFindCursor _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) # preparing cursor: _extractor = _create_document_key_extractor(key) _key = _reduce_distinct_key_to_safe(key) if _key == "": raise ValueError( "The 'key' parameter for distinct cannot be empty " "or start with a list index." ) # relaxing the type hint (limited to within this method body) f_cursor: AsyncCollectionFindCursor[dict[str, Any], dict[str, Any]] = ( AsyncCollectionFindCursor( collection=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=_general_method_timeout_ms, request_timeout_label=_rt_label, overall_timeout_label=_gmt_label, ) # type: ignore[assignment] .filter(filter) .project({_key: True}) ) # consuming it: _item_hashes = set() distinct_items: list[Any] = [] logger.info(f"running distinct() on '{self.name}'") async for document in f_cursor: for item in _extractor(document): _item_hash = _hash_document( item, options=self.api_options.serdes_options ) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) logger.info(f"finished running distinct() on '{self.name}'") return distinct_items
async def drop(self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop the collection, i.e. delete it from the database along with all the documents it contains.
Args
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `collection_admin_timeout_ms`.
timeout_ms
- an alias for `collection_admin_timeout_ms`.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> async def drop_and_check(acol: AsyncCollection) -> None: ... doc0 = await acol.find_one({}) ... print("doc0", doc0) ... await acol.drop() ... doc1 = await acol.find_one({}) ... >>> asyncio.run(drop_and_check(my_async_coll)) doc0 {'_id': '...', 'z': -10} Traceback (most recent call last): ... ... astrapy.exceptions.DataAPIResponseException: Collection does not exist, ...
Note
Use with caution.
Note
Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense as the underlying actual collection is no more. It is the responsibility of the developer to design a correct flow which avoids using a deceased collection any further.
Expand source code
async def drop( self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop the collection, i.e. delete it from the database along with all the documents it contains. Args: collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def drop_and_check(acol: AsyncCollection) -> None: ... doc0 = await acol.find_one({}) ... print("doc0", doc0) ... await acol.drop() ... doc1 = await acol.find_one({}) ... >>> asyncio.run(drop_and_check(my_async_coll)) doc0 {'_id': '...', 'z': -10} Traceback (most recent call last): ... ... astrapy.exceptions.DataAPIResponseException: Collection does not exist, ... Note: Use with caution. Note: Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense as the underlying actual collection is no more. It is the responsibility of the developer to design a correct flow which avoids using a deceased collection any further. """ logger.info(f"dropping collection '{self.name}' (self)") await self.database.drop_collection( self.name, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished dropping collection '{self.name}' (self)")
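In line with the cautionary Notes above, one may want to confirm the outcome through the parent database before discarding references. A sketch, assuming `my_async_coll` exists and that its database exposes `list_collection_names` (as AsyncDatabase does):

import asyncio

async def drop_and_verify() -> None:
    name = my_async_coll.name
    await my_async_coll.drop()
    remaining = await my_async_coll.database.list_collection_names()
    assert name not in remaining  # the dropped name must be gone

# asyncio.run(drop_and_verify())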
async def estimated_document_count(self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> int
-
Query the API server for an estimate of the document count in the collection.
Contrary to `count_documents`, this method has no filtering parameters.
Args
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a server-provided estimate count of the documents in the collection.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.estimated_document_count()) 35700
Expand source code
async def estimated_document_count( self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Query the API server for an estimate of the document count in the collection. Contrary to `count_documents`, this method has no filtering parameters. Args: general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a server-provided estimate count of the documents in the collection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.estimated_document_count()) 35700 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) ed_payload: dict[str, Any] = {"estimatedDocumentCount": {}} logger.info(f"estimatedDocumentCount on '{self.name}'") ed_response = await self._converted_request( payload=ed_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished estimatedDocumentCount on '{self.name}'") if "count" in ed_response.get("status", {}): count: int = ed_response["status"]["count"] return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from estimatedDocumentCount API command.", raw_response=ed_response, )
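A common pattern is to consult the cheap server-side estimate first and only pay for an exact, bounded count when the collection is small. A sketch; the threshold and names are illustrative only:

import asyncio

async def report_size() -> str:
    estimate = await my_async_coll.estimated_document_count()
    if estimate < 500:
        exact = await my_async_coll.count_documents({}, upper_bound=1000)
        return f"exactly {exact} documents"
    return f"approximately {estimate} documents"

# asyncio.run(report_size())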
def find(self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: type[DOC2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AsyncCollectionFindCursor[DOC, DOC2]
-
Find documents on the collection, matching a certain provided filter.
The method returns a Cursor that can then be iterated over. Depending on the method call pattern, the iteration over all documents can reflect collection mutations occurred since the `find` method was called, or not. In cases where the cursor reflects mutations in real-time, it will iterate over cursors in an approximate way (i.e. exhibiting occasional skipped or duplicate documents). This happens when making use of the `sort` option in a non-vector-search manner.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
projection
- it controls which parts of the document are returned.
It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
document_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting cursor is implicitly an `AsyncCollectionFindCursor[DOC, DOC]`, i.e. maintains the same type for the items it returns as that for the documents in the collection. Strictly typed code may want to specify this parameter especially when a projection is given.
skip
- with this integer parameter, what would be the first `skip` documents returned by the query are discarded, and the results start from the (skip+1)-th document. This parameter can be used only in conjunction with an explicit `sort` criterion of the ascending/descending type (i.e. it cannot be used when not sorting, nor with vector-based ANN search).
limit
- this (integer) parameter sets a limit over how many documents are returned. Once `limit` is reached (or the cursor is exhausted for lack of matching documents), nothing more is returned.
include_similarity
- a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": ...}.
include_sort_vector
- a boolean to request the search query vector. If set to True (and if the invocation is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search.
sort
- with this dictionary parameter one can control the order the documents are returned. See the Note about sorting, as well as the one about upper bounds, for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
request_timeout_ms
- a timeout, in milliseconds, for each single one of the underlying HTTP requests used to fetch documents as the cursor is iterated over. If not passed, the collection-level setting is used instead.
timeout_ms
- an alias for `request_timeout_ms`.
Returns
an AsyncCursor object representing iterations over the matching documents (see the AsyncCursor object for how to use it. The simplest thing is to run a for loop: `for document in collection.find(...):`).
Examples
>>> # NOTE: may require slight adaptation to an async context. >>> >>> async def run_finds(acol: AsyncCollection) -> None: ... filter = {"seq": {"$exists": True}} ... print("find results 1:") ... async for doc in acol.find(filter, projection={"seq": True}, limit=5): ... print(doc["seq"]) ... async_cursor1 = acol.find( ... {}, ... limit=4, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) ... ids = [doc["_id"] async for doc in async_cursor1] ... print("find results 2:", ids) ... async_cursor2 = acol.find({}, limit=3) ... seqs = await async_cursor2.distinct("seq") ... print("distinct results 3:", seqs) ... >>> asyncio.run(run_finds(my_async_coll)) find results 1: 48 35 7 11 13 find results 2: ['d656cd9d-...', '479c7ce8-...', '96dc87fd-...', '83f0a21f-...'] distinct results 3: [48, 35, 7]
>>> async def run_vector_finds(acol: AsyncCollection) -> list[str]: ... await acol.insert_many([ ... {"tag": "A", "$vector": [4, 5]}, ... {"tag": "B", "$vector": [3, 4]}, ... {"tag": "C", "$vector": [3, 2]}, ... {"tag": "D", "$vector": [4, 1]}, ... {"tag": "E", "$vector": [2, 5]}, ... ]) ... ann_tags = [ ... document["tag"] ... async for document in acol.find( ... {}, ... sort={"$vector": [3, 3]}, ... limit=3, ... ) ... ] ... return ann_tags ... >>> asyncio.run(run_vector_finds(my_async_coll)) ['A', 'B', 'C'] >>> # (assuming the collection has metric VectorMetric.COSINE)
>>> async_cursor = my_async_coll.find( ... sort={"$vector": [3, 3]}, ... limit=3, ... include_sort_vector=True, ... ) >>> asyncio.run(async_cursor.get_sort_vector()) [3.0, 3.0] >>> asyncio.run(async_cursor.__anext__()) {'_id': 'b13ce177-738e-47ec-bce1-77738ee7ec93', 'tag': 'A'} >>> asyncio.run(async_cursor.get_sort_vector()) [3.0, 3.0]
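The sort shapes discussed in the Notes below can be spelled out as plain dictionaries. A sketch: `my_async_coll` is assumed to exist, and the vector length must match the collection's dimension.

from collections import OrderedDict

from astrapy.constants import SortMode

no_order = {}                                   # no particular order
by_field = {"field": SortMode.ASCENDING}        # ascending single-field sort
compound = OrderedDict(                         # "field", then "subfield"
    [("field", SortMode.ASCENDING), ("subfield", SortMode.ASCENDING)]
)
ann_search = {"$vector": [0.4, 0.15, -0.5]}     # vector-similarity (ANN) sort

cursor = my_async_coll.find({}, sort=by_field, limit=20)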
Note
The following are example values for the `sort` parameter.
When no particular order is required:
    sort={}
When sorting by a certain value in ascending/descending order:
    sort={"field": SortMode.ASCENDING}
    sort={"field": SortMode.DESCENDING}
When sorting first by "field" and then by "subfield" (while modern Python versions preserve the order of dictionaries, it is suggested for clarity to employ a `collections.OrderedDict` in these cases):
    sort={
        "field": SortMode.ASCENDING,
        "subfield": SortMode.ASCENDING,
    }
When running a vector similarity (ANN) search:
    sort={"$vector": [0.4, 0.15, -0.5]}
Note
Some combinations of arguments impose an implicit upper bound on the number of documents that are returned by the Data API. More specifically:
(a) Vector ANN searches cannot return more than a number of documents that at the time of writing is set to 1000 items.
(b) When using a sort criterion of the ascending/descending type, the Data API will return a smaller number of documents, set to 20 at the time of writing, and stop there. The returned documents are the top results across the whole collection according to the requested criterion.
These provisions should be kept in mind even when subsequently running a command such as `.distinct()` on a cursor.
Note
When not specifying sorting criteria at all (by vector or otherwise), the cursor can scroll through an arbitrary number of documents as the Data API and the client periodically exchange new chunks of documents. It should be noted that the behavior of the cursor in the case documents have been added/removed after the `find` was started depends on database internals, and it is not guaranteed, nor excluded, that such "real-time" changes in the data would be picked up by the cursor.
Expand source code
def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: type[DOC2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncCollectionFindCursor[DOC, DOC2]: """ Find documents on the collection, matching a certain provided filter. The method returns a Cursor that can then be iterated over. Depending on the method call pattern, the iteration over all documents can reflect collection mutations occurred since the `find` method was called, or not. In cases where the cursor reflects mutations in real-time, it will iterate over cursors in an approximate way (i.e. exhibiting occasional skipped or duplicate documents). This happens when making use of the `sort` option in a non-vector-search manner. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting cursor is implicitly an `AsyncCollectionFindCursor[DOC, DOC]`, i.e. maintains the same type for the items it returns as that for the documents in the table. Strictly typed code may want to specify this parameter especially when a projection is given. skip: with this integer parameter, what would be the first `skip` documents returned by the query are discarded, and the results start from the (skip+1)-th document. This parameter can be used only in conjunction with an explicit `sort` criterion of the ascending/descending type (i.e. it cannot be used when not sorting, nor with vector-based ANN search). limit: this (integer) parameter sets a limit over how many documents are returned. Once `limit` is reached (or the cursor is exhausted for lack of matching documents), nothing more is returned. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": ...}. include_sort_vector: a boolean to request the search query vector. 
If set to True (and if the invocation is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search. sort: with this dictionary parameter one can control the order the documents are returned. See the Note about sorting, as well as the one about upper bounds, for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. request_timeout_ms: a timeout, in milliseconds, for each single one of the underlying HTTP requests used to fetch documents as the cursor is iterated over. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `request_timeout_ms`. Returns: an AsyncCursor object representing iterations over the matching documents (see the AsyncCursor object for how to use it. The simplest thing is to run a for loop: `for document in collection.sort(...):`). Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def run_finds(acol: AsyncCollection) -> None: ... filter = {"seq": {"$exists": True}} ... print("find results 1:") ... async for doc in acol.find(filter, projection={"seq": True}, limit=5): ... print(doc["seq"]) ... async_cursor1 = acol.find( ... {}, ... limit=4, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) ... ids = [doc["_id"] async for doc in async_cursor1] ... print("find results 2:", ids) ... async_cursor2 = acol.find({}, limit=3) ... seqs = await async_cursor2.distinct("seq") ... print("distinct results 3:", seqs) ... >>> asyncio.run(run_finds(my_async_coll)) find results 1: 48 35 7 11 13 find results 2: ['d656cd9d-...', '479c7ce8-...', '96dc87fd-...', '83f0a21f-...'] distinct results 3: [48, 35, 7] >>> async def run_vector_finds(acol: AsyncCollection) -> None: ... await acol.insert_many([ ... {"tag": "A", "$vector": [4, 5]}, ... {"tag": "B", "$vector": [3, 4]}, ... {"tag": "C", "$vector": [3, 2]}, ... {"tag": "D", "$vector": [4, 1]}, ... {"tag": "E", "$vector": [2, 5]}, ... ]) ... ann_tags = [ ... document["tag"] ... async for document in acol.find( ... {}, ... sort={"$vector": [3, 3]}, ... limit=3, ... ) ... ] ... return ann_tags ... >>> asyncio.run(run_vector_finds(my_async_coll)) ['A', 'B', 'C'] >>> # (assuming the collection has metric VectorMetric.COSINE) >>> async_cursor = my_async_coll.find( ... sort={"$vector": [3, 3]}, ... limit=3, ... include_sort_vector=True, ... ) >>> asyncio.run(async_cursor.get_sort_vector()) [3.0, 3.0] >>> asyncio.run(async_cursor.__anext__()) {'_id': 'b13ce177-738e-47ec-bce1-77738ee7ec93', 'tag': 'A'} >>> asyncio.run(async_cursor.get_sort_vector()) [3.0, 3.0] Note: The following are example values for the `sort` parameter. When no particular order is required: sort={} When sorting by a certain value in ascending/descending order: sort={"field": SortMode.ASCENDING} sort={"field": SortMode.DESCENDING} When sorting first by "field" and then by "subfield" (while modern Python versions preserve the order of dictionaries, it is suggested for clarity to employ a `collections.OrderedDict` in these cases): sort={ "field": SortMode.ASCENDING, "subfield": SortMode.ASCENDING, } When running a vector similarity (ANN) search: sort={"$vector": [0.4, 0.15, -0.5]} Note: Some combinations of arguments impose an implicit upper bound on the number of documents that are returned by the Data API. More specifically: (a) Vector ANN searches cannot return more than a number of documents that at the time of writing is set to 1000 items. 
(b) When using a sort criterion of the ascending/descending type, the Data API will return a smaller number of documents, set to 20 at the time of writing, and stop there. The returned documents are the top results across the whole collection according to the requested criterion. These provisions should be kept in mind even when subsequently running a command such as `.distinct()` on a cursor. Note: When not specifying sorting criteria at all (by vector or otherwise), the cursor can scroll through an arbitrary number of documents as the Data API and the client periodically exchange new chunks of documents. It should be noted that the behavior of the cursor in the case documents have been added/removed after the `find` was started depends on database internals and it is not guaranteed, nor excluded, that such "real-time" changes in the data would be picked up by the cursor. """ # lazy-import here to avoid circular import issues from astrapy.cursors import AsyncCollectionFindCursor _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) return ( AsyncCollectionFindCursor( collection=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=None, request_timeout_label=_rt_label, ) .filter(filter) .project(projection) .skip(skip) .limit(limit) .sort(sort) .include_similarity(include_similarity) .include_sort_vector(include_sort_vector) )
async def find_one(self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> Optional[~DOC]
-
Run a search, returning the first document in the collection that matches provided filters, if any is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are:
    {}
    {"name": "John"}
    {"price": {"$lt": 100}}
    {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}
See the Data API documentation for the full set of operators.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
include_similarity
- a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": …}.
sort
- with this dictionary parameter one can control the order in which the documents are returned. See the Note about sorting for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a dictionary expressing the required document, otherwise None.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def demo_find_one(acol: AsyncCollection) -> None:
...     print("Count:", await acol.count_documents({}, upper_bound=100))
...     result0 = await acol.find_one({})
...     print("result0", result0)
...     result1 = await acol.find_one({"seq": 10})
...     print("result1", result1)
...     result2 = await acol.find_one({"seq": 1011})
...     print("result2", result2)
...     result3 = await acol.find_one({}, projection={"seq": False})
...     print("result3", result3)
...     result4 = await acol.find_one(
...         {},
...         sort={"seq": astrapy.constants.SortMode.DESCENDING},
...     )
...     print("result4", result4)
...
>>> asyncio.run(demo_find_one(my_async_coll))
Count: 50
result0 {'_id': '479c7ce8-...', 'seq': 48}
result1 {'_id': '93e992c4-...', 'seq': 10}
result2 None
result3 {'_id': '479c7ce8-...'}
result4 {'_id': 'd656cd9d-...', 'seq': 49}
>>> asyncio.run(my_async_coll.find_one(
...     {},
...     sort={"$vector": [1, 0]},
...     projection={"*": True},
... ))
{'_id': '...', 'tag': 'D', '$vector': [4.0, 1.0]}
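A further hypothetical sketch (illustrative output), requesting the similarity score alongside a vector-sorted `find_one`:

>>> asyncio.run(my_async_coll.find_one(
...     {},
...     sort={"$vector": [1, 0]},
...     include_similarity=True,
... ))
{'_id': '...', 'tag': 'D', '$similarity': 0.98...}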
Note
See the `find` method for more details on the accepted parameters (whereas `skip` and `limit` are not valid parameters for `find_one`).
Expand source code
async def find_one( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Run a search, returning the first document in the collection that matches provided filters, if any is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": ...}. sort: with this dictionary parameter one can control the order the documents are returned. See the Note about sorting for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary expressing the required document, otherwise None. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def demo_find_one(acol: AsyncCollection) -> None: .... print("Count:", await acol.count_documents({}, upper_bound=100)) ... result0 = await acol.find_one({}) ... print("result0", result0) ... result1 = await acol.find_one({"seq": 10}) ... print("result1", result1) ... result2 = await acol.find_one({"seq": 1011}) ... print("result2", result2) ... result3 = await acol.find_one({}, projection={"seq": False}) ... print("result3", result3) ... result4 = await acol.find_one( ... {}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) ... print("result4", result4) ... >>> >>> asyncio.run(demo_find_one(my_async_coll)) Count: 50 result0 {'_id': '479c7ce8-...', 'seq': 48} result1 {'_id': '93e992c4-...', 'seq': 10} result2 None result3 {'_id': '479c7ce8-...'} result4 {'_id': 'd656cd9d-...', 'seq': 49} >>> asyncio.run(my_async_coll.find_one( ... {}, ... 
sort={"$vector": [1, 0]}, ... projection={"*": True}, ... )) {'_id': '...', 'tag': 'D', '$vector': [4.0, 1.0]} Note: See the `find` method for more details on the accepted parameters (whereas `skip` and `limit` are not valid parameters for `find_one`). """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) fo_options = ( None if include_similarity is None else {"includeSimilarity": include_similarity} ) fo_payload = { "findOne": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "options": fo_options, "sort": sort, }.items() if v is not None } } fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) if "document" not in (fo_response.get("data") or {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findOne API command.", raw_response=fo_response, ) doc_response = fo_response["data"]["document"] if doc_response is None: return None return fo_response["data"]["document"] # type: ignore[no-any-return]
async def find_one_and_delete(self, filter: FilterType, *, projection: ProjectionType | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> Optional[~DOC]
-
Find a document in the collection and delete it. The deleted document, however, is the return value of the method.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are:
    {}
    {"name": "John"}
    {"price": {"$lt": 100}}
    {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}
See the Data API documentation for the full set of operators.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the deleted one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
Either the document (or a projection thereof, as requested), or None if no matches were found in the first place.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def do_find_one_and_delete(acol: AsyncCollection) -> None:
...     await acol.insert_many(
...         [
...             {"species": "swan", "class": "Aves"},
...             {"species": "frog", "class": "Amphibia"},
...         ],
...     )
...     delete_result0 = await acol.find_one_and_delete(
...         {"species": {"$ne": "frog"}},
...         projection=["species"],
...     )
...     print("delete_result0", delete_result0)
...     delete_result1 = await acol.find_one_and_delete(
...         {"species": {"$ne": "frog"}},
...     )
...     print("delete_result1", delete_result1)
...
>>> asyncio.run(do_find_one_and_delete(my_async_coll))
delete_result0 {'_id': 'f335cd0f-...', 'species': 'swan'}
delete_result1 None
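Since `sort` determines which of several matching documents is the one deleted, a hypothetical sketch (illustrative documents and output) could pick the alphabetically-first species:

>>> asyncio.run(my_async_coll.find_one_and_delete(
...     {"species": {"$exists": True}},
...     sort={"species": astrapy.constants.SortMode.ASCENDING},
... ))
{'_id': '...', 'species': 'frog', 'class': 'Amphibia'}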
Expand source code
async def find_one_and_delete( self, filter: FilterType, *, projection: ProjectionType | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document in the collection and delete it. The deleted document, however, is the return value of the method. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: Either the document (or a projection thereof, as requested), or None if no matches were found in the first place. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_find_one_and_delete(acol: AsyncCollection) -> None: ... await acol.insert_many( ... [ ... {"species": "swan", "class": "Aves"}, ... {"species": "frog", "class": "Amphibia"}, ... ], ... ) ... delete_result0 = await acol.find_one_and_delete( ... {"species": {"$ne": "frog"}}, ... projection=["species"], ... ) ... print("delete_result0", delete_result0) ... delete_result1 = await acol.find_one_and_delete( ... {"species": {"$ne": "frog"}}, ... ) ... print("delete_result1", delete_result1) ... 
>>> asyncio.run(do_find_one_and_delete(my_async_coll)) delete_result0 {'_id': 'f335cd0f-...', 'species': 'swan'} delete_result1 None """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _projection = normalize_optional_projection(projection) fo_payload = { "findOneAndDelete": { k: v for k, v in { "filter": filter, "sort": sort, "projection": _projection, }.items() if v is not None } } logger.info(f"findOneAndDelete on '{self.name}'") fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndDelete on '{self.name}'") if "document" in fo_response.get("data", {}): document = fo_response["data"]["document"] return document # type: ignore[no-any-return] else: deleted_count = fo_response.get("status", {}).get("deletedCount") if deleted_count == 0: return None else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_delete API command.", raw_response=fo_response, )
async def find_one_and_replace(self, filter: FilterType, replacement: DOC, *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = 'before', general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> Optional[~DOC]
-
Find a document on the collection and replace it entirely with a new one, optionally inserting a new one if no match is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are:
    {}
    {"name": "John"}
    {"price": {"$lt": 100}}
    {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}
See the Data API documentation for the full set of operators.
replacement
- the new document to write into the collection.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
upsert
- this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
return_document
- a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before".
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
A document, either the one before the replace operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no replacement was inserted (depending on the `return_document` parameter).
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def do_find_one_and_replace(
...     acol: AsyncCollection
... ) -> None:
...     await acol.insert_one(
...         {"_id": "rule1", "text": "all animals are equal"}
...     )
...     result0 = await acol.find_one_and_replace(
...         {"_id": "rule1"},
...         {"text": "some animals are more equal!"},
...     )
...     print("result0", result0)
...     result1 = await acol.find_one_and_replace(
...         {"text": "some animals are more equal!"},
...         {"text": "and the pigs are the rulers"},
...         return_document=astrapy.constants.ReturnDocument.AFTER,
...     )
...     print("result1", result1)
...     result2 = await acol.find_one_and_replace(
...         {"_id": "rule2"},
...         {"text": "F=ma^2"},
...         return_document=astrapy.constants.ReturnDocument.AFTER,
...     )
...     print("result2", result2)
...     result3 = await acol.find_one_and_replace(
...         {"_id": "rule2"},
...         {"text": "F=ma"},
...         upsert=True,
...         return_document=astrapy.constants.ReturnDocument.AFTER,
...         projection={"_id": False},
...     )
...     print("result3", result3)
...
>>> asyncio.run(do_find_one_and_replace(my_async_coll))
result0 {'_id': 'rule1', 'text': 'all animals are equal'}
result1 {'_id': 'rule1', 'text': 'and the pigs are the rulers'}
result2 None
result3 {'text': 'F=ma'}
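When several documents match the filter, a `sort` clause controls which one undergoes the replacement; a brief hypothetical sketch (illustrative documents and output):

>>> asyncio.run(my_async_coll.find_one_and_replace(
...     {"score": {"$exists": True}},
...     {"score": 0, "note": "lowest score reset"},
...     sort={"score": astrapy.constants.SortMode.ASCENDING},
... ))
{'_id': '...', 'score': 11}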
Expand source code
async def find_one_and_replace( self, filter: FilterType, replacement: DOC, *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = ReturnDocument.BEFORE, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document on the collection and replace it entirely with a new one, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. replacement: the new document to write into the collection. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. return_document: a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before". general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: A document, either the one before the replace operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no replacement was inserted (depending on the `return_document` parameter). Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_find_one_and_replace( ... acol: AsyncCollection ... ) -> None: ... await acol.insert_one( ... {"_id": "rule1", "text": "all animals are equal"} ... ) ... result0 = await acol.find_one_and_replace( ... {"_id": "rule1"}, ... 
{"text": "some animals are more equal!"}, ... ) ... print("result0", result0) ... result1 = await acol.find_one_and_replace( ... {"text": "some animals are more equal!"}, ... {"text": "and the pigs are the rulers"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result1", result1) ... result2 = await acol.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma^2"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result2", result2) ... result3 = await acol.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma"}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... projection={"_id": False}, ... ) ... print("result3", result3) ... >>> asyncio.run(do_find_one_and_replace(my_async_coll)) result0 {'_id': 'rule1', 'text': 'all animals are equal'} result1 {'_id': 'rule1', 'text': 'and the pigs are the rulers'} result2 None result3 {'text': 'F=ma'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "returnDocument": return_document, "upsert": upsert, } fo_payload = { "findOneAndReplace": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "replacement": replacement, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"findOneAndReplace on '{self.name}'") fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndReplace on '{self.name}'") if "document" in fo_response.get("data", {}): ret_document = fo_response.get("data", {}).get("document") if ret_document is None: return None else: return ret_document # type: ignore[no-any-return] else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_replace API command.", raw_response=fo_response, )
async def find_one_and_update(self, filter: FilterType, update: dict[str, Any], *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = 'before', general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> Optional[~DOC]
-
Find a document on the collection and update it as requested, optionally inserting a new one if no match is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are:
    {}
    {"name": "John"}
    {"price": {"$lt": 100}}
    {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}
See the Data API documentation for the full set of operators.
update
- the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are:
    {"$set": {"field": "value"}}
    {"$inc": {"counter": 10}}
    {"$unset": {"field": ""}}
See the Data API documentation for the full syntax.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the updated one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
upsert
- this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
return_document
- a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before".
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
A document (or a projection thereof, as required), either the one before the update operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no update was applied (depending on the `return_document` parameter).
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def do_find_one_and_update(acol: AsyncCollection) -> None:
...     await acol.insert_one({"Marco": "Polo"})
...     result0 = await acol.find_one_and_update(
...         {"Marco": {"$exists": True}},
...         {"$set": {"title": "Mr."}},
...     )
...     print("result0", result0)
...     result1 = await acol.find_one_and_update(
...         {"title": "Mr."},
...         {"$inc": {"rank": 3}},
...         projection=["title", "rank"],
...         return_document=astrapy.constants.ReturnDocument.AFTER,
...     )
...     print("result1", result1)
...     result2 = await acol.find_one_and_update(
...         {"name": "Johnny"},
...         {"$set": {"rank": 0}},
...         return_document=astrapy.constants.ReturnDocument.AFTER,
...     )
...     print("result2", result2)
...     result3 = await acol.find_one_and_update(
...         {"name": "Johnny"},
...         {"$set": {"rank": 0}},
...         upsert=True,
...         return_document=astrapy.constants.ReturnDocument.AFTER,
...     )
...     print("result3", result3)
...
>>> asyncio.run(do_find_one_and_update(my_async_coll))
result0 {'_id': 'f7c936d3-b0a0-45eb-a676-e2829662a57c', 'Marco': 'Polo'}
result1 {'_id': 'f7c936d3-b0a0-45eb-a676-e2829662a57c', 'title': 'Mr.', 'rank': 3}
result2 None
result3 {'_id': 'db3d678d-14d4-4caa-82d2-d5fb77dab7ec', 'name': 'Johnny', 'rank': 0}
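As with find_one_and_replace, a `sort` clause selects which of several matching documents is updated; a brief hypothetical sketch (illustrative output):

>>> asyncio.run(my_async_coll.find_one_and_update(
...     {"rank": {"$exists": True}},
...     {"$inc": {"rank": 1}},
...     sort={"rank": astrapy.constants.SortMode.DESCENDING},
...     return_document=astrapy.constants.ReturnDocument.AFTER,
... ))
{'_id': '...', 'title': 'Mr.', 'rank': 4}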
Expand source code
async def find_one_and_update( self, filter: FilterType, update: dict[str, Any], *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = ReturnDocument.BEFORE, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document on the collection and update it as requested, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. return_document: a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before". general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: A document (or a projection thereof, as required), either the one before the replace operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no update was applied (depending on the `return_document` parameter). 
Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_find_one_and_update(acol: AsyncCollection) -> None: ... await acol.insert_one({"Marco": "Polo"}) ... result0 = await acol.find_one_and_update( ... {"Marco": {"$exists": True}}, ... {"$set": {"title": "Mr."}}, ... ) ... print("result0", result0) ... result1 = await acol.find_one_and_update( ... {"title": "Mr."}, ... {"$inc": {"rank": 3}}, ... projection=["title", "rank"], ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result1", result1) ... result2 = await acol.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result2", result2) ... result3 = await acol.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) ... print("result3", result3) ... >>> asyncio.run(do_find_one_and_update(my_async_coll)) result0 {'_id': 'f7c936d3-b0a0-45eb-a676-e2829662a57c', 'Marco': 'Polo'} result1 {'_id': 'f7c936d3-b0a0-45eb-a676-e2829662a57c', 'title': 'Mr.', 'rank': 3} result2 None result3 {'_id': 'db3d678d-14d4-4caa-82d2-d5fb77dab7ec', 'name': 'Johnny', 'rank': 0} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "returnDocument": return_document, "upsert": upsert, } fo_payload = { "findOneAndUpdate": { k: v for k, v in { "filter": filter, "update": update, "options": options, "sort": sort, "projection": normalize_optional_projection(projection), }.items() if v is not None } } logger.info(f"findOneAndUpdate on '{self.name}'") fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndUpdate on '{self.name}'") if "document" in fo_response.get("data", {}): ret_document = fo_response.get("data", {}).get("document") if ret_document is None: return None else: return ret_document # type: ignore[no-any-return] else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_update API command.", raw_response=fo_response, )
async def info(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionInfo
-
Information on the collection (name, location, database), in the form of a CollectionInfo object.
Not to be confused with the collection `options` method (related to the collection internal configuration).
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `database_admin_timeout_ms`.
timeout_ms
- an alias for `database_admin_timeout_ms`.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(my_async_coll.info()).database_info.region
'us-east1'
>>> asyncio.run(my_async_coll.info()).full_name
'default_keyspace.my_v_collection'
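Other attributes of the returned CollectionInfo can be read in the same way (a sketch based on the attributes set in the source below; actual values depend on the collection):

>>> asyncio.run(my_async_coll.info()).keyspace
'default_keyspace'
>>> asyncio.run(my_async_coll.info()).name
'my_v_collection'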
Note
the returned CollectionInfo wraps, among other things, the database information: as such, calling this method triggers the same-named method of a Database object (which, in turn, performs an HTTP request to the DevOps API). See the documentation for `Database.info()` for more details.
Expand source code
async def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInfo: """ Information on the collection (name, location, database), in the form of a CollectionInfo object. Not to be confused with the collection `options` method (related to the collection internal configuration). Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.info()).database_info.region 'us-east1' >>> asyncio.run(my_async_coll.info()).full_name 'default_keyspace.my_v_collection' Note: the returned CollectionInfo wraps, among other things, the database information: as such, calling this method triggers the same-named method of a Database object (which, in turn, performs a HTTP request to the DevOps API). See the documentation for `Database.info()` for more details. """ db_info = await self.database.info( database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return CollectionInfo( database_info=db_info, keyspace=self.keyspace, name=self.name, full_name=self.full_name, )
async def insert_many(self, documents: Iterable[DOC], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionInsertManyResult
-
Insert a list of documents into the collection. This is not an atomic operation.
Args
documents
- an iterable of dictionaries, each a document to insert. Documents may specify their `_id` field or leave it out, in which case it will be added automatically.
ordered
- if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster.
chunk_size
- how many documents to include in a single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default.
concurrency
- maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions.
request_timeout_ms
- a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). If not passed, the collection-level setting is used instead.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a CollectionInsertManyResult object.
Examples
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def write_and_count(acol: AsyncCollection) -> None:
...     count0 = await acol.count_documents({}, upper_bound=10)
...     print("count0", count0)
...     im_result1 = await acol.insert_many(
...         [
...             {"a": 10},
...             {"a": 5},
...             {"b": [True, False, False]},
...         ],
...         ordered=True,
...     )
...     print("inserted1", im_result1.inserted_ids)
...     count1 = await acol.count_documents({}, upper_bound=100)
...     print("count1", count1)
...     await acol.insert_many(
...         [{"seq": i} for i in range(50)],
...         concurrency=5,
...     )
...     count2 = await acol.count_documents({}, upper_bound=100)
...     print("count2", count2)
...
>>> asyncio.run(write_and_count(my_async_coll))
count0 0
inserted1 ['e3c2a684-...', '1de4949f-...', '167dacc3-...']
count1 3
count2 53
>>> asyncio.run(my_async_coll.insert_many(
...     [
...         {"tag": "a", "$vector": [1, 2]},
...         {"tag": "b", "$vector": [3, 4]},
...     ]
... ))
CollectionInsertManyResult(...)
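The Notes below describe the failure modes of this method; the following hedged sketch assumes, as suggested by the source further down, that `CollectionInsertManyException` (importable from `astrapy.exceptions`) carries a `partial_result` listing what was written before the failure:

>>> # Hypothetical: the second document re-uses an _id already on the collection.
>>> from astrapy.exceptions import CollectionInsertManyException
>>> async def insert_with_faulty(acol: AsyncCollection) -> None:
...     try:
...         await acol.insert_many(
...             [{"_id": "doc-ok"}, {"_id": "doc-already-there"}],
...             ordered=True,
...         )
...     except CollectionInsertManyException as exc:
...         # documents written before the failure (see the Notes below):
...         print("inserted so far:", exc.partial_result.inserted_ids)
...
>>> asyncio.run(insert_with_faulty(my_async_coll))
inserted so far: ['doc-ok']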
Note
Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the document sequence is important.
Note
A failure mode for this command is related to certain faulty documents found among those to insert: a document may have an `_id` already present on the collection, or its vector dimension may not match the collection setting.
For an ordered insertion, the method will raise an exception at the first such faulty document; nevertheless, all documents processed until then will end up being written to the database.
For unordered insertions, if the error stems from faulty documents the insertion proceeds until exhausting the input documents: then, an exception is raised, and all insertable documents will have been written to the database, including those "after" the troublesome ones.
If, on the other hand, there are errors not related to individual documents (such as a network connectivity error), the whole `insert_many` operation will stop midway, an exception will be raised, and only a certain amount of the input documents will have made their way to the database.
Expand source code
async def insert_many( self, documents: Iterable[DOC], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInsertManyResult: """ Insert a list of documents into the collection. This is not an atomic operation. Args: documents: an iterable of dictionaries, each a document to insert. Documents may specify their `_id` field or leave it out, in which case it will be added automatically. ordered: if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster. chunk_size: how many documents to include in a single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default. concurrency: maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionInsertManyResult object. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def write_and_count(acol: AsyncCollection) -> None: ... count0 = await acol.count_documents({}, upper_bound=10) ... print("count0", count0) ... im_result1 = await acol.insert_many( ... [ ... {"a": 10}, ... {"a": 5}, ... {"b": [True, False, False]}, ... ], ... ordered=True, ... ) ... print("inserted1", im_result1.inserted_ids) ... count1 = await acol.count_documents({}, upper_bound=100) ... print("count1", count1) ... await acol.insert_many( ... [{"seq": i} for i in range(50)], ... concurrency=5, ... ) ... count2 = await acol.count_documents({}, upper_bound=100) ... print("count2", count2) ... >>> asyncio.run(write_and_count(my_async_coll)) count0 0 inserted1 ['e3c2a684-...', '1de4949f-...', '167dacc3-...'] count1 3 count2 53 >>> asyncio.run(my_async_coll.insert_many( ... [ ... {"tag": "a", "$vector": [1, 2]}, ... {"tag": "b", "$vector": [3, 4]}, ... ] ... )) CollectionInsertManyResult(...) Note: Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the document sequence is important. Note: A failure mode for this command is related to certain faulty documents found among those to insert: a document may have the an `_id` already present on the collection, or its vector dimension may not match the collection setting. For an ordered insertion, the method will raise an exception at the first such faulty document -- nevertheless, all documents processed until then will end up being written to the database. For unordered insertions, if the error stems from faulty documents the insertion proceeds until exhausting the input documents: then, an exception is raised -- and all insertable documents will have been written to the database, including those "after" the troublesome ones. 
If, on the other hand, there are errors not related to individual documents (such as a network connectivity error), the whole `insert_many` operation will stop in mid-way, an exception will be raised, and only a certain amount of the input documents will have made their way to the database. """ _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) if concurrency is None: if ordered: _concurrency = 1 else: _concurrency = DEFAULT_INSERT_MANY_CONCURRENCY else: _concurrency = concurrency if _concurrency > 1 and ordered: raise ValueError("Cannot run ordered insert_many concurrently.") if chunk_size is None: _chunk_size = DEFAULT_INSERT_MANY_CHUNK_SIZE else: _chunk_size = chunk_size _documents = list(documents) logger.info(f"inserting {len(_documents)} documents in '{self.name}'") raw_results: list[dict[str, Any]] = [] timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) if ordered: options = {"ordered": True} inserted_ids: list[Any] = [] for i in range(0, len(_documents), _chunk_size): im_payload = { "insertMany": { "documents": _documents[i : i + _chunk_size], "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") chunk_response = await self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") # accumulate the results in this call chunk_inserted_ids = (chunk_response.get("status") or {}).get( "insertedIds", [] ) inserted_ids += chunk_inserted_ids raw_results += [chunk_response] # if errors, quit early if chunk_response.get("errors", []): partial_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) raise CollectionInsertManyException.from_response( command=None, raw_response=chunk_response, partial_result=partial_result, ) # return full_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) logger.info( f"finished inserting {len(_documents)} documents in '{self.name}'" ) return full_result else: # unordered: concurrent or not, do all of them and parse the results options = {"ordered": False} sem = asyncio.Semaphore(_concurrency) async def concurrent_insert_chunk( document_chunk: list[DOC], ) -> dict[str, Any]: async with sem: im_payload = { "insertMany": { "documents": document_chunk, "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") im_response = await self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") return im_response if _concurrency > 1: tasks = [ asyncio.create_task( concurrent_insert_chunk(_documents[i : i + _chunk_size]) ) for i in range(0, len(_documents), _chunk_size) ] raw_results = await asyncio.gather(*tasks) else: raw_results = [ await concurrent_insert_chunk(_documents[i : i + _chunk_size]) for i in range(0, len(_documents), _chunk_size) ] # recast 
raw_results inserted_ids = [ inserted_id for chunk_response in raw_results for inserted_id in (chunk_response.get("status") or {}).get( "insertedIds", [] ) ] # check-raise if any( [chunk_response.get("errors", []) for chunk_response in raw_results] ): partial_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) raise CollectionInsertManyException.from_responses( commands=[None for _ in raw_results], raw_responses=raw_results, partial_result=partial_result, ) # return full_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) logger.info( f"finished inserting {len(_documents)} documents in '{self.name}'" ) return full_result
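As a supplementary usage sketch (not from the library docs): the failure modes described above can be handled by catching the insertion exception and inspecting what was written before the failure. The partial_result attribute used below is an assumption suggested by how the exception is constructed in the source; verify it against astrapy.exceptions before relying on it.
>>> from astrapy.exceptions import CollectionInsertManyException
>>>
>>> async def tolerant_insert(acol: AsyncCollection) -> None:
...     try:
...         await acol.insert_many([{"seq": i} for i in range(100)], ordered=False)
...     except CollectionInsertManyException as exc:
...         # hedged: documents written before the failure are not rolled back,
...         # and the exception is assumed to expose them via partial_result.
...         print("written anyway:", len(exc.partial_result.inserted_ids))
...         raise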
async def insert_one(self, document: DOC, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionInsertOneResult
-
Insert a single document in the collection in an atomic operation.
Args
document
- the dictionary expressing the document to insert. The _id field of the document can be left out, in which case it will be created automatically.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a CollectionInsertOneResult object.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def write_and_count(acol: AsyncCollection) -> None:
...     count0 = await acol.count_documents({}, upper_bound=10)
...     print("count0", count0)
...     await acol.insert_one(
...         {
...             "age": 30,
...             "name": "Smith",
...             "food": ["pear", "peach"],
...             "likes_fruit": True,
...         },
...     )
...     await acol.insert_one({"_id": "user-123", "age": 50, "name": "Maccio"})
...     count1 = await acol.count_documents({}, upper_bound=10)
...     print("count1", count1)
...
>>> asyncio.run(write_and_count(my_async_coll))
count0 0
count1 2
>>> asyncio.run(my_async_coll.insert_one({"tag": "v", "$vector": [10, 11]}))
CollectionInsertOneResult(...)
Note
If the explicitly provided _id corresponds to a document that already exists in the collection, an error is raised and the insertion fails.
Expand source code
async def insert_one( self, document: DOC, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInsertOneResult: """ Insert a single document in the collection in an atomic operation. Args: document: the dictionary expressing the document to insert. The `_id` field of the document can be left out, in which case it will be created automatically. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionInsertOneResult object. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def write_and_count(acol: AsyncCollection) -> None: ... count0 = await acol.count_documents({}, upper_bound=10) ... print("count0", count0) ... await acol.insert_one( ... { ... "age": 30, ... "name": "Smith", ... "food": ["pear", "peach"], ... "likes_fruit": True, ... }, ... ) ... await acol.insert_one({"_id": "user-123", "age": 50, "name": "Maccio"}) ... count1 = await acol.count_documents({}, upper_bound=10) ... print("count1", count1) ... >>> asyncio.run(write_and_count(my_async_coll)) count0 0 count1 2 >>> asyncio.run(my_async_coll.insert_one({"tag": v", "$vector": [10, 11]})) CollectionInsertOneResult(...) Note: If an `_id` is explicitly provided, which corresponds to a document that exists already in the collection, an error is raised and the insertion fails. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) io_payload = {"insertOne": {"document": document}} logger.info(f"insertOne on '{self.name}'") io_response = await self._converted_request( payload=io_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished insertOne on '{self.name}'") if "insertedIds" in io_response.get("status", {}): if io_response["status"]["insertedIds"]: inserted_id = io_response["status"]["insertedIds"][0] return CollectionInsertOneResult( raw_results=[io_response], inserted_id=inserted_id, ) else: raise ValueError( "Could not complete a insert_one operation. " f"(gotten '${json.dumps(io_response)}')" ) else: raise ValueError( "Could not complete a insert_one operation. " f"(gotten '${json.dumps(io_response)}')" )
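A supplementary sketch tied to the Note above: reusing an explicit _id makes the second insertion fail. The exception class caught here (the generic Data API response error referenced elsewhere in this module) is an assumption; narrow it to the exact class raised in your astrapy version.
>>> from astrapy.exceptions import DataAPIResponseException
>>>
>>> async def insert_twice(acol: AsyncCollection) -> None:
...     await acol.insert_one({"_id": "duplicate-me"})
...     try:
...         await acol.insert_one({"_id": "duplicate-me"})
...     except DataAPIResponseException:
...         # hedged: the exact exception type for a duplicate _id may differ.
...         print("second insertion rejected")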
async def options(self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionDefinition
-
Get the collection options, i.e. its configuration as read from the database.
The method issues a request to the Data API each time it is invoked, without any caching: this ensures up-to-date information for usages such as real-time collection validation by the application.
Args
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for collection_admin_timeout_ms.
timeout_ms
- an alias for collection_admin_timeout_ms.
Returns
a CollectionDefinition instance describing the collection. (See also the database list_collections method.)
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(my_async_coll.options())
CollectionDefinition(vector=CollectionVectorOptions(dimension=3, metric='cosine'))
Expand source code
async def options( self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDefinition: """ Get the collection options, i.e. its configuration as read from the database. The method issues a request to the Data API each time is invoked, without caching mechanisms: this ensures up-to-date information for usages such as real-time collection validation by the application. Args: collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a CollectionDefinition instance describing the collection. (See also the database `list_collections` method.) Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_coll.options()) CollectionDefinition(vector=CollectionVectorOptions(dimension=3, metric='cosine')) """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"getting collections in search of '{self.name}'") self_descriptors = [ coll_desc for coll_desc in await self.database._list_collections_ctx( keyspace=None, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label, ), ) if coll_desc.name == self.name ] logger.info(f"finished getting collections in search of '{self.name}'") if self_descriptors: return self_descriptors[0].definition else: raise ValueError( f"Collection {self.keyspace}.{self.name} not found.", )
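A supplementary sketch: since options() always reflects the server-side state, it lends itself to a startup sanity check. EXPECTED_DIMENSION and the error message are application-level choices, not part of the library.
>>> EXPECTED_DIMENSION = 3
>>>
>>> async def validate_collection(acol: AsyncCollection) -> None:
...     definition = await acol.options()
...     vector_options = definition.vector
...     if vector_options is None or vector_options.dimension != EXPECTED_DIMENSION:
...         raise RuntimeError("unexpected vector configuration")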
async def replace_one(self, filter: FilterType, replacement: DOC, *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionUpdateResult
-
Replace a single document on the collection with a new one, optionally inserting a new one if no match is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {}, {"name": "John"}, {"price": {"$lt": 100}}, {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}. See the Data API documentation for the full set of operators.
replacement
- the new document to write into the collection.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the find method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in sort.
upsert
- this parameter controls the behavior in absence of matches. If True, replacement is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a CollectionUpdateResult object summarizing the outcome of the replace operation.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def do_replace_one(acol: AsyncCollection) -> None:
...     await acol.insert_one({"Marco": "Polo"})
...     result0 = await acol.replace_one(
...         {"Marco": {"$exists": True}},
...         {"Buda": "Pest"},
...     )
...     print("result0.update_info", result0.update_info)
...     doc1 = await acol.find_one({"Buda": "Pest"})
...     print("doc1", doc1)
...     result1 = await acol.replace_one(
...         {"Mirco": {"$exists": True}},
...         {"Oh": "yeah?"},
...     )
...     print("result1.update_info", result1.update_info)
...     result2 = await acol.replace_one(
...         {"Mirco": {"$exists": True}},
...         {"Oh": "yeah?"},
...         upsert=True,
...     )
...     print("result2.update_info", result2.update_info)
...
>>> asyncio.run(do_replace_one(my_async_coll))
result0.update_info {'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}
doc1 {'_id': '6e669a5a-...', 'Buda': 'Pest'}
result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}
result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '30e34e00-...'}
Expand source code
async def replace_one( self, filter: FilterType, replacement: DOC, *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Replace a single document on the collection with a new one, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. replacement: the new document to write into the collection. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the replace operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_replace_one(acol: AsyncCollection) -> None: ... await acol.insert_one({"Marco": "Polo"}) ... result0 = await acol.replace_one( ... {"Marco": {"$exists": True}}, ... {"Buda": "Pest"}, ... ) ... print("result0.update_info", result0.update_info) ... doc1 = await acol.find_one({"Buda": "Pest"}) ... print("doc1", doc1) ... result1 = await acol.replace_one( ... {"Mirco": {"$exists": True}}, ... {"Oh": "yeah?"}, ... ) ... print("result1.update_info", result1.update_info) ... result2 = await acol.replace_one( ... {"Mirco": {"$exists": True}}, ... {"Oh": "yeah?"}, ... upsert=True, ... ) ... print("result2.update_info", result2.update_info) ... 
>>> asyncio.run(do_replace_one(my_async_coll)) result0.update_info {'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1} doc1 {'_id': '6e669a5a-...', 'Buda': 'Pest'} result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0} result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '30e34e00-...'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "upsert": upsert, } fo_payload = { "findOneAndReplace": { k: v for k, v in { "filter": filter, "replacement": replacement, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"findOneAndReplace on '{self.name}'") fo_response = await self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndReplace on '{self.name}'") if "document" in fo_response.get("data", {}): fo_status = fo_response.get("status") or {} _update_info = _prepare_update_info([fo_status]) return CollectionUpdateResult( raw_results=[fo_response], update_info=_update_info, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_replace API command.", raw_response=fo_response, )
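A supplementary sketch of the vector-sorted variant mentioned in the sort parameter description: the document closest to the query vector is the one replaced. It assumes a vector-enabled collection of dimension 2, as in the insert_many example further above.
>>> async def replace_closest(acol: AsyncCollection) -> None:
...     result = await acol.replace_one(
...         {},
...         {"tag": "winner", "$vector": [1, 2]},
...         sort={"$vector": [1, 2]},
...     )
...     print("update_info", result.update_info)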
def to_sync(self: AsyncCollection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> Collection[DOC]
-
Create a Collection from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this collection in the copy (the database is converted into a sync object).
Args
embedding_api_key
- optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
api_options
- any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
the new copy, a Collection instance.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> my_async_coll.to_sync().count_documents({}, upper_bound=100)
77
Expand source code
def to_sync( self: AsyncCollection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: """ Create a Collection from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this collection in the copy (the database is converted into a sync object). Args: embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, a Collection instance. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> my_async_coll.to_sync().count_documents({}, upper_bound=100) 77 """ arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Collection( database=self.database.to_sync(), name=self.name, keyspace=self.keyspace, api_options=final_api_options, )
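A supplementary usage note: a common pattern is to keep the AsyncCollection for the application proper and derive a synchronous twin for one-off scripts or REPL sessions, where asyncio.run wrappers get in the way. A minimal sketch, assuming the same my_async_coll as in the other examples:
>>> sync_coll = my_async_coll.to_sync()
>>> sync_coll.insert_one({"origin": "maintenance-script"})
CollectionInsertOneResult(...)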
async def update_many(self, filter: FilterType, update: dict[str, Any], *, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionUpdateResult
-
Apply an update operation to all documents matching a condition, optionally inserting one document in absence of matches.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {}, {"name": "John"}, {"price": {"$lt": 100}}, {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}. See the Data API documentation for the full set of operators.
update
- the update prescription to apply to the documents, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value"}}, {"$inc": {"counter": 10}}, {"$unset": {"field": ""}}. See the Data API documentation for the full syntax.
upsert
- this parameter controls the behavior in absence of matches. If True, a single new document (resulting from applying update to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead.
request_timeout_ms
- a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a CollectionUpdateResult object summarizing the outcome of the update operation.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def do_update_many(acol: AsyncCollection) -> None:
...     await acol.insert_many([{"c": "red"}, {"c": "green"}, {"c": "blue"}])
...     result0 = await acol.update_many(
...         {"c": {"$ne": "green"}},
...         {"$set": {"nongreen": True}},
...     )
...     print("result0.update_info", result0.update_info)
...     result1 = await acol.update_many(
...         {"c": "orange"},
...         {"$set": {"is_also_fruit": True}},
...     )
...     print("result1.update_info", result1.update_info)
...     result2 = await acol.update_many(
...         {"c": "orange"},
...         {"$set": {"is_also_fruit": True}},
...         upsert=True,
...     )
...     print("result2.update_info", result2.update_info)
...
>>> asyncio.run(do_update_many(my_async_coll))
result0.update_info {'n': 2, 'updatedExisting': True, 'ok': 1.0, 'nModified': 2}
result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}
result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '79ffd5a3-ab99-4dff-a2a5-4aaa0e59e854'}
Note
As with find (see its docstring for more details), running this command while another process is concurrently inserting new documents matching the filter of the update_many can result in an unpredictable fraction of those documents being updated. In other words, it cannot be easily predicted whether a given newly-inserted document will be picked up by the update_many command or not.
Expand source code
async def update_many( self, filter: FilterType, update: dict[str, Any], *, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Apply an update operation to all documents matching a condition, optionally inserting one documents in absence of matches. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the documents, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. upsert: this parameter controls the behavior in absence of matches. If True, a single new document (resulting from applying `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the update operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_update_many(acol: AsyncCollection) -> None: ... await acol.insert_many([{"c": "red"}, {"c": "green"}, {"c": "blue"}]) ... result0 = await acol.update_many( ... {"c": {"$ne": "green"}}, ... {"$set": {"nongreen": True}}, ... ) ... print("result0.update_info", result0.update_info) ... result1 = await acol.update_many( ... {"c": "orange"}, ... {"$set": {"is_also_fruit": True}}, ... ) ... print("result1.update_info", result1.update_info) ... result2 = await acol.update_many( ... {"c": "orange"}, ... {"$set": {"is_also_fruit": True}}, ... upsert=True, ... ) ... print("result2.update_info", result2.update_info) ... >>> asyncio.run(do_update_many(my_async_coll)) result0.update_info {'n': 2, 'updatedExisting': True, 'ok': 1.0, 'nModified': 2} result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0} result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '79ffd5a3-ab99-4dff-a2a5-4aaa0e59e854'} Note: Similarly to the case of `find` (see its docstring for more details), running this command while, at the same time, another process is inserting new documents which match the filter of the `update_many` can result in an unpredictable fraction of these documents being updated. In other words, it cannot be easily predicted whether a given newly-inserted document will be picked up by the update_many command or not. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) api_options = { "upsert": upsert, } page_state_options: dict[str, str] = {} um_responses: list[dict[str, Any]] = [] um_statuses: list[dict[str, Any]] = [] must_proceed = True logger.info(f"starting update_many on '{self.name}'") timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) while must_proceed: options = {**api_options, **page_state_options} this_um_payload = { "updateMany": { k: v for k, v in { "filter": filter, "update": update, "options": options, }.items() if v is not None } } logger.info(f"updateMany on '{self.name}'") this_um_response = await self._converted_request( payload=this_um_payload, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished updateMany on '{self.name}'") this_um_status = this_um_response.get("status") or {} # # if errors, quit early if this_um_response.get("errors", []): partial_update_info = _prepare_update_info(um_statuses) partial_result = CollectionUpdateResult( raw_results=um_responses, update_info=partial_update_info, ) all_um_responses = um_responses + [this_um_response] raise CollectionUpdateManyException.from_responses( commands=[None for _ in all_um_responses], raw_responses=all_um_responses, partial_result=partial_result, ) else: if "status" not in this_um_response: raise UnexpectedDataAPIResponseException( text="Faulty response from update_many API command.", raw_response=this_um_response, ) um_responses.append(this_um_response) um_statuses.append(this_um_status) next_page_state = this_um_status.get("nextPageState") if next_page_state is not None: must_proceed = True page_state_options = {"pageState": next_page_state} else: must_proceed = False page_state_options = {} update_info = _prepare_update_info(um_statuses) logger.info(f"finished update_many on '{self.name}'") return CollectionUpdateResult( raw_results=um_responses, update_info=update_info, )
async def update_one(self, filter: FilterType, update: dict[str, Any], *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionUpdateResult
-
Update a single document on the collection as requested, optionally inserting a new one if no match is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {}, {"name": "John"}, {"price": {"$lt": 100}}, {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}. See the Data API documentation for the full set of operators.
update
- the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value"}}, {"$inc": {"counter": 10}}, {"$unset": {"field": ""}}. See the Data API documentation for the full syntax.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the one updated. See the find method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in sort.
upsert
- this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the update to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a CollectionUpdateResult object summarizing the outcome of the update operation.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def do_update_one(acol: AsyncCollection) -> None:
...     await acol.insert_one({"Marco": "Polo"})
...     result0 = await acol.update_one(
...         {"Marco": {"$exists": True}},
...         {"$inc": {"rank": 3}},
...     )
...     print("result0.update_info", result0.update_info)
...     result1 = await acol.update_one(
...         {"Mirko": {"$exists": True}},
...         {"$inc": {"rank": 3}},
...     )
...     print("result1.update_info", result1.update_info)
...     result2 = await acol.update_one(
...         {"Mirko": {"$exists": True}},
...         {"$inc": {"rank": 3}},
...         upsert=True,
...     )
...     print("result2.update_info", result2.update_info)
...
>>> asyncio.run(do_update_one(my_async_coll))
result0.update_info {'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}
result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}
result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '75748092-...'}
Expand source code
async def update_one( self, filter: FilterType, update: dict[str, Any], *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Update a single document on the collection as requested, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the update operation. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def do_update_one(acol: AsyncCollection) -> None: ... await acol.insert_one({"Marco": "Polo"}) ... result0 = await acol.update_one( ... {"Marco": {"$exists": True}}, ... {"$inc": {"rank": 3}}, ... ) ... print("result0.update_info", result0.update_info) ... result1 = await acol.update_one( ... {"Mirko": {"$exists": True}}, ... {"$inc": {"rank": 3}}, ... ) ... print("result1.update_info", result1.update_info) ... result2 = await acol.update_one( ... {"Mirko": {"$exists": True}}, ... {"$inc": {"rank": 3}}, ... upsert=True, ... ) ... print("result2.update_info", result2.update_info) ... 
>>> asyncio.run(do_update_one(my_async_coll)) result0.update_info {'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}) result1.update_info {'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) result2.update_info {'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '75748092-...'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "upsert": upsert, } uo_payload = { "updateOne": { k: v for k, v in { "filter": filter, "update": update, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"updateOne on '{self.name}'") uo_response = await self._converted_request( payload=uo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished updateOne on '{self.name}'") if "status" in uo_response: uo_status = uo_response["status"] _update_info = _prepare_update_info([uo_status]) return CollectionUpdateResult( raw_results=[uo_response], update_info=_update_info, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from updateOne API command.", raw_response=uo_response, )
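A supplementary sketch: combining upsert=True with $inc turns update_one into a simple counter primitive, creating the counter document on first use. The field names here are application-level choices.
>>> async def count_page_hit(acol: AsyncCollection, page_id: str) -> None:
...     await acol.update_one(
...         {"page_id": page_id},
...         {"$inc": {"hits": 1}},
...         upsert=True,
...     )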
def with_options(self: AsyncCollection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> AsyncCollection[DOC]
-
Create a clone of this collection with some changed attributes.
Args
embedding_api_key
- optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new AsyncCollection instance.
Example
>>> collection_with_api_key_configured = my_async_collection.with_options(
...     embedding_api_key="secret-key-0123abcd...",
... )
Expand source code
def with_options( self: AsyncCollection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: """ Create a clone of this collection with some changed attributes. Args: embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new AsyncCollection instance. Example: >>> collection_with_api_key_configured = my_async_collection.with_options( ... embedding_api_key="secret-key-0123abcd...", ... ) """ return self._copy( embedding_api_key=embedding_api_key, api_options=api_options, )
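A supplementary sketch of the api_options parameter: deriving a clone with a longer overall timeout. The TimeoutOptions class and its general_method_timeout_ms attribute are assumptions based on the api_options.timeout_options usage visible in the source above; verify the exact names in astrapy.api_options before use.
>>> from astrapy.api_options import APIOptions, TimeoutOptions
>>>
>>> patient_coll = my_async_collection.with_options(
...     api_options=APIOptions(
...         timeout_options=TimeoutOptions(general_method_timeout_ms=120000),
...     ),
... )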
class AsyncDatabase (*, api_endpoint: str, keyspace: str | None, api_options: FullAPIOptions)
-
A Data API database. This is the object for doing database-level DML, such as creating/deleting collections, and for obtaining Collection objects themselves. This class has an asynchronous interface.
This class is not meant for direct instantiation by the user, rather it is usually obtained by invoking methods such as get_async_database of AstraDBClient.
On Astra DB, an AsyncDatabase comes with an "API Endpoint", which implies an AsyncDatabase object instance reaches a specific region (relevant point in case of multi-region databases).
An AsyncDatabase is also always set with a "working keyspace" on which all data operations are done (unless otherwise specified).
Args
api_endpoint
- the full "API Endpoint" string used to reach the Data API. Example: "https://<database_id>-<region>.apps.astra.datastax.com"
keyspace
- this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, on Astra DB the name "default_keyspace" is set, while on other environments the keyspace is left unspecified: in this case, most operations are unavailable until a keyspace is set (through an explicit use_keyspace invocation or equivalent).
api_options
- a complete specification of the API Options for this instance.
Example
>>> from astrapy import DataAPIClient
>>> my_client = DataAPIClient()
>>> my_db = my_client.get_async_database(
...     "https://01234567-....apps.astra.datastax.com",
...     token="AstraCS:...",
... )
Note
creating an instance of AsyncDatabase does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class.
Expand source code
class AsyncDatabase: """ A Data API database. This is the object for doing database-level DML, such as creating/deleting collections, and for obtaining Collection objects themselves. This class has an asynchronous interface. This class is not meant for direct instantiation by the user, rather it is usually obtained by invoking methods such as `get_async_database` of AstraDBClient. On Astra DB, an AsyncDatabase comes with an "API Endpoint", which implies an AsyncDatabase object instance reaches a specific region (relevant point in case of multi-region databases). An AsyncDatabase is also always set with a "working keyspace" on which all data operations are done (unless otherwise specified). Args: api_endpoint: the full "API Endpoint" string used to reach the Data API. Example: "https://<database_id>-<region>.apps.astra.datastax.com" keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, on Astra DB the name "default_keyspace" is set, while on other environments the keyspace is left unspecified: in this case, most operations are unavailable until a keyspace is set (through an explicit `use_keyspace` invocation or equivalent). api_options: a complete specification of the API Options for this instance. Example: >>> from astrapy import DataAPIClient >>> my_client = astrapy.DataAPIClient() >>> my_db = my_client.get_async_database( ... "https://01234567-....apps.astra.datastax.com", ... token="AstraCS:...", ... ) Note: creating an instance of AsyncDatabase does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class. """ def __init__( self, *, api_endpoint: str, keyspace: str | None, api_options: FullAPIOptions, ) -> None: self.api_options = api_options self.api_endpoint = api_endpoint.strip("/") # enforce defaults if on Astra DB: self._using_keyspace: str | None if ( keyspace is None and self.api_options.environment in Environment.astra_db_values ): self._using_keyspace = DEFAULT_ASTRA_DB_KEYSPACE else: self._using_keyspace = keyspace self._commander_headers = { DEFAULT_DATA_API_AUTH_HEADER: self.api_options.token.get_token(), **self.api_options.database_additional_headers, } self._name: str | None = None self._api_commander = self._get_api_commander(keyspace=self.keyspace) def __getattr__(self, collection_name: str) -> AsyncCollection[DefaultDocumentType]: return self.get_collection(name=collection_name) def __getitem__(self, collection_name: str) -> AsyncCollection[DefaultDocumentType]: return self.get_collection(name=collection_name) def __repr__(self) -> str: ep_desc = f'api_endpoint="{self.api_endpoint}"' keyspace_desc: str | None if self._using_keyspace is None: keyspace_desc = "keyspace not set" else: keyspace_desc = f'keyspace="{self._using_keyspace}"' api_options_desc = f"api_options={self.api_options}" parts = [ pt for pt in [ep_desc, keyspace_desc, api_options_desc] if pt is not None ] return f"{self.__class__.__name__}({', '.join(parts)})" def __eq__(self, other: Any) -> bool: if isinstance(other, AsyncDatabase): return all( [ self.api_endpoint == other.api_endpoint, self.keyspace == other.keyspace, self.api_options == other.api_options, ] ) else: return False def _get_api_commander(self, keyspace: str | None) -> APICommander | None: """ Instantiate a new APICommander based on the properties of this class and a provided keyspace. If keyspace is None, return None (signaling a "keyspace not set"). 
""" if keyspace is None: return None else: base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.data_api_url_options.api_path, self.api_options.data_api_url_options.api_version, keyspace, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" api_commander = APICommander( api_endpoint=self.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, ) return api_commander def _get_driver_commander(self, keyspace: str | None) -> APICommander: """ Building on _get_api_commander, fall back to class keyspace in creating/returning a commander, and in any case raise an error if not set. """ driver_commander: APICommander | None if keyspace: driver_commander = self._get_api_commander(keyspace=keyspace) else: driver_commander = self._api_commander if driver_commander is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return driver_commander async def __aenter__(self) -> AsyncDatabase: return self async def __aexit__( self, exc_type: type[BaseException] | None = None, exc_value: BaseException | None = None, traceback: TracebackType | None = None, ) -> None: if self._api_commander is not None: await self._api_commander.__aexit__( exc_type=exc_type, exc_value=exc_value, traceback=traceback, ) def _copy( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: arg_api_options = APIOptions( token=token, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AsyncDatabase( api_endpoint=self.api_endpoint, keyspace=keyspace or self.keyspace, api_options=final_api_options, ) def with_options( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create a clone of this database with some changed attributes. Args: keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set. token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new `AsyncDatabase` instance. Example: >>> async_database_2 = async_database.with_options( ... keyspace="the_other_keyspace", ... token="AstraCS:xyz...", ... ) """ return self._copy( keyspace=keyspace, token=token, api_options=api_options, ) def to_sync( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a (synchronous) Database from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this database in the copy. Args: keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set. 
token: an Access Token to the database. Example: "AstraCS:xyz..." This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, a `Database` instance. Example: >>> my_sync_db = async_database.to_sync() >>> my_sync_db.list_collection_names() ['a_collection', 'another_collection'] """ arg_api_options = APIOptions( token=token, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Database( api_endpoint=self.api_endpoint, keyspace=keyspace or self.keyspace, api_options=final_api_options, ) def use_keyspace(self, keyspace: str) -> None: """ Switch to a new working keyspace for this database. This method changes (mutates) the AsyncDatabase instance. Note that this method does not create the keyspace, which should exist already (created for instance with a `DatabaseAdmin.async_create_keyspace` call). Args: keyspace: the new keyspace to use as the database working keyspace. Returns: None. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(async_database.list_collection_names()) ['coll_1', 'coll_2'] >>> async_database.use_keyspace("an_empty_keyspace") >>> asyncio.run(async_database.list_collection_names()) [] """ logger.info(f"switching to keyspace '{keyspace}'") self._using_keyspace = keyspace self._api_commander = self._get_api_commander(keyspace=self.keyspace) async def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBDatabaseInfo: """ Additional information on the database as a AstraDBDatabaseInfo instance. Some of the returned properties are dynamic throughout the lifetime of the database (such as raw_info["keyspaces"]). For this reason, each invocation of this method triggers a new request to the DevOps API. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(async_database.info()).region 'eu-west-1' >>> asyncio.run( ... async_database.info() ... ).raw_info['datacenters'][0]['dateCreated'] '2023-01-30T12:34:56Z' Note: see the AstraDBDatabaseInfo documentation for a caveat about the difference between the `region` and the `raw["region"]` attributes. """ if self.api_options.environment not in Environment.astra_db_values: raise InvalidEnvironmentException( "Environments outside of Astra DB are not supported." 
) _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("getting database info") database_info = await async_fetch_database_info( self.api_endpoint, keyspace=self.keyspace, request_timeout_ms=_database_admin_timeout_ms, api_options=self.api_options, ) if database_info is not None: logger.info("finished getting database info") return database_info else: raise DevOpsAPIException("Failure while fetching database info.") @property def id(self) -> str: """ The ID of this database. Example: >>> my_async_database.id '01234567-89ab-cdef-0123-456789abcdef' """ parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is not None: return parsed_api_endpoint.database_id else: raise DevOpsAPIException( "Database is not in a supported environment for this operation." ) @property def region(self) -> str: """ The region where this database is located. The region is still well defined in case of multi-region databases, since a Database instance connects to exactly one of the regions (as specified by the API Endpoint). Example: >>> my_async_database.region 'us-west-2' """ parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is not None: return parsed_api_endpoint.region else: raise DevOpsAPIException( "Database is not in a supported environment for this operation." ) async def name(self) -> str: """ The name of this database. Note that this bears no unicity guarantees. Calling this method the first time involves a request to the DevOps API (the resulting database name is then cached). See the `info()` method for more details. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(async_database.name()) 'the_application_database' """ if self._name is None: self._name = (await self.info()).name return self._name @property def keyspace(self) -> str | None: """ The keyspace this database uses as target for all commands when no method-call-specific keyspace is specified. Returns: the working keyspace (a string), or None if not set. Example: >>> async_database.keyspace 'the_keyspace' """ return self._using_keyspace @overload def get_collection( self, name: str, *, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DefaultDocumentType]: ... @overload def get_collection( self, name: str, *, document_type: type[DOC], keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: ... def get_collection( self, name: str, *, document_type: type[Any] = DefaultDocumentType, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: """ Spawn an `AsyncCollection` object instance representing a collection on this database. Creating an `AsyncCollection` instance does not have any effect on the actual state of the database: in other words, for the created `AsyncCollection` instance to be used meaningfully, the collection must exist already (for instance, it should have been created previously by calling the `create_collection` method). Args: name: the name of the collection. 
document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting AsyncCollection is implicitly an `AsyncCollection[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace containing the collection. If no keyspace is specified, the setting for this database is used. embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: an `AsyncCollection` instance, representing the desired collection (but without any form of validation). Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def count_docs(adb: AsyncDatabase, c_name: str) -> int: ... async_col = adb.get_collection(c_name) ... return await async_col.count_documents({}, upper_bound=100) ... >>> asyncio.run(count_docs(async_database, "my_collection")) 45 Note: the attribute and indexing syntax forms achieve the same effect as this method, returning an AsyncCollection. In other words, the following are equivalent: async_database.get_collection("coll_name") async_database.coll_name async_database["coll_name"] """ # lazy importing here against circular-import error from astrapy.collection import AsyncCollection resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( embedding_api_key=embedding_api_key, ), ) _keyspace = keyspace or self.keyspace if _keyspace is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return AsyncCollection( database=self, name=name, keyspace=_keyspace, api_options=resulting_api_options, ) @overload async def create_collection( self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DefaultDocumentType]: ... @overload async def create_collection( self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, document_type: type[DOC], keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: ... 
async def create_collection( self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, document_type: type[Any] = DefaultDocumentType, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: """ Creates a collection on the database and return the AsyncCollection instance that represents it. This is a blocking operation: the method returns when the collection is ready to be used. As opposed to the `get_collection` instance, this method triggers causes the collection to be actually created on DB. Args: name: the name of the collection. definition: a complete collection definition for the table. This can be an instance of `CollectionDefinition` or an equivalent (nested) dictionary, in which case it will be parsed into a `CollectionDefinition`. See the `astrapy.info.CollectionDefinition` class and the `AsyncCollection` class for more details and ways to construct this object. document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting AsyncCollection is implicitly an `AsyncCollection[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace where the collection is to be created. If not specified, the general setting for this database is used. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: an `AsyncCollection` instance, representing the newly-created collection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition, ... )) >>> >>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ... 
) >>> my_collection_1 = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition_1, ... )) >>> >>> >>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition_2, ... )) """ cc_definition: dict[str, Any] = CollectionDefinition.coerce( definition or {} ).as_dict() if collection_admin_timeout_ms is not None: _collection_admin_timeout_ms = collection_admin_timeout_ms _ca_label = "collection_admin_timeout_ms" else: _collection_admin_timeout_ms = ( self.api_options.timeout_options.collection_admin_timeout_ms ) _ca_label = "collection_admin_timeout_ms" driver_commander = self._get_driver_commander(keyspace=keyspace) cc_payload = { "createCollection": { k: v for k, v in { "name": name, "options": cc_definition, }.items() if v is not None if v != {} } } logger.info(f"createCollection('{name}')") cc_response = await driver_commander.async_request( payload=cc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if cc_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from createCollection API command.", raw_response=cc_response, ) logger.info(f"finished createCollection('{name}')") return self.get_collection( name, document_type=document_type, keyspace=keyspace, embedding_api_key=embedding_api_key, spawn_api_options=spawn_api_options, ) async def drop_collection( self, name: str, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Drop a collection from the database, along with all documents therein. Args: name: the name of the collection to drop. keyspace: the keyspace where the collection resides. If not specified, the database working keyspace is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. 
>>> >>> asyncio.run(async_database.list_collection_names()) ['a_collection', 'my_v_col', 'another_col'] >>> asyncio.run(async_database.drop_collection("my_v_col")) >>> asyncio.run(async_database.list_collection_names()) ['a_collection', 'another_col'] """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace = keyspace or self.keyspace driver_commander = self._get_driver_commander(keyspace=_keyspace) dc_payload = {"deleteCollection": {"name": name}} logger.info(f"deleteCollection('{name}')") dc_response = await driver_commander.async_request( payload=dc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if dc_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteCollection API command.", raw_response=dc_response, ) logger.info(f"finished deleteCollection('{name}')") return dc_response.get("status", {}) # type: ignore[no-any-return] async def list_collections( self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[CollectionDescriptor]: """ List all collections in a given keyspace for this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a list of CollectionDescriptor instances, one for each collection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def a_list_colls(adb: AsyncDatabase) -> None: ... a_coll_list = await adb.list_collections() ... print("* list:", a_coll_list) ... for coll in await adb.list_collections(): ... print("* coll:", coll) ...
>>> asyncio.run(a_list_colls(async_database)) * list: [CollectionDescriptor(name='my_v_col', options=CollectionDefinition())] * coll: CollectionDescriptor(name='my_v_col', options=CollectionDefinition()) """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return await self._list_collections_ctx( keyspace=keyspace, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) async def _list_collections_ctx( self, *, keyspace: str | None, timeout_context: _TimeoutContext, ) -> list[CollectionDescriptor]: driver_commander = self._get_driver_commander(keyspace=keyspace) gc_payload = {"findCollections": {"options": {"explain": True}}} logger.info("findCollections") gc_response = await driver_commander.async_request( payload=gc_payload, timeout_context=timeout_context, ) if "collections" not in gc_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findCollections API command.", raw_response=gc_response, ) else: # we know this is a list of dicts, to marshal into "descriptors" logger.info("finished findCollections") return [ CollectionDescriptor._from_dict(col_dict) for col_dict in gc_response["status"]["collections"] ] async def list_collection_names( self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all collections in a given keyspace of this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a list of the collection names as strings, in no particular order. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(async_database.list_collection_names()) ['a_collection', 'another_col'] """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) gc_payload: dict[str, Any] = {"findCollections": {}} logger.info("findCollections") gc_response = await driver_commander.async_request( payload=gc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if "collections" not in gc_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findCollections API command.", raw_response=gc_response, ) else: logger.info("finished findCollections") return gc_response["status"]["collections"] # type: ignore[no-any-return] @overload def get_table( self, name: str, *, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[DefaultRowType]: ... 
@overload def get_table( self, name: str, *, row_type: type[ROW], keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: ... def get_table( self, name: str, *, row_type: type[Any] = DefaultRowType, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: """ Spawn an `AsyncTable` object instance representing a table on this database. Creating an `AsyncTable` instance does not have any effect on the actual state of the database: in other words, for the created `AsyncTable` instance to be used meaningfully, the table must exist already (for instance, it should have been created previously by calling the `create_table` method). Args: name: the name of the table. row_type: this parameter acts as a formal specifier for the type checker. If omitted, the resulting AsyncTable is implicitly an `AsyncTable[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace containing the table. If no keyspace is specified, the general setting for this database is used. embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: an `AsyncTable` instance, representing the desired table (but without any form of validation). Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Get an AsyncTable object (and read a property of it as an example): >>> my_async_table = async_database.get_table("games") >>> my_async_table.full_name 'default_keyspace.games' >>> >>> # Get an AsyncTable object in a specific keyspace, >>> # and set an embedding API key to it: >>> my_other_async_table = async_database.get_table( ... "tournaments", ... keyspace="the_other_keyspace", ... embedding_api_key="secret-012abc...", ... ) >>> from astrapy import AsyncTable >>> MyCustomDictType = dict[str, int] >>> >>> # Get an AsyncTable object typed with a specific type for its rows: >>> my_typed_async_table: AsyncTable[MyCustomDictType] = async_database.get_table( ... "games", ... row_type=MyCustomDictType, ... ) """ # lazy importing here against circular-import error from astrapy.table import AsyncTable resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( embedding_api_key=embedding_api_key, ), ) _keyspace = keyspace or self.keyspace if _keyspace is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method."
) return AsyncTable[ROW]( database=self, name=name, keyspace=_keyspace, api_options=resulting_api_options, ) @overload async def create_table( self, name: str, *, definition: CreateTableDefinition | dict[str, Any], keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[DefaultRowType]: ... @overload async def create_table( self, name: str, *, definition: CreateTableDefinition | dict[str, Any], row_type: type[ROW], keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: ... async def create_table( self, name: str, *, definition: CreateTableDefinition | dict[str, Any], row_type: type[Any] = DefaultRowType, keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: """ Creates a table on the database and returns the AsyncTable instance that represents it. This is a blocking operation: the method returns when the table is ready to be used. As opposed to the `get_table` method call, this method causes the table to be actually created on the database. Args: name: the name of the table. definition: a complete definition for the table. This can be an instance of `CreateTableDefinition` or an equivalent (nested) dictionary, in which case it will be parsed into a `CreateTableDefinition`. See the `astrapy.info.CreateTableDefinition` class and the `AsyncTable` class for more details and ways to construct this object. row_type: this parameter acts as a formal specifier for the type checker. If omitted, the resulting AsyncTable is implicitly an `AsyncTable[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace where the table is to be created. If not specified, the general setting for this database is used. if_not_exists: if set to True, the command will succeed even if a table with the specified name already exists (in which case no actual table creation takes place on the database). Defaults to False, i.e. an error is raised by the API in case of table-name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`).
For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: an `AsyncTable` instance, representing the newly-created table. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Create a table using the fluent syntax for definition >>> from astrapy.constants import SortMode >>> from astrapy.info import ( ... CreateTableDefinition, ... ColumnType, ... ) >>> table_definition = ( ... CreateTableDefinition.builder() ... .add_column("match_id", ColumnType.TEXT) ... .add_column("round", ColumnType.INT) ... .add_vector_column("m_vector", dimension=3) ... .add_column("score", ColumnType.INT) ... .add_column("when", ColumnType.TIMESTAMP) ... .add_column("winner", ColumnType.TEXT) ... .add_set_column("fighters", ColumnType.UUID) ... .add_partition_by(["match_id"]) ... .add_partition_sort({"round": SortMode.ASCENDING}) ... .build() ... ) >>> my_async_table = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition, ... )) >>> >>> # Create a table with the definition as object >>> # (and do not raise an error if the table exists already) >>> from astrapy.info import ( ... CreateTableDefinition, ... TablePrimaryKeyDescriptor, ... TableScalarColumnTypeDescriptor, ... TableValuedColumnType, ... TableValuedColumnTypeDescriptor, ... TableVectorColumnTypeDescriptor, ... ) >>> table_definition_1 = CreateTableDefinition( ... columns={ ... "match_id": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "round": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "m_vector": TableVectorColumnTypeDescriptor( ... column_type="vector", dimension=3 ... ), ... "score": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "when": TableScalarColumnTypeDescriptor( ... ColumnType.TIMESTAMP, ... ), ... "winner": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "fighters": TableValuedColumnTypeDescriptor( ... column_type=TableValuedColumnType.SET, ... value_type=ColumnType.UUID, ... ), ... }, ... primary_key=TablePrimaryKeyDescriptor( ... partition_by=["match_id"], ... partition_sort={"round": SortMode.ASCENDING}, ... ), ... ) >>> my_async_table_1 = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition_1, ... if_not_exists=True, ... )) >>> >>> # Create a table with the definition as plain dictionary >>> # (and do not raise an error if the table exists already) >>> table_definition_2 = { ... "columns": { ... "match_id": {"type": "text"}, ... "round": {"type": "int"}, ... "m_vector": {"type": "vector", "dimension": 3}, ... "score": {"type": "int"}, ... "when": {"type": "timestamp"}, ... "winner": {"type": "text"}, ... "fighters": {"type": "set", "valueType": "uuid"}, ... }, ... "primaryKey": { ... "partitionBy": ["match_id"], ... "partitionSort": {"round": 1}, ... }, ... } >>> my_async_table_2 = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition_2, ... if_not_exists=True, ... 
)) """ ct_options: dict[str, bool] if if_not_exists is not None: ct_options = {"ifNotExists": if_not_exists} else: ct_options = {} ct_definition: dict[str, Any] = CreateTableDefinition.coerce( definition ).as_dict() _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) ct_payload = { "createTable": { k: v for k, v in { "name": name, "definition": ct_definition, "options": ct_options, }.items() if v is not None if v != {} } } logger.info(f"createTable('{name}')") ct_response = await driver_commander.async_request( payload=ct_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if ct_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from createTable API command.", raw_response=ct_response, ) logger.info(f"finished createTable('{name}')") return self.get_table( name, row_type=row_type, keyspace=keyspace, embedding_api_key=embedding_api_key, spawn_api_options=spawn_api_options, ) async def drop_table_index( self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drops (deletes) an index (of any kind) from the table it is associated to. This is a blocking operation: the method returns once the index is deleted. Note: Although associated to a table, index names are unique across a keyspace. For this reason, no table name is required in this call. Args: name: the name of the index. keyspace: the keyspace to which the index belongs. If not specified, the general setting for this database is used. if_exists: if passed as True, trying to drop a non-existing index will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. 
>>> >>> # Drop an index from the keyspace: >>> await async_database.drop_table_index("score_index") >>> # Drop an index, unless it does not exist already: >>> await async_database.drop_table_index("score_index", if_exists=True) """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) di_options: dict[str, bool] if if_exists is not None: di_options = {"ifExists": if_exists} else: di_options = {} di_payload = { "dropIndex": { k: v for k, v in { "name": name, "options": di_options, }.items() if v is not None if v != {} } } driver_commander = self._get_driver_commander(keyspace=keyspace) logger.info(f"dropIndex('{name}')") di_response = await driver_commander.async_request( payload=di_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if di_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from dropIndex API command.", raw_response=di_response, ) logger.info(f"finished dropIndex('{name}')") async def drop_table( self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Drop a table from the database, along with all rows therein and related indexes. Args: name: the name of the table to drop. keyspace: the keyspace where the table resides. If not specified, the database working keyspace is assumed. if_exists: if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. 
>>> >>> asyncio.run(async_database.list_table_names()) ['fighters', 'games'] >>> asyncio.run(async_database.drop_table("fighters")) >>> asyncio.run(async_database.list_table_names()) ['games'] >>> # not erroring because of if_exists: >>> asyncio.run(async_database.drop_table("fighters", if_exists=True)) """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace = keyspace or self.keyspace dt_options: dict[str, bool] if if_exists is not None: dt_options = {"ifExists": if_exists} else: dt_options = {} driver_commander = self._get_driver_commander(keyspace=_keyspace) dt_payload = { "dropTable": { k: v for k, v in { "name": name, "options": dt_options, }.items() if v is not None if v != {} } } logger.info(f"dropTable('{name}')") dt_response = await driver_commander.async_request( payload=dt_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if dt_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from dropTable API command.", raw_response=dt_response, ) logger.info(f"finished dropTable('{name}')") return dt_response.get("status", {}) # type: ignore[no-any-return] async def list_tables( self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[ListTableDescriptor]: """ List all tables in a given keyspace for this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of ListTableDescriptor instances, one for each table. Example: >>> tables = asyncio.run(my_async_database.list_tables()) >>> tables [ListTableDescriptor(name='fighters', definition=ListTableDefinition(... >>> tables[1].name 'games' >>> tables[1].definition.columns {'match_id': TableScalarColumnTypeDescriptor(ColumnType.TEXT),...
>>> tables[1].definition.columns['score'] TableScalarColumnTypeDescriptor(ColumnType.INT) >>> tables[1].definition.primary_key.partition_by ['match_id'] >>> tables[1].definition.primary_key.partition_sort {'round': 1} """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return await self._list_tables_ctx( keyspace=keyspace, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) async def _list_tables_ctx( self, *, keyspace: str | None, timeout_context: _TimeoutContext, ) -> list[ListTableDescriptor]: driver_commander = self._get_driver_commander(keyspace=keyspace) lt_payload = {"listTables": {"options": {"explain": True}}} logger.info("listTables") lt_response = await driver_commander.async_request( payload=lt_payload, timeout_context=timeout_context, ) if "tables" not in lt_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listTables API command.", raw_response=lt_response, ) else: # we know this is a list of dicts, to marshal into "descriptors" logger.info("finished listTables") return [ ListTableDescriptor.coerce(tab_dict) for tab_dict in lt_response["status"]["tables"] ] async def list_table_names( self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all tables in a given keyspace of this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of the table names as strings, in no particular order. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def destroy_temp_table(async_db: AsyncDatabase) -> None: ... print(await async_db.list_table_names()) ... await async_db.drop_table("my_v_tab") ... print(await async_db.list_table_names()) ...
>>> asyncio.run(destroy_temp_table(async_database)) ['fighters', 'my_v_tab', 'games'] ['fighters', 'games'] """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) lt_payload: dict[str, Any] = {"listTables": {}} logger.info("listTables") lt_response = await driver_commander.async_request( payload=lt_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "tables" not in lt_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listTables API command.", raw_response=lt_response, ) else: logger.info("finished listTables") return lt_response["status"]["tables"] # type: ignore[no-any-return] async def command( self, body: dict[str, Any], *, keyspace: str | None | UnsetType = _UNSET, collection_or_table_name: str | None = None, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this database with an arbitrary, caller-provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. keyspace: the keyspace to use, if any. If a keyspace is employed, it is used to construct the full request URL. To run a command targeting no specific keyspace (rather, the database as a whole), pass an explicit `None`: the request URL will lack the suffix "/<keyspace>" component. If unspecified, the working keyspace of this database is used. If another keyspace is passed, it will be used instead of the database's working one. collection_or_table_name: if provided, the name is appended at the end of the endpoint. In this way, this method allows collection- and table-level arbitrary POST requests as well. This parameter cannot be used if `keyspace=None` is explicitly provided. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> # NOTE: may require slight adaptation to an async context. 
>>> >>> my_db.command({"findCollections": {}}) {'status': {'collections': ['my_coll']}} >>> my_db.command({"countDocuments": {}}, collection_or_table_name="my_coll") {'status': {'count': 123}} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace: str | None if keyspace is None: if collection_or_table_name is not None: raise ValueError( "Cannot pass collection_or_table_name to database " "`command` on a no-keyspace command" ) _keyspace = None else: if isinstance(keyspace, UnsetType): _keyspace = self.keyspace else: _keyspace = keyspace # build the ad-hoc-commander path with _keyspace and the coll.or.table base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.data_api_url_options.api_path, self.api_options.data_api_url_options.api_version, _keyspace, collection_or_table_name, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" command_commander = APICommander( api_endpoint=self.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, ) _cmd_desc = ",".join(sorted(body.keys())) logger.info(f"command={_cmd_desc} on {self.__class__.__name__}") req_response = await command_commander.async_request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"command={_cmd_desc} on {self.__class__.__name__}") return req_response def get_database_admin( self, *, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> DatabaseAdmin: """ Return a DatabaseAdmin object corresponding to this database, for use in admin tasks such as managing keyspaces. This method, depending on the environment where the database resides, returns an appropriate subclass of DatabaseAdmin. Args: token: an access token with enough permission on the database to perform the desired tasks. If omitted (as it can generally be done), the token of this Database is used. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: A DatabaseAdmin instance targeting this database. More precisely, for Astra DB an instance of `AstraDBDatabaseAdmin` is returned; for other environments, an instance of `DataAPIDatabaseAdmin` is returned. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> my_db_admin = async_database.get_database_admin() >>> if "new_keyspace" not in my_db_admin.list_keyspaces(): ... 
my_db_admin.create_keyspace("new_keyspace") >>> my_db_admin.list_keyspaces() ['default_keyspace', 'new_keyspace'] """ # lazy importing here to avoid circular dependency from astrapy.admin.admin import AstraDBDatabaseAdmin, DataAPIDatabaseAdmin arg_api_options = APIOptions( token=token, ) api_options = self.api_options.with_override(spawn_api_options).with_override( arg_api_options ) if api_options.environment in Environment.astra_db_values: return AstraDBDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=api_options, spawner_database=self, ) else: return DataAPIDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=api_options, spawner_database=self, )
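The environment-based dispatch at the end of get_database_admin means calling code can stay agnostic of the concrete admin class. A minimal sketch, assuming a reachable database and a token with sufficient admin permissions (the token, endpoint and keyspace name are placeholders):

import asyncio

from astrapy import DataAPIClient


async def ensure_keyspace(keyspace_name: str) -> None:
    # Placeholder credentials/endpoint; adapt to the target deployment.
    client = DataAPIClient("AstraCS:...")
    async_database = client.get_async_database("https://01234567-...-us-east-2.apps.astra.datastax.com")
    # On Astra DB this is an AstraDBDatabaseAdmin; in other environments
    # a DataAPIDatabaseAdmin. Both expose the keyspace methods used here.
    db_admin = async_database.get_database_admin()
    if keyspace_name not in db_admin.list_keyspaces():
        db_admin.create_keyspace(keyspace_name)


asyncio.run(ensure_keyspace("new_keyspace"))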
Instance variables
var id : str
-
The ID of this database.
Example
>>> my_async_database.id '01234567-89ab-cdef-0123-456789abcdef'
Expand source code
@property def id(self) -> str: """ The ID of this database. Example: >>> my_async_database.id '01234567-89ab-cdef-0123-456789abcdef' """ parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is not None: return parsed_api_endpoint.database_id else: raise DevOpsAPIException( "Database is not in a supported environment for this operation." )
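Since the ID is parsed out of the API endpoint, accessing it outside a supported (Astra-like) environment raises DevOpsAPIException, as the source above shows. A small illustrative sketch of defensive access (the helper name is hypothetical):

from astrapy.exceptions import DevOpsAPIException


def describe_database(async_database) -> str:
    # Fall back to the raw endpoint when no database ID can be parsed,
    # e.g. for non-Astra deployments.
    try:
        return f"database {async_database.id}"
    except DevOpsAPIException:
        return f"database at {async_database.api_endpoint}"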
var keyspace : str | None
-
The keyspace this database uses as target for all commands when no method-call-specific keyspace is specified.
Returns
the working keyspace (a string), or None if not set.
Example
>>> async_database.keyspace 'the_keyspace'
Expand source code
@property def keyspace(self) -> str | None: """ The keyspace this database uses as target for all commands when no method-call-specific keyspace is specified. Returns: the working keyspace (a string), or None if not set. Example: >>> async_database.keyspace 'the_keyspace' """ return self._using_keyspace
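Methods that require a working keyspace (such as get_collection and get_table) raise a ValueError when this property is None, suggesting use_keyspace. A brief sketch, assuming a database possibly spawned without a keyspace (helper and fallback names are illustrative):

def ensure_working_keyspace(async_database, fallback: str = "default_keyspace") -> None:
    # Set a working keyspace only if none is configured yet, so that
    # later get_collection() / get_table() calls do not raise ValueError.
    if async_database.keyspace is None:
        async_database.use_keyspace(fallback)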
var region : str
-
The region where this database is located.
The region is still well defined in case of multi-region databases, since a Database instance connects to exactly one of the regions (as specified by the API Endpoint).
Example
>>> my_async_database.region 'us-west-2'
Expand source code
@property def region(self) -> str: """ The region where this database is located. The region is still well defined in case of multi-region databases, since a Database instance connects to exactly one of the regions (as specified by the API Endpoint). Example: >>> my_async_database.region 'us-west-2' """ parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is not None: return parsed_api_endpoint.region else: raise DevOpsAPIException( "Database is not in a supported environment for this operation." )
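As with id, the region is recovered by parsing the API endpoint, so it is only available for endpoints in a supported format. A tiny illustrative helper combining the two properties:

def locate(async_database) -> str:
    # Raises DevOpsAPIException if the endpoint cannot be parsed.
    return f"{async_database.id} @ {async_database.region}"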
Methods
async def command(self, body: dict[str, Any], *, keyspace: str | None | UnsetType = (unset), collection_or_table_name: str | None = None, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Send a POST request to the Data API for this database with an arbitrary, caller-provided payload.
Args
body
- a JSON-serializable dictionary, the payload of the request.
keyspace
- the keyspace to use, if any. If a keyspace is employed, it is used to construct the full request URL. To run a command targeting no specific keyspace (rather, the database as a whole), pass an explicit None: the request URL will lack the suffix "/<keyspace>" component. If unspecified, the working keyspace of this database is used. If another keyspace is passed, it will be used instead of the database's working one.
collection_or_table_name
- if provided, the name is appended at the end of the endpoint. In this way, this method allows collection- and table-level arbitrary POST requests as well. This parameter cannot be used if keyspace=None is explicitly provided.
raise_api_errors
- if True, responses with a nonempty 'errors' field result in an astrapy exception being raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a dictionary with the response of the HTTP request.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> my_db.command({"findCollections": {}}) {'status': {'collections': ['my_coll']}} >>> my_db.command({"countDocuments": {}}, collection_or_table_name="my_coll") {'status': {'count': 123}}
Expand source code
async def command( self, body: dict[str, Any], *, keyspace: str | None | UnsetType = _UNSET, collection_or_table_name: str | None = None, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this database with an arbitrary, caller-provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. keyspace: the keyspace to use, if any. If a keyspace is employed, it is used to construct the full request URL. To run a command targeting no specific keyspace (rather, the database as a whole), pass an explicit `None`: the request URL will lack the suffix "/<keyspace>" component. If unspecified, the working keyspace of this database is used. If another keyspace is passed, it will be used instead of the database's working one. collection_or_table_name: if provided, the name is appended at the end of the endpoint. In this way, this method allows collection- and table-level arbitrary POST requests as well. This parameter cannot be used if `keyspace=None` is explicitly provided. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> my_db.command({"findCollections": {}}) {'status': {'collections': ['my_coll']}} >>> my_db.command({"countDocuments": {}}, collection_or_table_name="my_coll") {'status': {'count': 123}} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace: str | None if keyspace is None: if collection_or_table_name is not None: raise ValueError( "Cannot pass collection_or_table_name to database " "`command` on a no-keyspace command" ) _keyspace = None else: if isinstance(keyspace, UnsetType): _keyspace = self.keyspace else: _keyspace = keyspace # build the ad-hoc-commander path with _keyspace and the coll.or.table base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.data_api_url_options.api_path, self.api_options.data_api_url_options.api_version, _keyspace, collection_or_table_name, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" command_commander = APICommander( api_endpoint=self.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, ) _cmd_desc = ",".join(sorted(body.keys())) logger.info(f"command={_cmd_desc} on {self.__class__.__name__}") req_response = await command_commander.async_request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on {self.__class__.__name__}") return req_response
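As the docstring notes, an explicit keyspace=None drops the "/<keyspace>" component from the request URL, targeting the database as a whole. A hedged sketch (findKeyspaces is a Data API command, but its availability may depend on the deployment):

import asyncio


async def probe(async_database) -> None:
    # Keyspace-level command: runs against the database's working keyspace.
    colls = await async_database.command({"findCollections": {}})
    # Database-level command: explicit None removes the keyspace URL component.
    kss = await async_database.command({"findKeyspaces": {}}, keyspace=None)
    print(colls, kss)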
async def create_collection(self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, document_type: type[Any] = dict[str, typing.Any], keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncCollection[DOC]
-
Creates a collection on the database and returns the AsyncCollection instance that represents it.
This is a blocking operation: the method returns when the collection is ready to be used. As opposed to the get_collection method, this method causes the collection to be actually created on the database.
Args
name
- the name of the collection.
definition
- a complete definition for the collection. This can be an instance of CollectionDefinition or an equivalent (nested) dictionary, in which case it will be parsed into a CollectionDefinition. See the CollectionDefinition class and the AsyncCollection class for more details and ways to construct this object.
document_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting AsyncCollection is implicitly an AsyncCollection[dict[str, Any]]. If provided, it must match the type hint specified in the assignment. See the examples below.
keyspace
- the keyspace where the collection is to be created. If not specified, the general setting for this database is used.
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply.
embedding_api_key
- optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
an AsyncCollection instance, representing the newly-created collection.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition, ... )) >>> >>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ... ) >>> my_collection_1 = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition_1, ... )) >>> >>> >>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition_2, ... ))
Expand source code
async def create_collection( self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, document_type: type[Any] = DefaultDocumentType, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: """ Creates a collection on the database and returns the AsyncCollection instance that represents it. This is a blocking operation: the method returns when the collection is ready to be used. As opposed to the `get_collection` method, this method causes the collection to be actually created on the database. Args: name: the name of the collection. definition: a complete definition for the collection. This can be an instance of `CollectionDefinition` or an equivalent (nested) dictionary, in which case it will be parsed into a `CollectionDefinition`. See the `astrapy.info.CollectionDefinition` class and the `AsyncCollection` class for more details and ways to construct this object. document_type: this parameter acts as a formal specifier for the type checker. If omitted, the resulting AsyncCollection is implicitly an `AsyncCollection[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace where the collection is to be created. If not specified, the general setting for this database is used. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: an `AsyncCollection` instance, representing the newly-created collection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition, ... )) >>> >>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ...
) >>> my_collection_1 = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition_1, ... )) >>> >>> >>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = asyncio.run(async_database.create_collection( ... "my_events", ... definition=collection_definition_2, ... )) """ cc_definition: dict[str, Any] = CollectionDefinition.coerce( definition or {} ).as_dict() if collection_admin_timeout_ms is not None: _collection_admin_timeout_ms = collection_admin_timeout_ms _ca_label = "collection_admin_timeout_ms" else: _collection_admin_timeout_ms = ( self.api_options.timeout_options.collection_admin_timeout_ms ) _ca_label = "collection_admin_timeout_ms" driver_commander = self._get_driver_commander(keyspace=keyspace) cc_payload = { "createCollection": { k: v for k, v in { "name": name, "options": cc_definition, }.items() if v is not None if v != {} } } logger.info(f"createCollection('{name}')") cc_response = await driver_commander.async_request( payload=cc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if cc_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from createCollection API command.", raw_response=cc_response, ) logger.info(f"finished createCollection('{name}')") return self.get_collection( name, document_type=document_type, keyspace=keyspace, embedding_api_key=embedding_api_key, spawn_api_options=spawn_api_options, )
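The document_type parameter exists purely for the type checker: no runtime validation occurs. A short sketch of spawning a typed collection, mirroring the row_type table example elsewhere in this module (the alias name is illustrative):

import asyncio

from astrapy import AsyncCollection

MyEventType = dict[str, str]


async def make_typed_collection(async_database) -> AsyncCollection[MyEventType]:
    # The assignment's type hint and document_type must agree;
    # the hint is not enforced against the stored documents.
    typed_collection: AsyncCollection[MyEventType] = await async_database.create_collection(
        "my_events",
        document_type=MyEventType,
    )
    return typed_collection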
async def create_table(self, name: str, *, definition: CreateTableDefinition | dict[str, Any], row_type: type[Any] = dict[str, typing.Any], keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncTable[ROW]
-
Creates a table on the database and returns the AsyncTable instance that represents it.
This is a blocking operation: the method returns when the table is ready to be used. As opposed to the get_table method call, this method causes the table to be actually created on the database.
Args
name
- the name of the table.
definition
- a complete definition for the table. This can be an instance of CreateTableDefinition or an equivalent (nested) dictionary, in which case it will be parsed into a CreateTableDefinition. See the CreateTableDefinition class and the AsyncTable class for more details and ways to construct this object.
row_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting AsyncTable is implicitly an AsyncTable[dict[str, Any]]. If provided, it must match the type hint specified in the assignment. See the examples below.
keyspace
- the keyspace where the table is to be created. If not specified, the general setting for this database is used.
if_not_exists
- if set to True, the command will succeed even if a table with the specified name already exists (in which case no actual table creation takes place on the database). Defaults to False, i.e. an error is raised by the API in case of table-name collision.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
embedding_api_key
- optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
an AsyncTable instance, representing the newly-created table.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> # Create a table using the fluent syntax for definition >>> from astrapy.constants import SortMode >>> from astrapy.info import ( ... CreateTableDefinition, ... ColumnType, ... ) >>> table_definition = ( ... CreateTableDefinition.builder() ... .add_column("match_id", ColumnType.TEXT) ... .add_column("round", ColumnType.INT) ... .add_vector_column("m_vector", dimension=3) ... .add_column("score", ColumnType.INT) ... .add_column("when", ColumnType.TIMESTAMP) ... .add_column("winner", ColumnType.TEXT) ... .add_set_column("fighters", ColumnType.UUID) ... .add_partition_by(["match_id"]) ... .add_partition_sort({"round": SortMode.ASCENDING}) ... .build() ... ) >>> my_async_table = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition, ... )) >>> >>> # Create a table with the definition as object >>> # (and do not raise an error if the table exists already) >>> from astrapy.info import ( ... CreateTableDefinition, ... TablePrimaryKeyDescriptor, ... TableScalarColumnTypeDescriptor, ... TableValuedColumnType, ... TableValuedColumnTypeDescriptor, ... TableVectorColumnTypeDescriptor, ... ) >>> table_definition_1 = CreateTableDefinition( ... columns={ ... "match_id": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "round": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "m_vector": TableVectorColumnTypeDescriptor( ... column_type="vector", dimension=3 ... ), ... "score": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "when": TableScalarColumnTypeDescriptor( ... ColumnType.TIMESTAMP, ... ), ... "winner": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "fighters": TableValuedColumnTypeDescriptor( ... column_type=TableValuedColumnType.SET, ... value_type=ColumnType.UUID, ... ), ... }, ... primary_key=TablePrimaryKeyDescriptor( ... partition_by=["match_id"], ... partition_sort={"round": SortMode.ASCENDING}, ... ), ... ) >>> my_async_table_1 = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition_1, ... if_not_exists=True, ... )) >>> >>> # Create a table with the definition as plain dictionary >>> # (and do not raise an error if the table exists already) >>> table_definition_2 = { ... "columns": { ... "match_id": {"type": "text"}, ... "round": {"type": "int"}, ... "m_vector": {"type": "vector", "dimension": 3}, ... "score": {"type": "int"}, ... "when": {"type": "timestamp"}, ... "winner": {"type": "text"}, ... "fighters": {"type": "set", "valueType": "uuid"}, ... }, ... "primaryKey": { ... "partitionBy": ["match_id"], ... "partitionSort": {"round": 1}, ... }, ... } >>> my_async_table_2 = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition_2, ... if_not_exists=True, ... ))
Expand source code
async def create_table( self, name: str, *, definition: CreateTableDefinition | dict[str, Any], row_type: type[Any] = DefaultRowType, keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: """ Creates a table on the database and return the AsyncTable instance that represents it. This is a blocking operation: the method returns when the table is ready to be used. As opposed to the `get_table` method call, this method causes the table to be actually created on DB. Args: name: the name of the table. definition: a complete table definition for the table. This can be an instance of `CreateTableDefinition` or an equivalent (nested) dictionary, in which case it will be parsed into a `CreateTableDefinition`. See the `astrapy.info.CreateTableDefinition` class and the `AsyncTable` class for more details and ways to construct this object. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting AsyncTable is implicitly an `AsyncTable[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace where the table is to be created. If not specified, the general setting for this database is used. if_not_exists: if set to True, the command will succeed even if a table with the specified name already exists (in which case no actual table creation takes place on the database). Defaults to False, i.e. an error is raised by the API in case of table-name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: an `AsyncTable` instance, representing the newly-created table. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Create a table using the fluent syntax for definition >>> from astrapy.constants import SortMode >>> from astrapy.info import ( ... CreateTableDefinition, ... ColumnType, ... ) >>> table_definition = ( ... CreateTableDefinition.builder() ... .add_column("match_id", ColumnType.TEXT) ... .add_column("round", ColumnType.INT) ... .add_vector_column("m_vector", dimension=3) ... .add_column("score", ColumnType.INT) ... 
.add_column("when", ColumnType.TIMESTAMP) ... .add_column("winner", ColumnType.TEXT) ... .add_set_column("fighters", ColumnType.UUID) ... .add_partition_by(["match_id"]) ... .add_partition_sort({"round": SortMode.ASCENDING}) ... .build() ... ) >>> my_async_table = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition, ... )) >>> >>> # Create a table with the definition as object >>> # (and do not raise an error if the table exists already) >>> from astrapy.info import ( ... CreateTableDefinition, ... TablePrimaryKeyDescriptor, ... TableScalarColumnTypeDescriptor, ... TableValuedColumnType, ... TableValuedColumnTypeDescriptor, ... TableVectorColumnTypeDescriptor, ... ) >>> table_definition_1 = CreateTableDefinition( ... columns={ ... "match_id": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "round": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "m_vector": TableVectorColumnTypeDescriptor( ... column_type="vector", dimension=3 ... ), ... "score": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "when": TableScalarColumnTypeDescriptor( ... ColumnType.TIMESTAMP, ... ), ... "winner": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "fighters": TableValuedColumnTypeDescriptor( ... column_type=TableValuedColumnType.SET, ... value_type=ColumnType.UUID, ... ), ... }, ... primary_key=TablePrimaryKeyDescriptor( ... partition_by=["match_id"], ... partition_sort={"round": SortMode.ASCENDING}, ... ), ... ) >>> my_async_table_1 = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition_1, ... if_not_exists=True, ... )) >>> >>> # Create a table with the definition as plain dictionary >>> # (and do not raise an error if the table exists already) >>> table_definition_2 = { ... "columns": { ... "match_id": {"type": "text"}, ... "round": {"type": "int"}, ... "m_vector": {"type": "vector", "dimension": 3}, ... "score": {"type": "int"}, ... "when": {"type": "timestamp"}, ... "winner": {"type": "text"}, ... "fighters": {"type": "set", "valueType": "uuid"}, ... }, ... "primaryKey": { ... "partitionBy": ["match_id"], ... "partitionSort": {"round": 1}, ... }, ... } >>> my_async_table_2 = asyncio.run(async_database.create_table( ... "games", ... definition=table_definition_2, ... if_not_exists=True, ... 
)) """ ct_options: dict[str, bool] if if_not_exists is not None: ct_options = {"ifNotExists": if_not_exists} else: ct_options = {} ct_definition: dict[str, Any] = CreateTableDefinition.coerce( definition ).as_dict() _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) ct_payload = { "createTable": { k: v for k, v in { "name": name, "definition": ct_definition, "options": ct_options, }.items() if v is not None if v != {} } } logger.info(f"createTable('{name}')") ct_response = await driver_commander.async_request( payload=ct_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if ct_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from createTable API command.", raw_response=ct_response, ) logger.info(f"finished createTable('{name}')") return self.get_table( name, row_type=row_type, keyspace=keyspace, embedding_api_key=embedding_api_key, spawn_api_options=spawn_api_options, )
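As a complement to the examples above, here is a minimal sketch of pairing create_table with a custom row_type; the GameRow TypedDict is hypothetical, invented only for illustration, while "games" and table_definition are those of the preceding example:
>>> from typing import TypedDict
>>> from astrapy import AsyncTable
>>> class GameRow(TypedDict, total=False):
...     match_id: str  # hypothetical row shape, for illustration only
...     round: int
...     score: int
...
>>> my_typed_async_table: AsyncTable[GameRow] = asyncio.run(
...     async_database.create_table(
...         "games",
...         definition=table_definition,
...         row_type=GameRow,
...         if_not_exists=True,
...     )
... )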
async def drop_collection(self, name: str, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Drop a collection from the database, along with all documents therein.
Args
name
- the name of the collection to drop.
keyspace
- the keyspace where the collection resides. If not specified, the database working keyspace is assumed.
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for collection_admin_timeout_ms.
timeout_ms
- an alias for collection_admin_timeout_ms.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(async_database.list_collection_names())
['a_collection', 'my_v_col', 'another_col']
>>> asyncio.run(async_database.drop_collection("my_v_col"))
>>> asyncio.run(async_database.list_collection_names())
['a_collection', 'another_col']
Expand source code
async def drop_collection(
    self,
    name: str,
    *,
    keyspace: str | None = None,
    collection_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> dict[str, Any]:
    """
    Drop a collection from the database, along with all documents therein.

    Args:
        name: the name of the collection to drop.
        keyspace: the keyspace where the collection resides. If not specified,
            the database working keyspace is assumed.
        collection_admin_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request,
            hence all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `collection_admin_timeout_ms`.
        timeout_ms: an alias for `collection_admin_timeout_ms`.

    Example:
        >>> # NOTE: may require slight adaptation to an async context.
        >>>
        >>> asyncio.run(async_database.list_collection_names())
        ['a_collection', 'my_v_col', 'another_col']
        >>> asyncio.run(async_database.drop_collection("my_v_col"))
        >>> asyncio.run(async_database.list_collection_names())
        ['a_collection', 'another_col']
    """
    _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca(
        timeout_options=self.api_options.timeout_options,
        collection_admin_timeout_ms=collection_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    _keyspace = keyspace or self.keyspace
    driver_commander = self._get_driver_commander(keyspace=_keyspace)
    dc_payload = {"deleteCollection": {"name": name}}
    logger.info(f"deleteCollection('{name}')")
    dc_response = await driver_commander.async_request(
        payload=dc_payload,
        timeout_context=_TimeoutContext(
            request_ms=_collection_admin_timeout_ms, label=_ca_label
        ),
    )
    if dc_response.get("status") != {"ok": 1}:
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from deleteCollection API command.",
            raw_response=dc_response,
        )
    logger.info(f"finished deleteCollection('{name}')")
    return dc_response.get("status", {})  # type: ignore[no-any-return]
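As the source above shows, the returned dictionary is the "status" portion of the API response, and a successful drop can only return {'ok': 1}; as a small usage sketch:
>>> status = asyncio.run(async_database.drop_collection("my_v_col"))
>>> status
{'ok': 1}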
async def drop_table(self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Drop a table from the database, along with all rows therein and related indexes.
Args
name
- the name of the table to drop.
keyspace
- the keyspace where the table resides. If not specified, the database working keyspace is assumed.
if_exists
- if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(async_database.list_table_names())
['fighters', 'games']
>>> asyncio.run(async_database.drop_table("fighters"))
>>> asyncio.run(async_database.list_table_names())
['games']
>>> # not erroring because of if_exists:
>>> asyncio.run(async_database.drop_table("fighters", if_exists=True))
Expand source code
async def drop_table(
    self,
    name: str,
    *,
    keyspace: str | None = None,
    if_exists: bool | None = None,
    table_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> dict[str, Any]:
    """
    Drop a table from the database, along with all rows therein
    and related indexes.

    Args:
        name: the name of the table to drop.
        keyspace: the keyspace where the table resides. If not specified,
            the database working keyspace is assumed.
        if_exists: if passed as True, trying to drop a non-existing table
            will not error, just silently do nothing instead. If not provided,
            the API default behaviour will hold.
        table_admin_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request,
            hence all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `table_admin_timeout_ms`.
        timeout_ms: an alias for `table_admin_timeout_ms`.

    Example:
        >>> # NOTE: may require slight adaptation to an async context.
        >>>
        >>> asyncio.run(async_database.list_table_names())
        ['fighters', 'games']
        >>> asyncio.run(async_database.drop_table("fighters"))
        >>> asyncio.run(async_database.list_table_names())
        ['games']
        >>> # not erroring because of if_exists:
        >>> asyncio.run(async_database.drop_table("fighters", if_exists=True))
    """
    _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta(
        timeout_options=self.api_options.timeout_options,
        table_admin_timeout_ms=table_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    _keyspace = keyspace or self.keyspace
    dt_options: dict[str, bool]
    if if_exists is not None:
        dt_options = {"ifExists": if_exists}
    else:
        dt_options = {}
    driver_commander = self._get_driver_commander(keyspace=_keyspace)
    dt_payload = {
        "dropTable": {
            k: v
            for k, v in {
                "name": name,
                "options": dt_options,
            }.items()
            if v is not None
            if v != {}
        }
    }
    logger.info(f"dropTable('{name}')")
    dt_response = await driver_commander.async_request(
        payload=dt_payload,
        timeout_context=_TimeoutContext(
            request_ms=_table_admin_timeout_ms, label=_ta_label
        ),
    )
    if dt_response.get("status") != {"ok": 1}:
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from dropTable API command.",
            raw_response=dt_response,
        )
    logger.info(f"finished dropTable('{name}')")
    return dt_response.get("status", {})  # type: ignore[no-any-return]
async def drop_table_index(self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drops (deletes) an index (of any kind) from the table it is associated to.
This is a blocking operation: the method returns once the index is deleted.
Note
Although associated to a table, index names are unique across a keyspace. For this reason, no table name is required in this call.
Args
name
- the name of the index.
keyspace
- the keyspace to which the index belongs. If not specified, the general setting for this database is used.
if_exists
- if passed as True, trying to drop a non-existing index will not error, just silently do nothing instead. If not provided, the API default behaviour will hold.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> # Drop an index from the keyspace:
>>> await async_database.drop_table_index("score_index")
>>> # Drop an index, unless it does not exist already:
>>> await async_database.drop_table_index("score_index", if_exists=True)
Expand source code
async def drop_table_index( self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drops (deletes) an index (of any kind) from the table it is associated to. This is a blocking operation: the method returns once the index is deleted. Note: Although associated to a table, index names are unique across a keyspace. For this reason, no table name is required in this call. Args: name: the name of the index. keyspace: the keyspace to which the index belongs. If not specified, the general setting for this database is used. if_exists: if passed as True, trying to drop a non-existing index will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Drop an index from the keyspace: >>> await async_database.drop_table_index("score_index") >>> # Drop an index, unless it does not exist already: >>> await async_database.drop_table_index("score_index", if_exists=True) """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) di_options: dict[str, bool] if if_exists is not None: di_options = {"ifExists": if_exists} else: di_options = {} di_payload = { "dropIndex": { k: v for k, v in { "name": name, "options": di_options, }.items() if v is not None if v != {} } } driver_commander = self._get_driver_commander(keyspace=keyspace) logger.info(f"dropIndex('{name}')") di_response = await driver_commander.async_request( payload=di_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if di_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from dropIndex API command.", raw_response=di_response, ) logger.info(f"finished dropIndex('{name}')")
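A hedged sketch of guarding against a missing index without if_exists; this assumes the API-reported failure surfaces as a DataAPIResponseException from astrapy.exceptions, which may be broader than the exact error raised:
>>> from astrapy.exceptions import DataAPIResponseException
>>> async def drop_index_if_any(index_name: str) -> None:
...     try:
...         await async_database.drop_table_index(index_name)
...     except DataAPIResponseException:
...         pass  # assumed: the index did not exist
...
>>> asyncio.run(drop_index_if_any("score_index"))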
def get_collection(self, name: str, *, document_type: type[Any] = dict[str, typing.Any], keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncCollection[DOC]
-
Spawn an AsyncCollection object instance representing a collection on this database.
Creating an AsyncCollection instance does not have any effect on the actual state of the database: in other words, for the created AsyncCollection instance to be used meaningfully, the collection must exist already (for instance, it should have been created previously by calling the create_collection method).
Args
name
- the name of the collection.
document_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting AsyncCollection is implicitly an AsyncCollection[dict[str, Any]]. If provided, it must match the type hint specified in the assignment. See the examples below.
keyspace
- the keyspace containing the collection. If no keyspace is specified, the setting for this database is used.
embedding_api_key
- optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
an AsyncCollection instance, representing the desired collection (but without any form of validation).
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def count_docs(adb: AsyncDatabase, c_name: str) -> int:
...     async_col = adb.get_collection(c_name)
...     return await async_col.count_documents({}, upper_bound=100)
...
>>> asyncio.run(count_docs(async_database, "my_collection"))
45
Note: the attribute and indexing syntax forms achieve the same effect as this method, returning an AsyncCollection. In other words, the following are equivalent:
    async_database.get_collection("coll_name")
    async_database.coll_name
    async_database["coll_name"]
Expand source code
def get_collection( self, name: str, *, document_type: type[Any] = DefaultDocumentType, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: """ Spawn an `AsyncCollection` object instance representing a collection on this database. Creating an `AsyncCollection` instance does not have any effect on the actual state of the database: in other words, for the created `AsyncCollection` instance to be used meaningfully, the collection must exist already (for instance, it should have been created previously by calling the `create_collection` method). Args: name: the name of the collection. document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting AsyncCollection is implicitly an `AsyncCollection[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace containing the collection. If no keyspace is specified, the setting for this database is used. embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: an `AsyncCollection` instance, representing the desired collection (but without any form of validation). Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def count_docs(adb: AsyncDatabase, c_name: str) -> int: ... async_col = adb.get_collection(c_name) ... return await async_col.count_documents({}, upper_bound=100) ... >>> asyncio.run(count_docs(async_database, "my_collection")) 45 Note: the attribute and indexing syntax forms achieve the same effect as this method, returning an AsyncCollection. In other words, the following are equivalent: async_database.get_collection("coll_name") async_database.coll_name async_database["coll_name"] """ # lazy importing here against circular-import error from astrapy.collection import AsyncCollection resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( embedding_api_key=embedding_api_key, ), ) _keyspace = keyspace or self.keyspace if _keyspace is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return AsyncCollection( database=self, name=name, keyspace=_keyspace, api_options=resulting_api_options, )
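Since the document_type description above defers to examples, here is a minimal sketch; the UserDoc TypedDict is hypothetical, introduced only for illustration:
>>> from typing import TypedDict
>>> from astrapy import AsyncCollection
>>> class UserDoc(TypedDict, total=False):
...     name: str  # hypothetical document shape, for illustration only
...     score: int
...
>>> my_typed_col: AsyncCollection[UserDoc] = async_database.get_collection(
...     "my_collection",
...     document_type=UserDoc,
... )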
def get_database_admin(self, *, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> DatabaseAdmin
-
Return a DatabaseAdmin object corresponding to this database, for use in admin tasks such as managing keyspaces.
This method, depending on the environment where the database resides, returns an appropriate subclass of DatabaseAdmin.
Args
token
- an access token with enough permission on the database to perform the desired tasks. If omitted (as it can generally be done), the token of this Database is used. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings.
Returns
A DatabaseAdmin instance targeting this database. More precisely, for Astra DB an instance of AstraDBDatabaseAdmin is returned; for other environments, an instance of DataAPIDatabaseAdmin is returned.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> my_db_admin = async_database.get_database_admin()
>>> if "new_keyspace" not in my_db_admin.list_keyspaces():
...     my_db_admin.create_keyspace("new_keyspace")
>>> my_db_admin.list_keyspaces()
['default_keyspace', 'new_keyspace']
Expand source code
def get_database_admin( self, *, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> DatabaseAdmin: """ Return a DatabaseAdmin object corresponding to this database, for use in admin tasks such as managing keyspaces. This method, depending on the environment where the database resides, returns an appropriate subclass of DatabaseAdmin. Args: token: an access token with enough permission on the database to perform the desired tasks. If omitted (as it can generally be done), the token of this Database is used. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: A DatabaseAdmin instance targeting this database. More precisely, for Astra DB an instance of `AstraDBDatabaseAdmin` is returned; for other environments, an instance of `DataAPIDatabaseAdmin` is returned. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> my_db_admin = async_database.get_database_admin() >>> if "new_keyspace" not in my_db_admin.list_keyspaces(): ... my_db_admin.create_keyspace("new_keyspace") >>> my_db_admin.list_keyspaces() ['default_keyspace', 'new_keyspace'] """ # lazy importing here to avoid circular dependency from astrapy.admin.admin import AstraDBDatabaseAdmin, DataAPIDatabaseAdmin arg_api_options = APIOptions( token=token, ) api_options = self.api_options.with_override(spawn_api_options).with_override( arg_api_options ) if api_options.environment in Environment.astra_db_values: return AstraDBDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=api_options, spawner_database=self, ) else: return DataAPIDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=api_options, spawner_database=self, )
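A short sketch of explicitly passing a more-privileged token, assuming the StaticTokenProvider wrapper from astrapy.authentication (a TokenProvider subclass wrapping a literal token string):
>>> from astrapy.authentication import StaticTokenProvider
>>> my_db_admin = async_database.get_database_admin(
...     token=StaticTokenProvider("AstraCS:aMoreP0werfulT0ken..."),
... )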
def get_table(self, name: str, *, row_type: type[Any] = dict[str, typing.Any], keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncTable[ROW]
-
Spawn an AsyncTable object instance representing a table on this database.
Creating an AsyncTable instance does not have any effect on the actual state of the database: in other words, for the created AsyncTable instance to be used meaningfully, the table must exist already (for instance, it should have been created previously by calling the create_table method).
Args
name
- the name of the table.
row_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting AsyncTable is implicitly an AsyncTable[dict[str, Any]]. If provided, it must match the type hint specified in the assignment. See the examples below.
keyspace
- the keyspace containing the table. If no keyspace is specified, the general setting for this database is used.
embedding_api_key
- optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
an AsyncTable instance, representing the desired table (but without any form of validation).
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> # Get an AsyncTable object (and read a property of it as an example):
>>> my_async_table = async_database.get_table("games")
>>> my_async_table.full_name
'default_keyspace.games'
>>>
>>> # Get an AsyncTable object in a specific keyspace,
>>> # and set an embedding API key to it:
>>> my_other_async_table = async_database.get_table(
...     "tournaments",
...     keyspace="the_other_keyspace",
...     embedding_api_key="secret-012abc...",
... )
>>> from astrapy import AsyncTable
>>> MyCustomDictType = dict[str, int]
>>>
>>> # Get an AsyncTable object typed with a specific type for its rows:
>>> my_typed_async_table: AsyncTable[MyCustomDictType] = async_database.get_table(
...     "games",
...     row_type=MyCustomDictType,
... )
Expand source code
def get_table( self, name: str, *, row_type: type[Any] = DefaultRowType, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: """ Spawn an `AsyncTable` object instance representing a table on this database. Creating a `AsyncTable` instance does not have any effect on the actual state of the database: in other words, for the created `AsyncTable` instance to be used meaningfully, the table must exist already (for instance, it should have been created previously by calling the `create_table` method). Args: name: the name of the table. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting AsyncTable is implicitly an `AsyncTable[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace containing the table. If no keyspace is specified, the general setting for this database is used. embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: an `AsyncTable` instance, representing the desired table (but without any form of validation). Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Get an AsyncTable object (and read a property of it as an example): >>> my_async_table = async_database.get_table("games") >>> my_async_table.full_name 'default_keyspace.games' >>> >>> # Get an AsyncTable object in a specific keyspace, >>> # and set an embedding API key to it: >>> my_other_async_table = async_database.get_table( ... "tournaments", ... keyspace="the_other_keyspace", ... embedding_api_key="secret-012abc...", ... ) >>> from astrapy import AsyncTable >>> MyCustomDictType = dict[str, int] >>> >>> # Get an AsyncTable object typed with a specific type for its rows: >>> my_typed_async_table: AsyncTable[MyCustomDictType] = async_database.get_table( ... "games", ... row_type=MyCustomDictType, ... ) """ # lazy importing here against circular-import error from astrapy.table import AsyncTable resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( embedding_api_key=embedding_api_key, ), ) _keyspace = keyspace or self.keyspace if _keyspace is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return AsyncTable[ROW]( database=self, name=name, keyspace=_keyspace, api_options=resulting_api_options, )
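For the header-based vectorize authentication mentioned in the Args above, a hedged sketch with one specialized provider; the AWSEmbeddingHeadersProvider class and its parameter names are assumed from astrapy.authentication and may differ across astrapy versions:
>>> from astrapy.authentication import AWSEmbeddingHeadersProvider
>>> my_aws_table = async_database.get_table(
...     "tournaments",
...     embedding_api_key=AWSEmbeddingHeadersProvider(
...         embedding_access_id="MY_ACCESS_ID",  # placeholder credentials
...         embedding_secret_id="MY_SECRET_ID",
...     ),
... )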
async def info(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AstraDBDatabaseInfo
-
Additional information on the database as an AstraDBDatabaseInfo instance.
Some of the returned properties are dynamic throughout the lifetime of the database (such as raw_info["keyspaces"]). For this reason, each invocation of this method triggers a new request to the DevOps API.
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(async_database.info()).region
'eu-west-1'
>>> asyncio.run(
...     async_database.info()
... ).raw_info['datacenters'][0]['dateCreated']
'2023-01-30T12:34:56Z'
Note
see the AstraDBDatabaseInfo documentation for a caveat about the difference between the region and the raw["region"] attributes.
Expand source code
async def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBDatabaseInfo: """ Additional information on the database as a AstraDBDatabaseInfo instance. Some of the returned properties are dynamic throughout the lifetime of the database (such as raw_info["keyspaces"]). For this reason, each invocation of this method triggers a new request to the DevOps API. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(async_database.info()).region 'eu-west-1' >>> asyncio.run( ... async_database.info() ... ).raw_info['datacenters'][0]['dateCreated'] '2023-01-30T12:34:56Z' Note: see the AstraDBDatabaseInfo documentation for a caveat about the difference between the `region` and the `raw["region"]` attributes. """ if self.api_options.environment not in Environment.astra_db_values: raise InvalidEnvironmentException( "Environments outside of Astra DB are not supported." ) _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("getting database info") database_info = await async_fetch_database_info( self.api_endpoint, keyspace=self.keyspace, request_timeout_ms=_database_admin_timeout_ms, api_options=self.api_options, ) if database_info is not None: logger.info("finished getting database info") return database_info else: raise DevOpsAPIException("Failure while fetching database info.")
async def list_collection_names(self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
List the names of all collections in a given keyspace of this database.
Args
keyspace
- the keyspace to be inspected. If not specified, the general setting for this database is assumed.
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for collection_admin_timeout_ms.
timeout_ms
- an alias for collection_admin_timeout_ms.
Returns
a list of the collection names as strings, in no particular order.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(async_database.list_collection_names())
['a_collection', 'another_col']
Expand source code
async def list_collection_names( self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all collections in a given keyspace of this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a list of the collection names as strings, in no particular order. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(async_database.list_collection_names()) ['a_collection', 'another_col'] """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) gc_payload: dict[str, Any] = {"findCollections": {}} logger.info("findCollections") gc_response = await driver_commander.async_request( payload=gc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if "collections" not in gc_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findCollections API command.", raw_response=gc_response, ) else: logger.info("finished findCollections") return gc_response["status"]["collections"] # type: ignore[no-any-return]
async def list_collections(self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[CollectionDescriptor]
-
List all collections in a given keyspace for this database.
Args
keyspace
- the keyspace to be inspected. If not specified, the general setting for this database is assumed.
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for collection_admin_timeout_ms.
timeout_ms
- an alias for collection_admin_timeout_ms.
Returns
a list of CollectionDescriptor instances, one for each collection.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def a_list_colls(adb: AsyncDatabase) -> None:
...     a_coll_list = await adb.list_collections()
...     print("* list:", a_coll_list)
...     for coll in await adb.list_collections():
...         print("* coll:", coll)
...
>>> asyncio.run(a_list_colls(async_database))
* list: [CollectionDescriptor(name='my_v_col', options=CollectionDefinition())]
* coll: CollectionDescriptor(name='my_v_col', options=CollectionDefinition())
Expand source code
async def list_collections( self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[CollectionDescriptor]: """ List all collections in a given keyspace for this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a list of CollectionDescriptor instances one for each collection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def a_list_colls(adb: AsyncDatabase) -> None: ... a_coll_list = await adb.list_collections() ... print("* list:", a_coll_list) ... for coll in await adb.list_collections(): ... print("* coll:", coll) ... >>> asyncio.run(a_list_colls(async_database)) * list: [CollectionDescriptor(name='my_v_col', options=CollectionDefinition())] * coll: CollectionDescriptor(name='my_v_col', options=CollectionDefinition()) """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return await self._list_collections_ctx( keyspace=keyspace, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), )
async def list_table_names(self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
List the names of all tables in a given keyspace of this database.
Args
keyspace
- the keyspace to be inspected. If not specified, the general setting for this database is assumed.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Returns
a list of the table names as strings, in no particular order.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> async def destroy_temp_table(async_db: AsyncDatabase) -> None:
...     print(await async_db.list_table_names())
...     await async_db.drop_table("my_v_tab")
...     print(await async_db.list_table_names())
...
>>> asyncio.run(destroy_temp_table(async_database))
['fighters', 'my_v_tab', 'games']
['fighters', 'games']
Expand source code
async def list_table_names( self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all tables in a given keyspace of this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of the table names as strings, in no particular order. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> async def destroy_temp_table(async_db: AsyncDatabase) -> None: ... print(await async_db.list_table_names()) ... await async_db.drop_table("my_v_tab") ... print(await async_db.list_table_names()) ... >>> asyncio.run(destroy_temp_table(async_database)) ['fighters', 'my_v_tab', 'games'] ['fighters', 'games'] """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) lt_payload: dict[str, Any] = {"listTables": {}} logger.info("listTables") lt_response = await driver_commander.async_request( payload=lt_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "tables" not in lt_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listTables API command.", raw_response=lt_response, ) else: logger.info("finished listTables") return lt_response["status"]["tables"] # type: ignore[no-any-return]
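One pattern this method enables, shown as a small sketch: checking for a table's existence before acting on it.
>>> async def table_exists(adb: AsyncDatabase, t_name: str) -> bool:
...     return t_name in await adb.list_table_names()
...
>>> asyncio.run(table_exists(async_database, "games"))
True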
async def list_tables(self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[ListTableDescriptor]
-
List all tables in a given keyspace for this database.
Args
keyspace
- the keyspace to be inspected. If not specified, the general setting for this database is assumed.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Returns
a list of ListTableDescriptor instances, one for each table.
Example
>>> tables = asyncio.run(my_async_database.list_tables())
>>> tables
[ListTableDescriptor(name='fighters', definition=ListTableDefinition(...
>>> tables[1].name
'games'
>>> tables[1].definition.columns
{'match_id': TableScalarColumnTypeDescriptor(ColumnType.TEXT),...
>>> tables[1].definition.columns['score']
TableScalarColumnTypeDescriptor(ColumnType.INT)
>>> tables[1].definition.primary_key.partition_by
['match_id']
>>> tables[1].definition.primary_key.partition_sort
{'round': 1}
Expand source code
async def list_tables( self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[ListTableDescriptor]: """ List all tables in a given keyspace for this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of ListTableDescriptor instances, one for each table. Example: >>> tables = asyncio.run(my_async_database.list_tables()) >>> tables [ListTableDescriptor(name='fighters', definition=ListTableDefinition(... >>> tables[1].name 'games' >>> tables[1].definition.columns {'match_id': TableScalarColumnTypeDescriptor(ColumnType.TEXT),... >>> tables[1].definition.columns['score'] TableScalarColumnTypeDescriptor(ColumnType.INT) >>> tables[1].definition.primary_key.partition_by ['match_id'] >>> tables[1].definition.primary_key.partition_sort {'round': 1} """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return await self._list_tables_ctx( keyspace=keyspace, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), )
async def name(self) ‑> str
-
The name of this database. Note that this bears no unicity guarantees.
Calling this method the first time involves a request to the DevOps API (the resulting database name is then cached). See the info() method for more details.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(async_database.name())
'the_application_database'
Expand source code
async def name(self) -> str:
    """
    The name of this database. Note that this bears no unicity guarantees.

    Calling this method the first time involves a request
    to the DevOps API (the resulting database name is then cached).
    See the `info()` method for more details.

    Example:
        >>> # NOTE: may require slight adaptation to an async context.
        >>>
        >>> asyncio.run(async_database.name())
        'the_application_database'
    """
    if self._name is None:
        self._name = (await self.info()).name
    return self._name
def to_sync(self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> Database
-
Create a (synchronous) Database from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this database in the copy.
Args
keyspace
- this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set.
token
- an Access Token to the database. Example: "AstraCS:xyz...". This can be either a literal token string or a subclass of TokenProvider.
api_options
- any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
the new copy, a Database instance.
Example
>>> my_sync_db = async_database.to_sync()
>>> my_sync_db.list_collection_names()
['a_collection', 'another_collection']
Expand source code
def to_sync( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a (synchronous) Database from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this database in the copy. Args: keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set. token: an Access Token to the database. Example: "AstraCS:xyz..." This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, a `Database` instance. Example: >>> my_sync_db = async_database.to_sync() >>> my_sync_db.list_collection_names() ['a_collection', 'another_collection'] """ arg_api_options = APIOptions( token=token, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Database( api_endpoint=self.api_endpoint, keyspace=keyspace or self.keyspace, api_options=final_api_options, )
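The conversion also works in the other direction, assuming the synchronous Database exposes the mirror to_async method; a round-trip sketch:
>>> my_sync_db = async_database.to_sync()
>>> async_database_again = my_sync_db.to_async()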
def use_keyspace(self, keyspace: str) ‑> None
-
Switch to a new working keyspace for this database. This method changes (mutates) the AsyncDatabase instance.
Note that this method does not create the keyspace, which should exist already (created for instance with a DatabaseAdmin.async_create_keyspace call).
Args
keyspace
- the new keyspace to use as the database working keyspace.
Returns
None.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(async_database.list_collection_names())
['coll_1', 'coll_2']
>>> async_database.use_keyspace("an_empty_keyspace")
>>> asyncio.run(async_database.list_collection_names())
[]
Expand source code
def use_keyspace(self, keyspace: str) -> None:
    """
    Switch to a new working keyspace for this database.
    This method changes (mutates) the AsyncDatabase instance.

    Note that this method does not create the keyspace, which should exist
    already (created for instance with a `DatabaseAdmin.async_create_keyspace`
    call).

    Args:
        keyspace: the new keyspace to use as the database working keyspace.

    Returns:
        None.

    Example:
        >>> # NOTE: may require slight adaptation to an async context.
        >>>
        >>> asyncio.run(async_database.list_collection_names())
        ['coll_1', 'coll_2']
        >>> async_database.use_keyspace("an_empty_keyspace")
        >>> asyncio.run(async_database.list_collection_names())
        []
    """
    logger.info(f"switching to keyspace '{keyspace}'")
    self._using_keyspace = keyspace
    self._api_commander = self._get_api_commander(keyspace=self.keyspace)
def with_options(self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> AsyncDatabase
-
Create a clone of this database with some changed attributes.
Args
keyspace
- this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set.
token
- an Access Token to the database. Example: "AstraCS:xyz...". This can be either a literal token string or a subclass of TokenProvider.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new AsyncDatabase instance.
Example
>>> async_database_2 = async_database.with_options(
...     keyspace="the_other_keyspace",
...     token="AstraCS:xyz...",
... )
Expand source code
def with_options( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create a clone of this database with some changed attributes. Args: keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set. token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new `AsyncDatabase` instance. Example: >>> async_database_2 = async_database.with_options( ... keyspace="the_other_keyspace", ... token="AstraCS:xyz...", ... ) """ return self._copy( keyspace=keyspace, token=token, api_options=api_options, )
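A hedged sketch of the api_options parameter, overriding a timeout default on the clone; APIOptions and TimeoutOptions (with its request_timeout_ms attribute) are assumed importable from astrapy.api_options as in recent astrapy versions:
>>> from astrapy.api_options import APIOptions, TimeoutOptions
>>> patient_async_database = async_database.with_options(
...     api_options=APIOptions(
...         timeout_options=TimeoutOptions(request_timeout_ms=30000),
...     ),
... )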
class AsyncTable (*, database: AsyncDatabase, name: str, keyspace: str | None, api_options: FullAPIOptions)
-
A Data API table, the object to interact with the Data API for structured data, especially for DDL operations. This class has an asynchronous interface for use with asyncio.
This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as get_table of AsyncDatabase, wherefrom the AsyncTable inherits its API options such as authentication token and API endpoint. In order to create a table, instead, one should call the create_table method of AsyncDatabase, providing a table definition parameter that can be built in different ways (see the CreateTableDefinition object and examples below).
Args
database
- an AsyncDatabase object, instantiated earlier. This represents the database the table belongs to.
name
- the table name. This parameter should match an existing table on the database.
keyspace
- this is the keyspace to which the table belongs. If nothing is specified, the database's working keyspace is used.
api_options
- a complete specification of the API Options for this instance.
Examples
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> from astrapy import DataAPIClient, AsyncTable
>>> client = DataAPIClient()
>>> async_database = client.get_async_database(
...     "https://01234567-....apps.astra.datastax.com",
...     token="AstraCS:...",
... )
>>> # Create a table using the fluent syntax for definition
>>> from astrapy.constants import SortMode
>>> from astrapy.info import (
...     CreateTableDefinition,
...     ColumnType,
... )
>>> table_definition = (
...     CreateTableDefinition.builder()
...     .add_column("match_id", ColumnType.TEXT)
...     .add_column("round", ColumnType.INT)
...     .add_vector_column("m_vector", dimension=3)
...     .add_column("score", ColumnType.INT)
...     .add_column("when", ColumnType.TIMESTAMP)
...     .add_column("winner", ColumnType.TEXT)
...     .add_set_column("fighters", ColumnType.UUID)
...     .add_partition_by(["match_id"])
...     .add_partition_sort({"round": SortMode.ASCENDING})
...     .build()
... )
>>> my_table = await async_database.create_table(
...     "games",
...     definition=table_definition,
... )
>>> # Create a table with the definition as object
>>> # (and do not raise an error if the table exists already)
>>> from astrapy.info import (
...     CreateTableDefinition,
...     TablePrimaryKeyDescriptor,
...     TableScalarColumnTypeDescriptor,
...     TableValuedColumnType,
...     TableValuedColumnTypeDescriptor,
...     TableVectorColumnTypeDescriptor,
... )
>>> table_definition_1 = CreateTableDefinition(
...     columns={
...         "match_id": TableScalarColumnTypeDescriptor(
...             ColumnType.TEXT,
...         ),
...         "round": TableScalarColumnTypeDescriptor(
...             ColumnType.INT,
...         ),
...         "m_vector": TableVectorColumnTypeDescriptor(
...             column_type="vector", dimension=3
...         ),
...         "score": TableScalarColumnTypeDescriptor(
...             ColumnType.INT,
...         ),
...         "when": TableScalarColumnTypeDescriptor(
...             ColumnType.TIMESTAMP,
...         ),
...         "winner": TableScalarColumnTypeDescriptor(
...             ColumnType.TEXT,
...         ),
...         "fighters": TableValuedColumnTypeDescriptor(
...             column_type=TableValuedColumnType.SET,
...             value_type=ColumnType.UUID,
...         ),
...     },
...     primary_key=TablePrimaryKeyDescriptor(
...         partition_by=["match_id"],
...         partition_sort={"round": SortMode.ASCENDING},
...     ),
... )
>>> my_table_1 = await async_database.create_table(
...     "games",
...     definition=table_definition_1,
...     if_not_exists=True,
... )
>>> # Create a table with the definition as plain dictionary
>>> # (and do not raise an error if the table exists already)
>>> table_definition_2 = {
...     "columns": {
...         "match_id": {"type": "text"},
...         "round": {"type": "int"},
...         "m_vector": {"type": "vector", "dimension": 3},
...         "score": {"type": "int"},
...         "when": {"type": "timestamp"},
...         "winner": {"type": "text"},
...         "fighters": {"type": "set", "valueType": "uuid"},
...     },
...     "primaryKey": {
...         "partitionBy": ["match_id"],
...         "partitionSort": {"round": 1},
...     },
... }
>>> my_table_2 = await async_database.create_table(
...     "games",
...     definition=table_definition_2,
...     if_not_exists=True,
... )
>>> # Get a reference to an existing table
>>> # (no checks are performed on DB)
>>> my_table_4 = async_database.get_table("my_already_existing_table")
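Once a table handle exists, data operations follow the same awaitable pattern as the DDL calls above. A minimal end-to-end sketch, assuming the "games" table created in the preceding examples and run from within a coroutine (values are illustrative):

import asyncio

async def demo() -> None:
    games = async_database.get_table("games")
    await games.insert_one(
        {"match_id": "mtch_0", "round": 1, "winner": "Victor"},
    )
    row = await games.find_one(
        {"match_id": "mtch_0", "round": 1},
        projection={"winner": True},
    )
    print(row)  # expected: {'winner': 'Victor'}

asyncio.run(demo())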
Note
creating an instance of AsyncTable does not trigger, in itself, actual creation of the table on the database. The latter should have been created beforehand, e.g. through the create_table method of a Database.
Expand source code
class AsyncTable(Generic[ROW]): """ A Data API table, the object to interact with the Data API for structured data, especially for DDL operations. This class has an asynchronous interface for use with asyncio. This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as `get_table` of AsyncDatabase, wherefrom the AsyncTable inherits its API options such as authentication token and API endpoint. In order to create a table, instead, one should call the `create_table` method of AsyncDatabase, providing a table definition parameter that can be built in different ways (see the `CreateTableDefinition` object and examples below). Args: database: an AsyncDatabase object, instantiated earlier. This represents the database the table belongs to. name: the table name. This parameter should match an existing table on the database. keyspace: this is the keyspace to which the table belongs. If nothing is specified, the database's working keyspace is used. api_options: a complete specification of the API Options for this instance. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy import DataAPIClient, AsyncTable >>> client = astrapy.DataAPIClient() >>> async_database = client.get_async_database( ... "https://01234567-....apps.astra.datastax.com", ... token="AstraCS:..." ... ) >>> # Create a table using the fluent syntax for definition >>> from astrapy.constants import SortMode >>> from astrapy.info import ( ... CreateTableDefinition, ... ColumnType, ... ) >>> table_definition = ( ... CreateTableDefinition.builder() ... .add_column("match_id", ColumnType.TEXT) ... .add_column("round", ColumnType.INT) ... .add_vector_column("m_vector", dimension=3) ... .add_column("score", ColumnType.INT) ... .add_column("when", ColumnType.TIMESTAMP) ... .add_column("winner", ColumnType.TEXT) ... .add_set_column("fighters", ColumnType.UUID) ... .add_partition_by(["match_id"]) ... .add_partition_sort({"round": SortMode.ASCENDING}) ... .build() ... ) >>> my_table = await async_database.create_table( ... "games", ... definition=table_definition, ... ) >>> # Create a table with the definition as object >>> # (and do not raise an error if the table exists already) >>> from astrapy.info import ( ... CreateTableDefinition, ... TablePrimaryKeyDescriptor, ... TableScalarColumnTypeDescriptor, ... TableValuedColumnType, ... TableValuedColumnTypeDescriptor, ... TableVectorColumnTypeDescriptor, ... ) >>> table_definition_1 = CreateTableDefinition( ... columns={ ... "match_id": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "round": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "m_vector": TableVectorColumnTypeDescriptor( ... column_type="vector", dimension=3 ... ), ... "score": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "when": TableScalarColumnTypeDescriptor( ... ColumnType.TIMESTAMP, ... ), ... "winner": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "fighters": TableValuedColumnTypeDescriptor( ... column_type=TableValuedColumnType.SET, ... value_type=ColumnType.UUID, ... ), ... }, ... primary_key=TablePrimaryKeyDescriptor( ... partition_by=["match_id"], ... partition_sort={"round": SortMode.ASCENDING}, ... ), ... ) >>> my_table_1 = await async_database.create_table( ... "games", ... definition=table_definition_1, ... if_not_exists=True, ... 
) >>> # Create a table with the definition as plain dictionary >>> # (and do not raise an error if the table exists already) >>> table_definition_2 = { ... "columns": { ... "match_id": {"type": "text"}, ... "round": {"type": "int"}, ... "m_vector": {"type": "vector", "dimension": 3}, ... "score": {"type": "int"}, ... "when": {"type": "timestamp"}, ... "winner": {"type": "text"}, ... "fighters": {"type": "set", "valueType": "uuid"}, ... }, ... "primaryKey": { ... "partitionBy": ["match_id"], ... "partitionSort": {"round": 1}, ... }, ... } >>> my_table_2 = await async_database.create_table( ... "games", ... definition=table_definition_2, ... if_not_exists=True, ... ) >>> # Get a reference to an existing table >>> # (no checks are performed on DB) >>> my_table_4 = async_database.get_table("my_already_existing_table") Note: creating an instance of AsyncTable does not trigger, in itself, actual creation of the table on the database. The latter should have been created beforehand, e.g. through the `create_table` method of a Database. """ def __init__( self, *, database: AsyncDatabase, name: str, keyspace: str | None, api_options: FullAPIOptions, ) -> None: self.api_options = api_options self._name = name _keyspace = keyspace if keyspace is not None else database.keyspace if _keyspace is None: raise ValueError("Attempted to create AsyncTable with 'keyspace' unset.") self._database = database._copy( keyspace=_keyspace, api_options=self.api_options ) self._commander_headers = { **{DEFAULT_DATA_API_AUTH_HEADER: self.api_options.token.get_token()}, **self.api_options.embedding_api_key.get_headers(), **self.api_options.database_additional_headers, } self._api_commander = self._get_api_commander() self._converter_agent: _TableConverterAgent[ROW] = _TableConverterAgent( options=self.api_options.serdes_options, ) def __repr__(self) -> str: _db_desc = f'database.api_endpoint="{self.database.api_endpoint}"' return ( f'{self.__class__.__name__}(name="{self.name}", ' f'keyspace="{self.keyspace}", {_db_desc}, ' f"api_options={self.api_options})" ) def __eq__(self, other: Any) -> bool: if isinstance(other, AsyncTable): return all( [ self._name == other._name, self._database == other._database, self.api_options == other.api_options, ] ) else: return False def _get_api_commander(self) -> APICommander: """Instantiate a new APICommander based on the properties of this class.""" if self._database.keyspace is None: raise ValueError( "No keyspace specified. AsyncTable requires a keyspace to " "be set, e.g. through the `keyspace` constructor parameter." 
) base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self._database.api_options.data_api_url_options.api_path, self._database.api_options.data_api_url_options.api_version, self._database.keyspace, self._name, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" api_commander = APICommander( api_endpoint=self._database.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, handle_decimals_writes=True, handle_decimals_reads=True, ) return api_commander async def __aenter__(self: AsyncTable[ROW]) -> AsyncTable[ROW]: return self async def __aexit__( self, exc_type: type[BaseException] | None = None, exc_value: BaseException | None = None, traceback: TracebackType | None = None, ) -> None: if self._api_commander is not None: await self._api_commander.__aexit__( exc_type=exc_type, exc_value=exc_value, traceback=traceback, ) def _copy( self: AsyncTable[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AsyncTable( database=self.database, name=self.name, keyspace=self.keyspace, api_options=final_api_options, ) def with_options( self: AsyncTable[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: """ Create a clone of this table with some changed attributes. Args: embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new AsyncTable instance. Example: >>> table_with_api_key_configured = my_async_table.with_options( ... embedding_api_key="secret-key-0123abcd...", ... ) """ return self._copy( embedding_api_key=embedding_api_key, api_options=api_options, ) def to_sync( self: AsyncTable[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: """ Create a Table from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this table in the copy (the database is converted into an async object). Args: embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). 
For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, a Table instance. Example: >>> my_async_table.to_sync().find_one( ... {"match_id": "fight4"}, ... projection={"winner": True}, ... ) {"pk": 1, "column": "value} """ arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Table( database=self.database.to_sync(), name=self.name, keyspace=self.keyspace, api_options=final_api_options, ) async def definition( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> ListTableDefinition: """ Query the Data API and return a structure defining the table schema. If there are no unsupported colums in the table, the return value has the same contents as could have been provided to a `create_table` method call. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: A `ListTableDefinition` object, available for inspection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_table.definition()) ListTableDefinition(columns=[match_id,round,fighters, ... # shortened """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"getting tables in search of '{self.name}'") self_descriptors = [ table_desc for table_desc in await self.database._list_tables_ctx( keyspace=None, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label, ), ) if table_desc.name == self.name ] logger.info(f"finished getting tables in search of '{self.name}'") if self_descriptors: return self_descriptors[0].definition else: raise ValueError( f"Table {self.keyspace}.{self.name} not found.", ) async def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInfo: """ Return information on the table. This should not be confused with the table definition (i.e. the schema). Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A TableInfo object for inspection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Note: output reformatted for clarity. 
>>> asyncio.run(my_async_table.info()) TableInfo( database_info=AstraDBDatabaseInfo(id=..., name=..., ...), keyspace='default_keyspace', name='games', full_name='default_keyspace.games' ) """ db_info = await self.database.info( database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return TableInfo( database_info=db_info, keyspace=self.keyspace, name=self.name, full_name=self.full_name, ) @property def database(self) -> AsyncDatabase: """ a Database object, the database this table belongs to. Example: >>> my_async_table.database.name 'the_db' """ return self._database @property def keyspace(self) -> str: """ The keyspace this table is in. Example: >>> my_async_table.keyspace 'default_keyspace' """ _keyspace = self.database.keyspace if _keyspace is None: raise ValueError("The table's DB is set with keyspace=None") return _keyspace @property def name(self) -> str: """ The name of this table. Example: >>> my_async_table.name 'my_table' """ return self._name @property def full_name(self) -> str: """ The fully-qualified table name within the database, in the form "keyspace.table_name". Example: >>> my_async_table.full_name 'default_keyspace.my_table' """ return f"{self.keyspace}.{self.name}" async def _create_generic_index( self, i_name: str, ci_definition: dict[str, Any], ci_command: str, if_not_exists: bool | None, table_admin_timeout_ms: int | None, request_timeout_ms: int | None, timeout_ms: int | None, ) -> None: ci_options: dict[str, bool] if if_not_exists is not None: ci_options = {"ifNotExists": if_not_exists} else: ci_options = {} _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) ci_payload = { ci_command: { "name": i_name, "definition": ci_definition, "options": ci_options, } } logger.info(f"{ci_command}('{i_name}')") ci_response = await self._api_commander.async_request( payload=ci_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if ci_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text=f"Faulty response from {ci_command} API command.", raw_response=ci_response, ) logger.info(f"finished {ci_command}('{i_name}')") async def create_index( self, name: str, *, column: str, options: TableIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Create an index on a non-vector column of the table. This is a blocking operation: the method returns once the index is created and ready to use. For creation of a vector index, see method `create_vector_index` instead. Args: name: the name of the index. Index names must be unique across the keyspace. column: the table column on which the index is to be created. options: if passed, it must be an instance of `TableIndexOptions`, or an equivalent dictionary, which specifies index settings such as -- for a text column -- case-sensitivity and so on. See the `astrapy.info.TableIndexOptions` class for more details. if_not_exists: if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision. 
table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.info import TableIndexOptions >>> >>> # create an index on a column >>> await my_async_table.create_index( ... "score_index", ... column="score", ... ) >>> >>> # create an index on a textual column, specifying indexing options >>> await my_async_table.create_index( ... "winner_index", ... column="winner", ... options=TableIndexOptions( ... ascii=False, ... normalize=True, ... case_sensitive=False, ... ), ... ) """ ci_definition: dict[str, Any] = TableIndexDefinition( column=column, options=TableIndexOptions.coerce(options or {}), ).as_dict() ci_command = "createIndex" return await self._create_generic_index( i_name=name, ci_definition=ci_definition, ci_command=ci_command, if_not_exists=if_not_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) async def create_vector_index( self, name: str, *, column: str, options: TableVectorIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Create a vector index on a vector column of the table, enabling vector similarity search operations on it. This is a blocking operation: the method returns once the index is created and ready to use. For creation of a non-vector index, see method `create_index` instead. Args: name: the name of the index. Index names must be unique across the keyspace. column: the table column, of type "vector" on which to create the index. options: an instance of `TableVectorIndexOptions`, or an equivalent dictionary, which specifies settings for the vector index, such as the metric to use or, if desired, a "source model" setting. If omitted, the Data API defaults will apply for the index. See the `astrapy.info.TableVectorIndexOptions` class for more details. if_not_exists: if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.constants import VectorMetric >>> from astrapy.info import TableVectorIndexOptions >>> >>> # create a vector index with dot-product similarity >>> await my_async_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... options=TableVectorIndexOptions( ... metric=VectorMetric.DOT_PRODUCT, ... ), ... ) >>> # specify a source_model (since the previous statement >>> # succeeded, this will do nothing because of `if_not_exists`): >>> await my_async_table.create_vector_index( ... "m_vector_index", ... 
column="m_vector", ... options=TableVectorIndexOptions( ... metric=VectorMetric.DOT_PRODUCT, ... source_model="nv-qa-4", ... ), ... if_not_exists=True, ... ) >>> # leave the settings to the Data API defaults of cosine >>> # similarity metric (since the previous statement >>> # succeeded, this will do nothing because of `if_not_exists`): >>> await my_async_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... if_not_exists=True, ... ) """ ci_definition: dict[str, Any] = TableVectorIndexDefinition( column=column, options=TableVectorIndexOptions.coerce(options), ).as_dict() ci_command = "createVectorIndex" return await self._create_generic_index( i_name=name, ci_definition=ci_definition, ci_command=ci_command, if_not_exists=if_not_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) async def list_index_names( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all indexes existing on this table. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of the index names as strings, in no particular order. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.list_index_names()) ['m_vector_index', 'winner_index', 'score_index'] """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) li_payload: dict[str, Any] = {"listIndexes": {"options": {}}} logger.info("listIndexes") li_response = await self._api_commander.async_request( payload=li_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "indexes" not in li_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listIndexes API command.", raw_response=li_response, ) else: logger.info("finished listIndexes") return li_response["status"]["indexes"] # type: ignore[no-any-return] async def list_indexes( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[TableIndexDescriptor]: """ List the full definitions of all indexes existing on this table. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of `astrapy.info.TableIndexDescriptor` objects in no particular order, each providing the details of an index present on the table. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> indexes = asyncio.run(my_async_table.list_indexes()) >>> indexes [TableIndexDescriptor(name='m_vector_index', definition=...)...] 
>>> # (Note: shortened output above) >>> indexes[1].definition.column 'winner' >>> indexes[1].definition.options.case_sensitive False """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) li_payload: dict[str, Any] = {"listIndexes": {"options": {"explain": True}}} logger.info("listIndexes") li_response = await self._api_commander.async_request( payload=li_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "indexes" not in li_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listIndexes API command.", raw_response=li_response, ) else: logger.info("finished listIndexes") return [ TableIndexDescriptor.coerce(index_object) for index_object in li_response["status"]["indexes"] ] @overload async def alter( self, operation: AlterTableOperation | dict[str, Any], *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncTable[DefaultRowType]: ... @overload async def alter( self, operation: AlterTableOperation | dict[str, Any], *, row_type: type[NEW_ROW], table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncTable[NEW_ROW]: ... async def alter( self, operation: AlterTableOperation | dict[str, Any], *, row_type: type[Any] = DefaultRowType, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncTable[NEW_ROW]: """ Executes one of the available alter-table operations on this table, such as adding/dropping columns. This is a blocking operation: the method returns once the index is created and ready to use. Args: operation: an instance of one of the `astrapy.info.AlterTable*` classes, representing which alter operation to perform and the details thereof. A regular dictionary can also be provided, but then it must have the alter operation name at its top level: {"add": {"columns": ...}}. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting AsyncTable is implicitly an `AsyncTable[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.info import ( ... AlterTableAddColumns, ... AlterTableAddVectorize, ... AlterTableDropColumns, ... AlterTableDropVectorize, ... ColumnType, ... TableScalarColumnTypeDescriptor, ... VectorServiceOptions, ... ) >>> >>> # Add a column >>> new_table_1 = await my_table.alter( ... AlterTableAddColumns( ... columns={ ... "tie_break": TableScalarColumnTypeDescriptor( ... column_type=ColumnType.BOOLEAN, ... ), ... } ... ) ... ) >>> >>> # Drop a column >>> new_table_2 = await new_table_1.alter(AlterTableDropColumns( ... columns=["tie_break"] ... )) >>> >>> # Add vectorize to a (vector) column >>> new_table_3 = await new_table_2.alter( ... AlterTableAddVectorize( ... columns={ ... 
"m_vector": VectorServiceOptions( ... provider="openai", ... model_name="text-embedding-3-small", ... authentication={ ... "providerKey": "ASTRA_KMS_API_KEY_NAME", ... }, ... ), ... } ... ) ... ) >>> >>> # Drop vectorize from a (vector) column >>> # (Also demonstrates type hint usage) >>> from typing import TypedDict >>> from astrapy import AsyncTable >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> class MyMatch(TypedDict): ... match_id: str ... round: int ... m_vector: DataAPIVector ... score: int ... when: DataAPITimestamp ... winner: str ... fighters: DataAPISet[UUID] ... >>> new_table_4: AsyncTable[MyMatch] = await new_table_3.alter( ... AlterTableDropVectorize(columns=["m_vector"]), ... row_type=MyMatch, ... ) """ n_operation: AlterTableOperation if isinstance(operation, AlterTableOperation): n_operation = operation else: n_operation = AlterTableOperation.from_full_dict(operation) _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) at_operation_name = n_operation._name at_payload = { "alterTable": { "operation": { at_operation_name: n_operation.as_dict(), }, }, } logger.info(f"alterTable({at_operation_name})") at_response = await self._api_commander.async_request( payload=at_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if at_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from alterTable API command.", raw_response=at_response, ) logger.info(f"finished alterTable({at_operation_name})") return AsyncTable( database=self.database, name=self.name, keyspace=self.keyspace, api_options=self.api_options, ) async def insert_one( self, row: ROW, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInsertOneResult: """ Insert a single row in the table, with implied overwrite in case of primary key collision. Inserting a row whose primary key correspond to an entry alredy stored in the table has the effect of an in-place update: the row is overwritten. However, if the row being inserted is partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. `None`, `{}` or analogous. Args: row: a dictionary expressing the row to insert. The primary key must be specified in full, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in the row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a TableInsertOneResult object, whose attributes are the primary key of the inserted row both in the form of a dictionary and of a tuple. Examples: >>> # NOTE: may require slight adaptation to an async context. 
>>> >>> # a full-row insert using astrapy's datatypes >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... "score": 18, ... "when": DataAPITimestamp.from_string("2024-11-28T11:30:00Z"), ... "winner": "Victor", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... )) >>> insert_result.inserted_id {'match_id': 'mtch_0', 'round': 1} >>> insert_result.inserted_id_tuple ('mtch_0', 1) >>> >>> # a partial-row (which in this case overwrites some of the values) >>> asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "winner": "Victor Vector", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID("0193539a-2880-8875-9f07-222222222222"), ... ]), ... }, ... )) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 1} ... >>> >>> # another insertion demonstrating standard-library datatypes in values >>> import datetime >>> >>> asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 2, ... "winner": "Angela", ... "score": 25, ... "when": datetime.datetime( ... 2024, 7, 13, 12, 55, 30, 889, ... tzinfo=datetime.timezone.utc, ... ), ... "fighters": { ... UUID("019353cb-8e01-8276-a190-333333333333"), ... }, ... "m_vector": [0.4, -0.6, 0.2], ... }, ... )) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 2}, ... """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) io_payload = self._converter_agent.preprocess_payload( {"insertOne": {"document": row}} ) logger.info(f"insertOne on '{self.name}'") io_response = await self._api_commander.async_request( payload=io_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished insertOne on '{self.name}'") if "insertedIds" in io_response.get("status", {}): if not io_response["status"]["insertedIds"]: raise UnexpectedDataAPIResponseException( text="Response from insertOne API command has empty 'insertedIds'.", raw_response=io_response, ) if not io_response["status"]["primaryKeySchema"]: raise UnexpectedDataAPIResponseException( text="Response from insertOne API command has empty 'primaryKeySchema'.", raw_response=io_response, ) inserted_id_list = io_response["status"]["insertedIds"][0] inserted_id_tuple, inserted_id = self._converter_agent.postprocess_key( inserted_id_list, primary_key_schema_dict=io_response["status"]["primaryKeySchema"], ) return TableInsertOneResult( raw_results=[io_response], inserted_id=inserted_id, inserted_id_tuple=inserted_id_tuple, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from insertOne API command.", raw_response=io_response, ) def _prepare_keys_from_status( self, status: dict[str, Any] | None, raise_on_missing: bool = False ) -> tuple[list[dict[str, Any]], list[tuple[Any, ...]]]: ids: list[dict[str, Any]] id_tuples: list[tuple[Any, ...]] if status is None: if raise_on_missing: raise UnexpectedDataAPIResponseException( text="'status' not found in API response", raw_response=None, ) else: ids = [] id_tuples = [] else: if "primaryKeySchema" not in status: raise 
UnexpectedDataAPIResponseException( text=( "received a 'status' without 'primaryKeySchema' " f"in API response (received: {status})" ), raw_response=None, ) if "insertedIds" not in status: raise UnexpectedDataAPIResponseException( text=( "received a 'status' without 'insertedIds' " f"in API response (received: {status})" ), raw_response=None, ) primary_key_schema = status["primaryKeySchema"] id_tuples_and_ids = self._converter_agent.postprocess_keys( status["insertedIds"], primary_key_schema_dict=primary_key_schema, ) id_tuples = [tpl for tpl, _ in id_tuples_and_ids] ids = [id for _, id in id_tuples_and_ids] return ids, id_tuples async def insert_many( self, rows: Iterable[ROW], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInsertManyResult: """ Insert a number of rows into the table, with implied overwrite in case of primary key collision. Inserting rows whose primary key correspond to entries alredy stored in the table has the effect of an in-place update: the rows are overwritten. However, if the rows being inserted are partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. `None`, `{}` or analogous. Args: rows: an iterable of dictionaries, each expressing a row to insert. Each row must at least fully specify the primary key column values, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in each row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null. ordered: if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions re to be preferred as they complete much faster. chunk_size: how many rows to include in each single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default. concurrency: maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions. general_method_timeout_ms: a timeout, in milliseconds, to impose on the whole operation, which may consist of several API requests. If not provided, this object's defaults apply. request_timeout_ms: a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a TableInsertManyResult object, whose attributes are the primary key of the inserted rows both in the form of dictionaries and of tuples. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Insert complete and partial rows at once (concurrently) >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = asyncio.run(my_async_table.insert_many( ... [ ... { ... "match_id": "fight4", ... "round": 1, ... "winner": "Victor", ... "score": 18, ... "when": DataAPITimestamp.from_string( ... "2024-11-28T11:30:00Z", ... ), ... "fighters": DataAPISet([ ... 
UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID('019353e3-00b4-83f9-a127-222222222222'), ... ]), ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... }, ... {"match_id": "fight5", "round": 1, "winner": "Adam"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio"}, ... { ... "match_id": "challenge6", ... "round": 1, ... "winner": "Donna", ... "m_vector": [0.9, -0.1, -0.3], ... }, ... {"match_id": "challenge6", "round": 2, "winner": "Erick"}, ... {"match_id": "challenge6", "round": 3, "winner": "Fiona"}, ... {"match_id": "tournamentA", "round": 1, "winner": "Gael"}, ... {"match_id": "tournamentA", "round": 2, "winner": "Hanna"}, ... { ... "match_id": "tournamentA", ... "round": 3, ... "winner": "Ian", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... {"match_id": "fight7", "round": 1, "winner": "Joy"}, ... {"match_id": "fight7", "round": 2, "winner": "Kevin"}, ... {"match_id": "fight7", "round": 3, "winner": "Lauretta"}, ... ], ... concurrency=10, ... chunk_size=3, ... )) >>> insert_result.inserted_ids [{'match_id': 'fight4', 'round': 1}, {'match_id': 'fight5', ... >>> insert_result.inserted_id_tuples [('fight4', 1), ('fight5', 1), ('fight5', 2), ('fight5', 3), ... >>> >>> # Ordered insertion >>> # (would stop on first failure; predictable end result on DB) >>> asyncio.run(my_async_table.insert_many( ... [ ... {"match_id": "fight5", "round": 1, "winner": "Adam0"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta0"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio0"}, ... {"match_id": "fight5", "round": 1, "winner": "Adam Zuul"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta Vigo"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"}, ... ], ... ordered=True, ... )) TableInsertManyResult(inserted_ids=[{'match_id': 'fight5', 'round': 1}, ... Note: Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the row sequence is important. Note: If some of the rows are unsuitable for insertion, for instance have the wrong data type for a column or lack the primary key, the Data API validation check will fail for those specific requests that contain the faulty rows. Depending on concurrency and the value of the `ordered` parameter, a number of rows in general could have been successfully inserted. It is possible to capture such a scenario, and inspect which rows actually got inserted, by catching an error of type `astrapy.exceptions.TableInsertManyException`: its `partial_result` attribute is precisely a `TableInsertManyResult`, encoding details on the successful writes. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) if concurrency is None: if ordered: _concurrency = 1 else: _concurrency = DEFAULT_INSERT_MANY_CONCURRENCY else: _concurrency = concurrency if _concurrency > 1 and ordered: raise ValueError("Cannot run ordered insert_many concurrently.") if chunk_size is None: _chunk_size = DEFAULT_INSERT_MANY_CHUNK_SIZE else: _chunk_size = chunk_size _rows = list(rows) logger.info(f"inserting {len(_rows)} rows in '{self.name}'") raw_results: list[dict[str, Any]] = [] timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) if ordered: options = {"ordered": True} inserted_ids: list[Any] = [] inserted_id_tuples: list[Any] = [] for i in range(0, len(_rows), _chunk_size): im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": _rows[i : i + _chunk_size], "options": options, }, }, ) logger.info(f"insertMany(chunk) on '{self.name}'") chunk_response = await self._api_commander.async_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") # accumulate the results in this call chunk_inserted_ids, chunk_inserted_ids_tuples = ( self._prepare_keys_from_status(chunk_response.get("status")) ) inserted_ids += chunk_inserted_ids inserted_id_tuples += chunk_inserted_ids_tuples raw_results += [chunk_response] # if errors, quit early if chunk_response.get("errors", []): partial_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) raise TableInsertManyException.from_response( command=None, raw_response=chunk_response, partial_result=partial_result, ) # return full_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) logger.info(f"finished inserting {len(_rows)} rows in '{self.name}'") return full_result else: # unordered: concurrent or not, do all of them and parse the results options = {"ordered": False} sem = asyncio.Semaphore(_concurrency) async def concurrent_insert_chunk( row_chunk: list[ROW], ) -> dict[str, Any]: async with sem: im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": row_chunk, "options": options, }, }, ) logger.info(f"insertMany(chunk) on '{self.name}'") im_response = await self._api_commander.async_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") return im_response if _concurrency > 1: tasks = [ asyncio.create_task( concurrent_insert_chunk(_rows[i : i + _chunk_size]) ) for i in range(0, len(_rows), _chunk_size) ] raw_results = await asyncio.gather(*tasks) else: raw_results = [ await concurrent_insert_chunk(_rows[i : i + _chunk_size]) for i in range(0, len(_rows), _chunk_size) ] # recast raw_results. 
Each response has its schema: unfold appropriately ids_and_tuples_per_chunk = [ self._prepare_keys_from_status(chunk_response.get("status")) for chunk_response in raw_results ] inserted_ids = [ inserted_id for chunk_ids, _ in ids_and_tuples_per_chunk for inserted_id in chunk_ids ] inserted_id_tuples = [ inserted_id_tuple for _, chunk_id_tuples in ids_and_tuples_per_chunk for inserted_id_tuple in chunk_id_tuples ] # check-raise if any( [chunk_response.get("errors", []) for chunk_response in raw_results] ): partial_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) raise TableInsertManyException.from_responses( commands=[None for _ in raw_results], raw_responses=raw_results, partial_result=partial_result, ) # return full_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) logger.info(f"finished inserting {len(_rows)} rows in '{self.name}'") return full_result @overload def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncTableFindCursor[ROW, ROW]: ... @overload def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: type[ROW2], skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncTableFindCursor[ROW, ROW2]: ... def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: type[ROW2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncTableFindCursor[ROW, ROW2]: """ Find rows on the table matching the provided filters and according to sorting criteria including vector similarity. The returned TableFindCursor object, representing the stream of results, can be iterated over, or consumed and manipulated in several other ways (see the examples below and the `TableFindCursor` documentation for details). Since the amount of returned items can be large, TableFindCursor is a lazy object, that fetches new data while it is being read using the Data API pagination mechanism. Invoking `.to_list()` on a TableFindCursor will cause it to consume all rows and materialize the entire result set as a list. This is not recommended if the amount of results is very large. Args: filter: a dictionary expressing which condition the returned rows must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are `{}` (zero filter, not recommended for large tables), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`, or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. projection: a prescription on which columns to return for the matching rows. 
The projection can take the form `{"column1": True, "column2": True}`. `{"*": True}` (i.e. return the whole row), or the complementary form that excludes columns: `{"column1": False, "column2": False}`. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting cursor is implicitly an `AsyncTableFindCursor[ROW, ROW]`, i.e. maintains the same type for the items it returns as that for the rows in the table. Strictly typed code may want to specify this parameter especially when a projection is given. skip: if provided, it is a number of rows that would be obtained first in the response and are instead skipped. limit: a maximum amount of rows to get from the table. The returned cursor will stop yielding rows when either this number is reached or there really are no more matches in the table. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned row. It can be used meaningfully only in a vector search (see `sort`). include_sort_vector: a boolean to request the search query vector. If set to True (and if the search is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search. sort: this dictionary parameter controls the order in which the rows are returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form `{"vector_column": qv}`, with the query vector `qv` of the appropriate type (list of floats or DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form `{"vectorize_enabled_column": "query text"}` is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are `{"score": SortMode.ASCENDING}` (equivalently `{"score": +1}`), `{"score": +1, "when": -1}`. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications and limitations on the amount of items returned. Consult the Data API documentation for more details on this topic. request_timeout_ms: a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply. timeout_ms: an alias for `request_timeout_ms`. Returns: a TableFindCursor object, that can be iterated over (and manipulated in several ways), that if needed handles pagination under the hood as the rows are consumed. Note: As the rows are retrieved in chunks progressively, while the cursor is being iterated over, it is possible that the actual results obtained will reflect changes occurring to the table contents in real time. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Iterate over results: >>> async def loop1(): ... async for row in my_async_table.find({"match_id": "challenge6"}): ... print(f"(R:{row['round']}): winner {row['winner']}") ... 
>>> asyncio.run(loop1()) (R:1): winner Donna (R:2): winner Erick (R:3): winner Fiona >>> >>> # Optimize bandwidth using a projection: >>> proj = {"round": True, "winner": True} >>> async def loop2(): ... async for row in my_async_table.find( ... {"match_id": "challenge6"}, ... projection=proj, ... ): ... print(f"(R:{row['round']}): winner {row['winner']}") ... >>> asyncio.run(loop2()) (R:1): winner Donna (R:2): winner Erick (R:3): winner Fiona >>> >>> # Filter on the partitioning: >>> asyncio.run( ... my_async_table.find({"match_id": "challenge6"}).to_list() ... ) [{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on primary key: >>> asyncio.run( ... my_async_table.find( ... {"match_id": "challenge6", "round": 1} ... ).to_list() ... ) [{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> asyncio.run(my_async_table.find({"winner": "Caio Gozer"}).to_list()) [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> asyncio.run(my_async_table.find({"score": {"$gte": 15}}).to_list()) [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... ).to_list()) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter (not recommended performance-wise): >>> asyncio.run(my_async_table.find({}).to_list()) The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... ).to_list()) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find( ... {"round": 3, "winner": "Caio Gozer"} ... ).to_list()) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> asyncio.run(my_async_table.find( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... limit=3, ... ).to_list()) [{'winner': 'Donna'}, {'winner': 'Victor'}] >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> my_table.find( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... ).to_list() [{'winner': 'Victor'}] >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> asyncio.run(my_async_table.find( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... limit=3, ... include_similarity=True, ... ).to_list()) [{'winner': 'Donna', '$similarity': 0.515}, {'winner': 'Victor', ... 
>>> >>> # Non-vector sorting on a 'partitionSort' column: >>> asyncio.run(my_async_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... ).to_list()) [{'winner': 'Caio Gozer'}, {'winner': 'Betta Vigo'}, ... >>> >>> # Using `skip` and `limit`: >>> asyncio.run(my_async_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... skip=1, ... limit=2, ... ).to_list()) The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... [{'winner': 'Betta Vigo'}, {'winner': 'Adam Zuul'}] >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... ).to_list()) The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... [{'winner': 'Adam Zuul'}, {'winner': 'Betta Vigo'}, ... >>> >>> # Using `.map()` on a cursor: >>> winner_cursor = my_async_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... limit=5, ... ) >>> print("/".join(asyncio.run( ... winner_cursor.map(lambda row: row["winner"].upper()).to_list()) ... )) CAIO GOZER/BETTA VIGO/ADAM ZUUL >>> >>> # Some other examples of cursor manipulation >>> matches_async_cursor = my_async_table.find( ... sort={"m_vector": DataAPIVector([-0.1, 0.15, 0.3])} ... ) >>> asyncio.run(matches_async_cursor.has_next()) True >>> asyncio.run(matches_async_cursor.__anext__()) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> matches_async_cursor.consumed 1 >>> matches_async_cursor.rewind() >>> matches_async_cursor.consumed 0 >>> asyncio.run(matches_async_cursor.has_next()) True >>> matches_async_cursor.close() >>> >>> async def try_consume(): ... try: ... await matches_async_cursor.__anext__() ... except StopAsyncIteration: ... print("StopAsyncIteration triggered.") ... >>> asyncio.run(try_consume()) StopAsyncIteration triggered. """ # lazy-import here to avoid circular import issues from astrapy.cursors import AsyncTableFindCursor _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) return ( AsyncTableFindCursor( table=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=None, request_timeout_label=_rt_label, ) .filter(filter) .project(projection) .skip(skip) .limit(limit) .sort(sort) .include_similarity(include_similarity) .include_sort_vector(include_sort_vector) ) async def find_one( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> ROW | None: """ Run a search according to the given filtering and sorting criteria and return the top row matching it, or nothing if there are none. The parameters are analogous to some of the parameters to the `find` method (which has a few more that do not make sense in this case, such as `limit`). Args: filter: a dictionary expressing which condition the returned row must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. 
Simple examples are `{}` (zero filter), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`), or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. projection: a prescription on which columns to return for the matching row. The projection can take the form `{"column1": True, "column2": True}`, `{"*": True}` (i.e. return the whole row), or the complementary form that excludes columns: `{"column1": False, "column2": False}`. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned row. It can be used meaningfully only in a vector search (see `sort`). sort: this dictionary parameter controls the sorting order, hence determines which row is returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form `{"vector_column": qv}`, with the query vector `qv` of the appropriate type (list of floats or DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form `{"vectorize_enabled_column": "query text"}` is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are `{"score": SortMode.ASCENDING}` (equivalently `{"score": +1}`), `{"score": +1, "when": -1}`. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications. Consult the Data API documentation for more details on this topic. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary expressing the result if a row is found, otherwise None. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.constants import SortMode >>> from astrapy.data_types import DataAPITimestamp, DataAPIVector >>> >>> # Filter on the partitioning: >>> asyncio.run(my_async_table.find_one({"match_id": "challenge6"})) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # A find with no matches: >>> str(asyncio.run(my_async_table.find_one({"match_id": "not_real"}))) 'None' >>> >>> # Optimize bandwidth using a projection: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "challenge6"}, ... projection={"round": True, "winner": True}, ... )) {'round': 1, 'winner': 'Donna'} >>> >>> # Filter on primary key: >>> asyncio.run( ... my_async_table.find_one({"match_id": "challenge6", "round": 1}) ... ) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ...
>>> >>> # Filter on a regular indexed column: >>> asyncio.run(my_async_table.find_one({"winner": "Caio Gozer"})) {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> asyncio.run(my_async_table.find_one({"score": {"$gte": 15}})) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... )) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter: >>> asyncio.run(my_async_table.find_one({})) The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... )) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> asyncio.run( ... my_async_table.find_one({"round": 3, "winner": "Caio Gozer"}) ... ) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> asyncio.run(my_async_table.find_one( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... )) {'winner': 'Donna'} >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... )) {'winner': 'Victor'} >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> asyncio.run(my_async_table.find_one( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... include_similarity=True, ... )) {'winner': 'Donna', '$similarity': 0.515} >>> >>> # Non-vector sorting on a 'partitionSort' column: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... )) {'winner': 'Caio Gozer'} >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... )) The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING...
{'winner': 'Adam Zuul'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) fo_options = ( None if include_similarity is None else {"includeSimilarity": include_similarity} ) fo_payload = self._converter_agent.preprocess_payload( { "findOne": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "options": fo_options, "sort": sort, }.items() if v is not None } } ) fo_response = await self._api_commander.async_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) if "document" not in (fo_response.get("data") or {}): raise UnexpectedDataAPIResponseException( text="Response from findOne API command missing 'document'.", raw_response=fo_response, ) if "projectionSchema" not in (fo_response.get("status") or {}): raise UnexpectedDataAPIResponseException( text="Response from findOne API command missing 'projectionSchema'.", raw_response=fo_response, ) doc_response = fo_response["data"]["document"] if doc_response is None: return None return self._converter_agent.postprocess_row( fo_response["data"]["document"], columns_dict=fo_response["status"]["projectionSchema"], similarity_pseudocolumn="$similarity" if include_similarity else None, ) async def distinct( self, key: str, *, filter: FilterType | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[Any]: """ Return a list of the unique values of `key` across the rows in the table that match the provided filter. Args: key: the name of the field whose value is inspected across rows. Keys are typically just column names, although they can use the dot notation to select particular entries in map columns. For set and list columns, individual entries are "unrolled" automatically; in particular, for lists, numeric indices can be used in the key dot-notation syntax. Examples of acceptable `key` values: "a_column" "map_column.map_key" "list_column.2" filter: a dictionary expressing which condition the inspected rows must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are `{}` (zero filter), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`), or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method, being based on `find` (see), may entail successive HTTP API requests, depending on the number of involved rows. If not provided, this object's defaults apply. request_timeout_ms: a timeout, in milliseconds, for each API request. If not provided, this object's defaults apply. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a list of all different values for `key` found across the rows that match the filter. The result list has no repeated items. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.distinct( ... "winner", ... filter={"match_id": "challenge6"}, ...
)) ['Donna', 'Erick', 'Fiona'] >>> >>> # distinct values across the whole table: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.distinct("winner")) The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... ['Victor', 'Adam Zuul', 'Betta Vigo', 'Caio Gozer', 'Donna', 'Erick', ... >>> >>> # Over a column containing null values >>> # (also with composite filter): >>> asyncio.run(my_async_table.distinct( ... "score", ... filter={"match_id": {"$in": ["fight4", "tournamentA"]}}, ... )) [18, None] >>> >>> # distinct over a set column (automatically "unrolled"): >>> asyncio.run(my_async_table.distinct( ... "fighters", ... filter={"match_id": {"$in": ["fight4", "tournamentA"]}}, ... )) [UUID('0193539a-2770-8c09-a32a-111111111111'), UUID('019353e3-00b4-... Note: It must be kept in mind that `distinct` is a client-side operation, which effectively browses all required rows using the logic of the `find` method and collects the unique values found for `key`. As such, there may be performance, latency and ultimately billing implications if the number of matching rows is large. Note: For details on the behaviour of "distinct" in conjunction with real-time changes in the table contents, see the Note of the `find` command. """ # lazy-import here to avoid circular import issues from astrapy.cursors import AsyncTableFindCursor _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) # preparing cursor: _extractor = _create_document_key_extractor(key) _key = _reduce_distinct_key_to_shallow_safe(key) if _key == "": raise ValueError( "The 'key' parameter for distinct cannot be empty " "or start with a list index." ) # relaxing the type hint (limited to within this method body) f_cursor: AsyncTableFindCursor[dict[str, Any], dict[str, Any]] = ( AsyncTableFindCursor( table=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=_general_method_timeout_ms, request_timeout_label=_rt_label, overall_timeout_label=_gmt_label, ) # type: ignore[assignment] .filter(filter) .project({_key: True}) ) # consuming it: _item_hashes = set() distinct_items: list[Any] = [] logger.info(f"running distinct() on '{self.name}'") async for document in f_cursor: for item in _extractor(document): _item_hash = _hash_document( item, options=self.api_options.serdes_options ) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) logger.info(f"finished running distinct() on '{self.name}'") return distinct_items async def count_documents( self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Count the rows in the table matching the specified filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"name": "John", "age": 59} {"$and": [{"name": {"$eq": "John"}}, {"age": {"$gt": 58}}]} See the Data API documentation for the full set of operators. upper_bound: a required ceiling on the result of the count operation. If the actual number of rows exceeds this value, an exception will be raised.
Furthermore, if the actual number of rows exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: the exact count of matching rows. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.insert_many([{"seq": i} for i in range(20)])) TableInsertManyResult(...) >>> asyncio.run(my_async_table.count_documents({}, upper_bound=100)) 20 >>> asyncio.run(my_async_table.count_documents({"seq":{"$gt": 15}}, upper_bound=100)) 4 >>> asyncio.run(my_async_table.count_documents({}, upper_bound=10)) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyRowsToCountException Note: Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller's expectations. Moreover, indiscriminate usage of count operations for sizeable numbers of rows (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the number of rows it will count, and that an exception will be thrown by this method if this limit is encountered. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) cd_payload = {"countDocuments": {"filter": filter}} logger.info(f"countDocuments on '{self.name}'") cd_response = await self._api_commander.async_request( payload=cd_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished countDocuments on '{self.name}'") if "count" in cd_response.get("status", {}): count: int = cd_response["status"]["count"] if cd_response["status"].get("moreData", False): raise TooManyRowsToCountException( text=f"Document count exceeds {count}, the maximum allowed by the server", server_max_count_exceeded=True, ) else: if count > upper_bound: raise TooManyRowsToCountException( text="Document count exceeds required upper bound", server_max_count_exceeded=False, ) else: return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from countDocuments API command.", raw_response=cd_response, ) async def estimated_document_count( self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Query the API server for an estimate of the document count in the table. Contrary to `count_documents`, this method has no filtering parameters. Args: general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a server-provided estimate count of the documents in the table. Example: >>> # NOTE: may require slight adaptation to an async context.
>>> >>> asyncio.run(my_async_table.estimated_document_count()) 5820 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) ed_payload: dict[str, Any] = {"estimatedDocumentCount": {}} logger.info(f"estimatedDocumentCount on '{self.name}'") ed_response = await self._api_commander.async_request( payload=ed_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished estimatedDocumentCount on '{self.name}'") if "count" in ed_response.get("status", {}): count: int = ed_response["status"]["count"] return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from estimatedDocumentCount API command.", raw_response=ed_response, ) async def update_one( self, filter: FilterType, update: dict[str, Any], *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Update a single row of the table, changing some or all of the columns, with the implicit behaviour of inserting a new row if no match is found. Args: filter: a predicate expressing the table primary key in full, i.e. a dictionary defining values for all columns that form the primary key. An example may be `{"match_id": "fight4", "round": 1}`. update: the update prescription to apply to the row, expressed as a dictionary conforming to the Data API syntax. The update operators for tables are `$set` and `$unset` (in particular, setting a column to None has the same effect as the $unset operator). Examples are `{"$set": {"round": 12}}` and `{"$unset": {"winner": "", "score": ""}}`. Note that the update operation cannot alter the primary key columns. See the Data API documentation for more details. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.data_types import DataAPISet >>> >>> # Set a new value for a column >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"winner": "Winona"}}, ... ) >>> >>> # Set a new value for a column while unsetting another column >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"winner": None, "score": 24}}, ... ) >>> >>> # Set a 'set' column to empty >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": DataAPISet()}}, ... ) >>> >>> # Set a 'set' column to empty using None >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": None}}, ... ) >>> >>> # Set a 'set' column to empty using a regular (empty) set >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": set()}}, ... ) >>> >>> # Set a 'set' column to empty using $unset >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$unset": {"fighters": None}}, ... ) >>> >>> # A non-existing primary key creates a new row >>> await my_async_table.update_one( ...
{"match_id": "bar_fight", "round": 4}, ... update={"$set": {"score": 8, "winner": "Jack"}}, ... ) >>> >>> # Delete column values for a row (they'll read as None now) >>> await my_async_table.update_one( ... {"match_id": "challenge6", "round": 2}, ... update={"$unset": {"winner": None, "score": None}}, ... ) Note: a row created entirely with update operations (as opposed to insertions) may, correspondingly, be deleted by means of an $unset update on all columns. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) uo_payload = { "updateOne": { k: v for k, v in { "filter": filter, "update": self._converter_agent.preprocess_payload(update), }.items() if v is not None } } logger.info(f"updateOne on '{self.name}'") uo_response = await self._api_commander.async_request( payload=uo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished updateOne on '{self.name}'") if "status" in uo_response: # the contents are disregarded and the method just returns: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from updateOne API command.", raw_response=uo_response, ) async def delete_one( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Delete a row, matching the provided value of the primary key. If no row is found with that primary key, the method does nothing. Args: filter: a predicate expressing the table primary key in full, i.e. a dictionary defining values for all columns that form the primary key. A row (at most one) is deleted if it matches that primary key. An example filter may be `{"match_id": "fight4", "round": 1}`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Count the rows matching a certain filter >>> len(asyncio.run(my_async_table.find({"match_id": "fight7"}).to_list())) 3 >>> >>> # Delete a row belonging to the group >>> asyncio.run( ... my_async_table.delete_one({"match_id": "fight7", "round": 2}) ... ) >>> >>> # Count again >>> len(asyncio.run(my_async_table.find({"match_id": "fight7"}).to_list())) 2 >>> >>> # Attempt the delete again (nothing to delete) >>> asyncio.run( ... my_async_table.delete_one({"match_id": "fight7", "round": 2}) ... 
) >>> >>> # The count is unchanged >>> len(asyncio.run(my_async_table.find({"match_id": "fight7"}).to_list())) 2 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) do_payload = self._converter_agent.preprocess_payload( { "deleteOne": { k: v for k, v in { "filter": filter, }.items() if v is not None } } ) logger.info(f"deleteOne on '{self.name}'") do_response = await self._api_commander.async_request( payload=do_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteOne on '{self.name}'") if do_response.get("status", {}).get("deletedCount") == -1: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteOne API command.", raw_response=do_response, ) async def delete_many( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Delete all rows matching a provided filter condition. This operation can target from a single row to the entirety of the table. Args: filter: a filter dictionary to specify which row(s) must be deleted. 1. If the filter is in the form `{"pk1": val1, "pk2": val2 ...}` and specifies the primary key in full, at most one row is deleted, the one with that primary key. 2. If the table has "partitionSort" columns, some or all of them may be left out (the least significant of them can also employ an inequality, or range, predicate): a range of rows, but always within a single partition, will be deleted. 3. If an empty filter, `{}`, is passed, this operation empties the table completely. *USE WITH CARE*. 4. Other kinds of filtering clauses are forbidden. In the following examples, the table is partitioned by columns ["pa1", "pa2"] and has partitionSort "ps1" and "ps2" in that order. Valid filter examples: - `{"pa1": x, "pa2": y, "ps1": z, "ps2": t}`: deletes one row - `{"pa1": x, "pa2": y, "ps1": z}`: deletes multiple rows - `{"pa1": x, "pa2": y, "ps1": z, "ps2": {"$lt": q}}`: deletes multiple rows - `{"pa1": x, "pa2": y}`: deletes all rows in the partition - `{}`: empties the table (*CAUTION*) Invalid filter examples: - `{"pa1": x}`: incomplete partition key - `{"pa1": x, "ps1": z}`: incomplete partition key (whatever is added) - `{"pa1": x, "pa2": y, "ps1": {"$lt": r}, "ps2": t}`: inequality provided on a non-least-significant partitionSort column. - `{"pa1": x, "pa2": y, "ps2": t}`: cannot skip "ps1" general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Examples: >>> # NOTE: may require slight adaptation to an async context.
>>> >>> # Delete a single row (full primary key specified): >>> await my_async_table.delete_many({"match_id": "fight4", "round": 1}) >>> >>> # Delete part of a partition (inequality on the >>> # last-mentioned 'partitionSort' column): >>> await my_async_table.delete_many({"match_id": "fight5", "round": {"$gte": 5}}) >>> >>> # Delete a whole partition (leave 'partitionSort' unspecified): >>> await my_async_table.delete_many({"match_id": "fight7"}) >>> >>> # empty the table entirely with empty filter (*CAUTION*): >>> await my_async_table.delete_many({}) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) dm_payload = self._converter_agent.preprocess_payload( { "deleteMany": { k: v for k, v in { "filter": filter, }.items() if v is not None } } ) logger.info(f"deleteMany on '{self.name}'") dm_response = await self._api_commander.async_request( payload=dm_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteMany on '{self.name}'") if dm_response.get("status", {}).get("deletedCount") == -1: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteMany API command.", raw_response=dm_response, ) async def drop( self, *, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Drop the table, i.e. delete it from the database along with all the rows stored therein. Args: if_exists: if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # List tables: >>> asyncio.run(my_async_table.database.list_table_names()) ['games'] >>> >>> # Drop this table: >>> asyncio.run(my_async_table.drop()) >>> >>> # List tables again: >>> asyncio.run(my_async_table.database.list_table_names()) [] >>> >>> # Try working on the table now: >>> from astrapy.exceptions import DataAPIResponseException >>> >>> async def try_use_table(): ... try: ... await my_async_table.find_one({}) ... except DataAPIResponseException as err: ... print(str(err)) ... >>> asyncio.run(try_use_table()) Collection does not exist [...] (COLLECTION_NOT_EXIST) Note: Use with caution. Note: Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense as the underlying actual table is no more. It is the responsibility of the developer to design a correct flow which avoids using a deceased table any further.
""" logger.info(f"dropping table '{self.name}' (self)") drop_result = await self.database.drop_table( self.name, if_exists=if_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished dropping table '{self.name}' (self)") return drop_result async def command( self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this table with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.command({ ... "findOne": { ... "filter": {"match_id": "fight4"}, ... "projection": {"winner": True}, ... } ... })) {'data': {'document': {'winner': 'Victor'}}, 'status': ... # shortened """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _cmd_desc: str if body: _cmd_desc = ",".join(sorted(body.keys())) else: _cmd_desc = "(none)" logger.info(f"command={_cmd_desc} on '{self.name}'") command_result = await self._api_commander.async_request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on '{self.name}'") return command_result
Ancestors
- typing.Generic
Instance variables
var database : AsyncDatabase
-
an AsyncDatabase object, the database this table belongs to.
Example
>>> my_async_table.database.name 'the_db'
Expand source code
@property def database(self) -> AsyncDatabase: """ an AsyncDatabase object, the database this table belongs to. Example: >>> my_async_table.database.name 'the_db' """ return self._database
var full_name : str
-
The fully-qualified table name within the database, in the form "keyspace.table_name".
Example
>>> my_async_table.full_name 'default_keyspace.my_table'
Expand source code
@property def full_name(self) -> str: """ The fully-qualified table name within the database, in the form "keyspace.table_name". Example: >>> my_async_table.full_name 'default_keyspace.my_table' """ return f"{self.keyspace}.{self.name}"
var keyspace : str
-
The keyspace this table is in.
Example
>>> my_async_table.keyspace 'default_keyspace'
Expand source code
@property def keyspace(self) -> str: """ The keyspace this table is in. Example: >>> my_async_table.keyspace 'default_keyspace' """ _keyspace = self.database.keyspace if _keyspace is None: raise ValueError("The table's DB is set with keyspace=None") return _keyspace
var name : str
-
The name of this table.
Example
>>> my_async_table.name 'my_table'
Expand source code
@property def name(self) -> str: """ The name of this table. Example: >>> my_async_table.name 'my_table' """ return self._name
Methods
async def alter(self, operation: AlterTableOperation | dict[str, Any], *, row_type: type[Any] = dict[str, typing.Any], table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AsyncTable[~NEW_ROW]
-
Executes one of the available alter-table operations on this table, such as adding/dropping columns.
This is a blocking operation: the method returns once the alteration is complete and the table is ready to use.
Args
operation
- an instance of one of the
astrapy.info.AlterTable*
classes, representing which alter operation to perform and the details thereof. A regular dictionary can also be provided, but then it must have the alter operation name at its top level: {"add": {"columns": …}}. row_type
- this parameter acts as a formal specifier for the type checker.
If omitted, the resulting AsyncTable is implicitly
an
AsyncTable[dict[str, Any]]
. If provided, it must match the type hint specified in the assignment. See the examples below. table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Examples
>>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.info import ( ... AlterTableAddColumns, ... AlterTableAddVectorize, ... AlterTableDropColumns, ... AlterTableDropVectorize, ... ColumnType, ... TableScalarColumnTypeDescriptor, ... VectorServiceOptions, ... ) >>> >>> # Add a column >>> new_table_1 = await my_table.alter( ... AlterTableAddColumns( ... columns={ ... "tie_break": TableScalarColumnTypeDescriptor( ... column_type=ColumnType.BOOLEAN, ... ), ... } ... ) ... ) >>> >>> # Drop a column >>> new_table_2 = await new_table_1.alter(AlterTableDropColumns( ... columns=["tie_break"] ... )) >>> >>> # Add vectorize to a (vector) column >>> new_table_3 = await new_table_2.alter( ... AlterTableAddVectorize( ... columns={ ... "m_vector": VectorServiceOptions( ... provider="openai", ... model_name="text-embedding-3-small", ... authentication={ ... "providerKey": "ASTRA_KMS_API_KEY_NAME", ... }, ... ), ... } ... ) ... ) >>> >>> # Drop vectorize from a (vector) column >>> # (Also demonstrates type hint usage) >>> from typing import TypedDict >>> from astrapy import AsyncTable >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> class MyMatch(TypedDict): ... match_id: str ... round: int ... m_vector: DataAPIVector ... score: int ... when: DataAPITimestamp ... winner: str ... fighters: DataAPISet[UUID] ... >>> new_table_4: AsyncTable[MyMatch] = await new_table_3.alter( ... AlterTableDropVectorize(columns=["m_vector"]), ... row_type=MyMatch, ... )
Expand source code
async def alter( self, operation: AlterTableOperation | dict[str, Any], *, row_type: type[Any] = DefaultRowType, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AsyncTable[NEW_ROW]: """ Executes one of the available alter-table operations on this table, such as adding/dropping columns. This is a blocking operation: the method returns once the alteration is complete and the table is ready to use. Args: operation: an instance of one of the `astrapy.info.AlterTable*` classes, representing which alter operation to perform and the details thereof. A regular dictionary can also be provided, but then it must have the alter operation name at its top level: {"add": {"columns": ...}}. row_type: this parameter acts as a formal specifier for the type checker. If omitted, the resulting AsyncTable is implicitly an `AsyncTable[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.info import ( ... AlterTableAddColumns, ... AlterTableAddVectorize, ... AlterTableDropColumns, ... AlterTableDropVectorize, ... ColumnType, ... TableScalarColumnTypeDescriptor, ... VectorServiceOptions, ... ) >>> >>> # Add a column >>> new_table_1 = await my_table.alter( ... AlterTableAddColumns( ... columns={ ... "tie_break": TableScalarColumnTypeDescriptor( ... column_type=ColumnType.BOOLEAN, ... ), ... } ... ) ... ) >>> >>> # Drop a column >>> new_table_2 = await new_table_1.alter(AlterTableDropColumns( ... columns=["tie_break"] ... )) >>> >>> # Add vectorize to a (vector) column >>> new_table_3 = await new_table_2.alter( ... AlterTableAddVectorize( ... columns={ ... "m_vector": VectorServiceOptions( ... provider="openai", ... model_name="text-embedding-3-small", ... authentication={ ... "providerKey": "ASTRA_KMS_API_KEY_NAME", ... }, ... ), ... } ... ) ... ) >>> >>> # Drop vectorize from a (vector) column >>> # (Also demonstrates type hint usage) >>> from typing import TypedDict >>> from astrapy import AsyncTable >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> class MyMatch(TypedDict): ... match_id: str ... round: int ... m_vector: DataAPIVector ... score: int ... when: DataAPITimestamp ... winner: str ... fighters: DataAPISet[UUID] ... >>> new_table_4: AsyncTable[MyMatch] = await new_table_3.alter( ... AlterTableDropVectorize(columns=["m_vector"]), ... row_type=MyMatch, ...
) """ n_operation: AlterTableOperation if isinstance(operation, AlterTableOperation): n_operation = operation else: n_operation = AlterTableOperation.from_full_dict(operation) _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) at_operation_name = n_operation._name at_payload = { "alterTable": { "operation": { at_operation_name: n_operation.as_dict(), }, }, } logger.info(f"alterTable({at_operation_name})") at_response = await self._api_commander.async_request( payload=at_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if at_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from alterTable API command.", raw_response=at_response, ) logger.info(f"finished alterTable({at_operation_name})") return AsyncTable( database=self.database, name=self.name, keyspace=self.keyspace, api_options=self.api_options, )
async def command(self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Send a POST request to the Data API for this table with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload.
Args
body
- a JSON-serializable dictionary, the payload of the request.
raise_api_errors
- if True, responses with a nonempty 'errors' field result in an astrapy exception being raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
a dictionary with the response of the HTTP request.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.command({ ... "findOne": { ... "filter": {"match_id": "fight4"}, ... "projection": {"winner": True}, ... } ... })) {'data': {'document': {'winner': 'Victor'}}, 'status': ... # shortened
Expand source code
async def command( self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this table with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.command({ ... "findOne": { ... "filter": {"match_id": "fight4"}, ... "projection": {"winner": True}, ... } ... })) {'data': {'document': {'winner': 'Victor'}}, 'status': ... # shortened """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _cmd_desc: str if body: _cmd_desc = ",".join(sorted(body.keys())) else: _cmd_desc = "(none)" logger.info(f"command={_cmd_desc} on '{self.name}'") command_result = await self._api_commander.async_request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on '{self.name}'") return command_result
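Because `command` forwards the payload verbatim, it can act as an escape hatch for issuing any Data API command directly. Below is a small sketch (the `raw_count` helper is illustrative, not part of astrapy): it sends a raw `countDocuments` payload and reads the count out of the raw response; the same operation is also available in wrapped form as `count_documents`.

from astrapy.exceptions import DataAPIResponseException

async def raw_count(table) -> int:
    # Illustrative helper: send a raw countDocuments payload through
    # command() and read the count from the response dictionary.
    try:
        response = await table.command({"countDocuments": {"filter": {}}})
        return int(response["status"]["count"])
    except DataAPIResponseException as exc:
        # With raise_api_errors=True (the default), API-reported errors
        # surface as DataAPIResponseException.
        print(f"Data API error: {exc}")
        raise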
async def count_documents(self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> int
-
Count the rows in the table matching the specified filter.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"name": "John", "age": 59} {"$and": [{"name": {"$eq": "John"}}, {"age": {"$gt": 58}}]} See the Data API documentation for the full set of operators.
upper_bound
- a required ceiling on the result of the count operation. If the actual number of rows exceeds this value, an exception will be raised. Furthermore, if the actual number of rows exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
the exact count of matching rows.
Examples
>>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.insert_many([{"seq": i} for i in range(20)])) TableInsertManyResult(...) >>> asyncio.run(my_async_table.count_documents({}, upper_bound=100)) 20 >>> asyncio.run(my_async_table.count_documents({"seq":{"$gt": 15}}, upper_bound=100)) 4 >>> asyncio.run(my_async_table.count_documents({}, upper_bound=10)) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyRowsToCountException
Note
Count operations are expensive: for this reason, the best practice is to provide a reasonable
upper_bound
according to the caller's expectations. Moreover, indiscriminate usage of count operations for sizeable numbers of rows (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the number of rows it will count, and that an exception will be thrown by this method if this limit is encountered.
Expand source code
async def count_documents( self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Count the rows in the table matching the specified filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"name": "John", "age": 59} {"$and": [{"name": {"$eq": "John"}}, {"age": {"$gt": 58}}]} See the Data API documentation for the full set of operators. upper_bound: a required ceiling on the result of the count operation. If the actual number of rows exceeds this value, an exception will be raised. Furthermore, if the actual number of rows exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: the exact count of matching rows. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.insert_many([{"seq": i} for i in range(20)])) TableInsertManyResult(...) >>> asyncio.run(my_async_table.count_documents({}, upper_bound=100)) 20 >>> asyncio.run(my_async_table.count_documents({"seq":{"$gt": 15}}, upper_bound=100)) 4 >>> asyncio.run(my_async_table.count_documents({}, upper_bound=10)) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyRowsToCountException Note: Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller's expectations. Moreover, indiscriminate usage of count operations for sizeable numbers of rows (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the number of rows it will count, and that an exception will be thrown by this method if this limit is encountered. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) cd_payload = {"countDocuments": {"filter": filter}} logger.info(f"countDocuments on '{self.name}'") cd_response = await self._api_commander.async_request( payload=cd_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished countDocuments on '{self.name}'") if "count" in cd_response.get("status", {}): count: int = cd_response["status"]["count"] if cd_response["status"].get("moreData", False): raise TooManyRowsToCountException( text=f"Document count exceeds {count}, the maximum allowed by the server", server_max_count_exceeded=True, ) else: if count > upper_bound: raise TooManyRowsToCountException( text="Document count exceeds required upper bound", server_max_count_exceeded=False, ) else: return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from countDocuments API command.", raw_response=cd_response, )
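When the count may legitimately exceed the bound, the `TooManyRowsToCountException` raised in the source above can be handled explicitly. A minimal sketch (the helper name and the default bound of 1000 are illustrative choices, not astrapy API):

from astrapy.exceptions import TooManyRowsToCountException

async def count_up_to(table, filter, bound: int = 1000):
    # Return the exact count, or None when it exceeds either the given
    # bound or the server-side hard limit on countable rows.
    try:
        return await table.count_documents(filter, upper_bound=bound)
    except TooManyRowsToCountException:
        return None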
async def create_index(self, name: str, *, column: str, options: TableIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Create an index on a non-vector column of the table.
This is a blocking operation: the method returns once the index is created and ready to use.
For creation of a vector index, see method
create_vector_index
instead.Args
name
- the name of the index. Index names must be unique across the keyspace.
column
- the table column on which the index is to be created.
options
- if passed, it must be an instance of
TableIndexOptions
, or an equivalent dictionary, which specifies index settings such as – for a text column – case-sensitivity and so on. See the TableIndexOptions
class for more details. if_not_exists
- if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Examples
>>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.info import TableIndexOptions >>> >>> # create an index on a column >>> await my_async_table.create_index( ... "score_index", ... column="score", ... ) >>> >>> # create an index on a textual column, specifying indexing options >>> await my_async_table.create_index( ... "winner_index", ... column="winner", ... options=TableIndexOptions( ... ascii=False, ... normalize=True, ... case_sensitive=False, ... ), ... )
Expand source code
async def create_index( self, name: str, *, column: str, options: TableIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Create an index on a non-vector column of the table. This is a blocking operation: the method returns once the index is created and ready to use. For creation of a vector index, see method `create_vector_index` instead. Args: name: the name of the index. Index names must be unique across the keyspace. column: the table column on which the index is to be created. options: if passed, it must be an instance of `TableIndexOptions`, or an equivalent dictionary, which specifies index settings such as -- for a text column -- case-sensitivity and so on. See the `astrapy.info.TableIndexOptions` class for more details. if_not_exists: if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.info import TableIndexOptions >>> >>> # create an index on a column >>> await my_async_table.create_index( ... "score_index", ... column="score", ... ) >>> >>> # create an index on a textual column, specifying indexing options >>> await my_async_table.create_index( ... "winner_index", ... column="winner", ... options=TableIndexOptions( ... ascii=False, ... normalize=True, ... case_sensitive=False, ... ), ... ) """ ci_definition: dict[str, Any] = TableIndexDefinition( column=column, options=TableIndexOptions.coerce(options or {}), ).as_dict() ci_command = "createIndex" return await self._create_generic_index( i_name=name, ci_definition=ci_definition, ci_command=ci_command, if_not_exists=if_not_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, )
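As with `create_vector_index` below, passing `if_not_exists=True` makes index creation idempotent, which suits setup code that may run more than once. A short sketch (index and column names follow the examples above; to be run inside an async context):

# Safe to run at every application startup: repeated invocations are
# no-ops because the index already exists.
await my_async_table.create_index(
    "score_index",
    column="score",
    if_not_exists=True,
)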
async def create_vector_index(self, name: str, *, column: str, options: TableVectorIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Create a vector index on a vector column of the table, enabling vector similarity search operations on it.
This is a blocking operation: the method returns once the index is created and ready to use.
For creation of a non-vector index, see method create_index instead.
Args
name
- the name of the index. Index names must be unique across the keyspace.
column
- the table column, of type "vector", on which to create the index.
options
- an instance of TableVectorIndexOptions, or an equivalent dictionary, which specifies settings for the vector index, such as the metric to use or, if desired, a "source model" setting. If omitted, the Data API defaults will apply for the index. See the astrapy.info.TableVectorIndexOptions class for more details.
if_not_exists
- if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> from astrapy.constants import VectorMetric
>>> from astrapy.info import TableVectorIndexOptions
>>>
>>> # create a vector index with dot-product similarity
>>> await my_async_table.create_vector_index(
...     "m_vector_index",
...     column="m_vector",
...     options=TableVectorIndexOptions(
...         metric=VectorMetric.DOT_PRODUCT,
...     ),
... )
>>> # specify a source_model (since the previous statement
>>> # succeeded, this will do nothing because of if_not_exists):
>>> await my_async_table.create_vector_index(
...     "m_vector_index",
...     column="m_vector",
...     options=TableVectorIndexOptions(
...         metric=VectorMetric.DOT_PRODUCT,
...         source_model="nv-qa-4",
...     ),
...     if_not_exists=True,
... )
>>> # leave the settings to the Data API defaults of cosine
>>> # similarity metric (since the previous statement
>>> # succeeded, this will do nothing because of if_not_exists):
>>> await my_async_table.create_vector_index(
...     "m_vector_index",
...     column="m_vector",
...     if_not_exists=True,
... )
Expand source code
async def create_vector_index(
    self,
    name: str,
    *,
    column: str,
    options: TableVectorIndexOptions | dict[str, Any] | None = None,
    if_not_exists: bool | None = None,
    table_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Create a vector index on a vector column of the table, enabling
    vector similarity search operations on it.

    This is a blocking operation: the method returns once the index
    is created and ready to use.

    For creation of a non-vector index, see method `create_index` instead.
    """
    ci_definition: dict[str, Any] = TableVectorIndexDefinition(
        column=column,
        options=TableVectorIndexOptions.coerce(options),
    ).as_dict()
    ci_command = "createVectorIndex"
    return await self._create_generic_index(
        i_name=name,
        ci_definition=ci_definition,
        ci_command=ci_command,
        if_not_exists=if_not_exists,
        table_admin_timeout_ms=table_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
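The point of the index is to enable vector "sort" clauses on the column. A short sketch of the round trip, assuming the m_vector column and query vectors of the examples above (VectorMetric.COSINE is the documented Data API default metric):

from astrapy.constants import VectorMetric
from astrapy.info import TableVectorIndexOptions

# Sketch: create the index, then exploit it with a vector search.
await my_async_table.create_vector_index(
    "m_vector_index",
    column="m_vector",
    options=TableVectorIndexOptions(metric=VectorMetric.COSINE),
    if_not_exists=True,
)
best_match = await my_async_table.find_one(
    {},
    sort={"m_vector": [0.2, 0.3, 0.4]},  # plain lists are accepted as vectors
)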
async def definition(self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> ListTableDefinition
-
Query the Data API and return a structure defining the table schema. If there are no unsupported columns in the table, the return value has the same contents as could have been provided to a create_table method call.
Args
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Returns
A ListTableDefinition object, available for inspection.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(my_async_table.definition())
ListTableDefinition(columns=[match_id,round,fighters, ...  # shortened
Expand source code
async def definition(
    self,
    *,
    table_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> ListTableDefinition:
    """
    Query the Data API and return a structure defining the table schema.
    If there are no unsupported columns in the table, the return value has
    the same contents as could have been provided to a `create_table`
    method call.
    """
    _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta(
        timeout_options=self.api_options.timeout_options,
        table_admin_timeout_ms=table_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    logger.info(f"getting tables in search of '{self.name}'")
    self_descriptors = [
        table_desc
        for table_desc in await self.database._list_tables_ctx(
            keyspace=None,
            timeout_context=_TimeoutContext(
                request_ms=_table_admin_timeout_ms,
                label=_ta_label,
            ),
        )
        if table_desc.name == self.name
    ]
    logger.info(f"finished getting tables in search of '{self.name}'")
    if self_descriptors:
        return self_descriptors[0].definition
    else:
        raise ValueError(
            f"Table {self.keyspace}.{self.name} not found.",
        )
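Since the returned definition has the same contents as a create_table payload (per the note above), one natural use is cloning a table's schema. A sketch, under the assumption that create_table accepts the fetched definition object as its definition argument; "games_copy" is a hypothetical name:

# Sketch: fetch the schema and create an identically-shaped table.
table_def = await my_async_table.definition()
cloned_table = await my_async_table.database.create_table(
    "games_copy",  # hypothetical name for the clone
    definition=table_def,
)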
async def delete_many(self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Delete all rows matching a provided filter condition. This operation can target from a single row to the entirety of the table.
Args
filter
- a filter dictionary to specify which row(s) must be deleted.
1. If the filter is in the form {"pk1": val1, "pk2": val2 ...} and specifies the primary key in full, at most one row is deleted, the one with that primary key.
2. If the table has "partitionSort" columns, some or all of them may be left out (the least significant of them can also employ an inequality, or range, predicate): a range of rows, but always within a single partition, will be deleted.
3. If an empty filter, {}, is passed, this operation empties the table completely. USE WITH CARE.
4. Other kinds of filtering clauses are forbidden.
In the following examples, the table is partitioned by columns ["pa1", "pa2"] and has partitionSort "ps1" and "ps2" in that order.
Valid filter examples:
- {"pa1": x, "pa2": y, "ps1": z, "ps2": t}: deletes one row
- {"pa1": x, "pa2": y, "ps1": z}: deletes multiple rows
- {"pa1": x, "pa2": y, "ps1": z, "ps2": {"$lt": q}}: deletes multiple rows
- {"pa1": x, "pa2": y}: deletes all rows in the partition
- {}: empties the table (CAUTION)
Invalid filter examples:
- {"pa1": x}: incomplete partition key
- {"pa1": x, "ps1": z}: incomplete partition key (whatever is added)
- {"pa1": x, "pa2": y, "ps1": {"$lt": r}, "ps2": t}: inequality on a non-least-significant partitionSort column
- {"pa1": x, "pa2": y, "ps2": t}: cannot skip "ps1"
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Examples
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> # Delete a single row (full primary key specified):
>>> await my_async_table.delete_many({"match_id": "fight4", "round": 1})
>>>
>>> # Delete part of a partition (inequality on the
>>> # last-mentioned 'partitionSort' column):
>>> await my_async_table.delete_many({"match_id": "fight5", "round": {"$gte": 5}})
>>>
>>> # Delete a whole partition (leave 'partitionSort' unspecified):
>>> await my_async_table.delete_many({"match_id": "fight7"})
>>>
>>> # empty the table entirely with empty filter (*CAUTION*):
>>> await my_async_table.delete_many({})
Expand source code
async def delete_many(
    self,
    filter: FilterType,
    *,
    general_method_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Delete all rows matching a provided filter condition.
    This operation can target from a single row to the entirety of the table.
    """
    _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm(
        timeout_options=self.api_options.timeout_options,
        general_method_timeout_ms=general_method_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    dm_payload = self._converter_agent.preprocess_payload(
        {
            "deleteMany": {
                k: v
                for k, v in {
                    "filter": filter,
                }.items()
                if v is not None
            }
        }
    )
    logger.info(f"deleteMany on '{self.name}'")
    dm_response = await self._api_commander.async_request(
        payload=dm_payload,
        timeout_context=_TimeoutContext(
            request_ms=_request_timeout_ms, label=_rt_label
        ),
    )
    logger.info(f"finished deleteMany on '{self.name}'")
    if dm_response.get("status", {}).get("deletedCount") == -1:
        return
    else:
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from deleteMany API command.",
            raw_response=dm_response,
        )
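Rule 2 above allows a range predicate on the least significant partitionSort column. A sketch using the example table's layout (partition key match_id, partitionSort round), under the assumption that a two-sided range in a single filter entry is accepted:

# Sketch: delete rounds 2, 3 and 4 of one match -- a bounded range on
# the last 'partitionSort' column, within the single partition selected
# by "match_id".
await my_async_table.delete_many(
    {"match_id": "fight5", "round": {"$gte": 2, "$lt": 5}},
)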
async def delete_one(self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Delete a row, matching the provided value of the primary key. If no row is found with that primary key, the method does nothing.
Args
filter
- a predicate expressing the table primary key in full, i.e. a dictionary defining values for all columns that form the primary key. A row (at most one) is deleted if it matches that primary key. An example filter may be {"match_id": "fight4", "round": 1}.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Examples
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> # Count the rows matching a certain filter
>>> len(asyncio.run(my_async_table.find({"match_id": "fight7"}).to_list()))
3
>>>
>>> # Delete a row belonging to the group
>>> asyncio.run(
...     my_async_table.delete_one({"match_id": "fight7", "round": 2})
... )
>>>
>>> # Count again
>>> len(asyncio.run(my_async_table.find({"match_id": "fight7"}).to_list()))
2
>>>
>>> # Attempt the delete again (nothing to delete)
>>> asyncio.run(
...     my_async_table.delete_one({"match_id": "fight7", "round": 2})
... )
>>>
>>> # The count is unchanged
>>> len(asyncio.run(my_async_table.find({"match_id": "fight7"}).to_list()))
2
Expand source code
async def delete_one(
    self,
    filter: FilterType,
    *,
    general_method_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Delete a row, matching the provided value of the primary key.
    If no row is found with that primary key, the method does nothing.
    """
    _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm(
        timeout_options=self.api_options.timeout_options,
        general_method_timeout_ms=general_method_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    do_payload = self._converter_agent.preprocess_payload(
        {
            "deleteOne": {
                k: v
                for k, v in {
                    "filter": filter,
                }.items()
                if v is not None
            }
        }
    )
    logger.info(f"deleteOne on '{self.name}'")
    do_response = await self._api_commander.async_request(
        payload=do_payload,
        timeout_context=_TimeoutContext(
            request_ms=_request_timeout_ms, label=_rt_label
        ),
    )
    logger.info(f"finished deleteOne on '{self.name}'")
    if do_response.get("status", {}).get("deletedCount") == -1:
        return
    else:
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from deleteOne API command.",
            raw_response=do_response,
        )
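Because delete_one is silent whether or not a matching row existed, callers who need confirmation can pair it with find_one. A minimal sketch, reusing the primary key of the examples above:

# Sketch: delete, then verify the row is gone (find_one returns None).
pk = {"match_id": "fight4", "round": 1}
await my_async_table.delete_one(pk)
assert await my_async_table.find_one(pk) is None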
async def distinct(self, key: str, *, filter: FilterType | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[typing.Any]
-
Return a list of the unique values of key across the rows in the table that match the provided filter.
Args
key
- the name of the field whose value is inspected across rows. Keys are typically just column names, although they can use the dot notation to select particular entries in map columns. For set and list columns, individual entries are "unrolled" automatically; in particular, for lists, numeric indices can be used in the key dot-notation syntax. Examples of acceptable key values: "a_column", "map_column.map_key", "list_column.2".
filter
- a dictionary expressing which condition the inspected rows must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are {} (zero filter), {"match_no": 123} (a shorthand for {"match_no": {"$eq": 123}}), or {"match_no": 123, "round": "C"} (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method, being based on find (see), may entail successive HTTP API requests, depending on the amount of involved rows. If not provided, this object's defaults apply.
request_timeout_ms
- a timeout, in milliseconds, for each API request. If not provided, this object's defaults apply.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a list of all different values for key found across the rows that match the filter. The result list has no repeated items.
Examples
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(my_async_table.distinct(
...     "winner",
...     filter={"match_id": "challenge6"},
... ))
['Donna', 'Erick', 'Fiona']
>>>
>>> # distinct values across the whole table:
>>> # (not recommended performance-wise)
>>> asyncio.run(my_async_table.distinct("winner"))
The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ...
['Victor', 'Adam Zuul', 'Betta Vigo', 'Caio Gozer', 'Donna', 'Erick', ...
>>>
>>> # Over a column containing null values
>>> # (also with composite filter):
>>> asyncio.run(my_async_table.distinct(
...     "score",
...     filter={"match_id": {"$in": ["fight4", "tournamentA"]}},
... ))
[18, None]
>>>
>>> # distinct over a set column (automatically "unrolled"):
>>> asyncio.run(my_async_table.distinct(
...     "fighters",
...     filter={"match_id": {"$in": ["fight4", "tournamentA"]}},
... ))
[UUID('0193539a-2770-8c09-a32a-111111111111'), UUID('019353e3-00b4-...
Note
It must be kept in mind that distinct is a client-side operation, which effectively browses all required rows using the logic of the find method and collects the unique values found for key. As such, there may be performance, latency and ultimately billing implications if the amount of matching rows is large.
Note
For details on the behaviour of "distinct" in conjunction with real-time changes in the table contents, see the Note of the find command.
Expand source code
async def distinct(
    self,
    key: str,
    *,
    filter: FilterType | None = None,
    request_timeout_ms: int | None = None,
    general_method_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> list[Any]:
    """
    Return a list of the unique values of `key` across the rows
    in the table that match the provided filter.
    """
    # lazy-import here to avoid circular import issues
    from astrapy.cursors import AsyncTableFindCursor

    _general_method_timeout_ms, _gmt_label = _first_valid_timeout(
        (general_method_timeout_ms, "general_method_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (
            self.api_options.timeout_options.general_method_timeout_ms,
            "general_method_timeout_ms",
        ),
    )
    _request_timeout_ms, _rt_label = _first_valid_timeout(
        (request_timeout_ms, "request_timeout_ms"),
        (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
    )
    # preparing cursor:
    _extractor = _create_document_key_extractor(key)
    _key = _reduce_distinct_key_to_shallow_safe(key)
    if _key == "":
        raise ValueError(
            "The 'key' parameter for distinct cannot be empty "
            "or start with a list index."
        )
    # relaxing the type hint (limited to within this method body)
    f_cursor: AsyncTableFindCursor[dict[str, Any], dict[str, Any]] = (
        AsyncTableFindCursor(
            table=self,
            request_timeout_ms=_request_timeout_ms,
            overall_timeout_ms=_general_method_timeout_ms,
            request_timeout_label=_rt_label,
            overall_timeout_label=_gmt_label,
        )  # type: ignore[assignment]
        .filter(filter)
        .project({_key: True})
    )
    # consuming it:
    _item_hashes = set()
    distinct_items: list[Any] = []
    logger.info(f"running distinct() on '{self.name}'")
    async for document in f_cursor:
        for item in _extractor(document):
            _item_hash = _hash_document(
                item, options=self.api_options.serdes_options
            )
            if _item_hash not in _item_hashes:
                _item_hashes.add(_item_hash)
                distinct_items.append(item)
    logger.info(f"finished running distinct() on '{self.name}'")
    return distinct_items
async def drop(self, *, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Drop the table, i.e. delete it from the database along with all the rows stored therein.
Args
if_exists
- if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> # List tables:
>>> asyncio.run(my_async_table.database.list_table_names())
['games']
>>>
>>> # Drop this table:
>>> asyncio.run(my_async_table.drop())
>>>
>>> # List tables again:
>>> asyncio.run(my_async_table.database.list_table_names())
[]
>>>
>>> # Try working on the table now:
>>> from astrapy.exceptions import DataAPIResponseException
>>>
>>> async def try_use_table():
...     try:
...         await my_async_table.find_one({})
...     except DataAPIResponseException as err:
...         print(str(err))
...
>>> asyncio.run(try_use_table())
Collection does not exist [...] (COLLECTION_NOT_EXIST)
Note
Use with caution.
Note
Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense, as the underlying table no longer exists. It is the developer's responsibility to design a correct flow that avoids any further use of a dropped table.
Expand source code
async def drop(
    self,
    *,
    if_exists: bool | None = None,
    table_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> dict[str, Any]:
    """
    Drop the table, i.e. delete it from the database along with
    all the rows stored therein.
    """
    logger.info(f"dropping table '{self.name}' (self)")
    drop_result = await self.database.drop_table(
        self.name,
        if_exists=if_exists,
        table_admin_timeout_ms=table_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    logger.info(f"finished dropping table '{self.name}' (self)")
    return drop_result
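The if_exists flag, documented above, makes the operation idempotent. A minimal sketch:

# Sketch: with if_exists=True the drop never raises for a missing table.
await my_async_table.drop(if_exists=True)
await my_async_table.drop(if_exists=True)  # silently does nothing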
async def estimated_document_count(self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> int
-
Query the API server for an estimate of the document count in the table.
Contrary to count_documents, this method has no filtering parameters.
Args
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a server-provided estimate count of the documents in the table.
Example
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> asyncio.run(my_async_table.estimated_document_count())
5820
Expand source code
async def estimated_document_count(
    self,
    *,
    general_method_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> int:
    """
    Query the API server for an estimate of the document count in the table.

    Contrary to `count_documents`, this method has no filtering parameters.
    """
    _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm(
        timeout_options=self.api_options.timeout_options,
        general_method_timeout_ms=general_method_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    ed_payload: dict[str, Any] = {"estimatedDocumentCount": {}}
    logger.info(f"estimatedDocumentCount on '{self.name}'")
    ed_response = await self._api_commander.async_request(
        payload=ed_payload,
        timeout_context=_TimeoutContext(
            request_ms=_request_timeout_ms, label=_rt_label
        ),
    )
    logger.info(f"finished estimatedDocumentCount on '{self.name}'")
    if "count" in ed_response.get("status", {}):
        count: int = ed_response["status"]["count"]
        return count
    else:
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from estimatedDocumentCount API command.",
            raw_response=ed_response,
        )
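To contrast the two counting methods mentioned above: the estimate is cheap but approximate, while count_documents (documented elsewhere for this class) applies a filter and an upper bound. A sketch, assuming count_documents takes a filter plus an upper_bound keyword argument:

# Sketch: approximate, unfiltered estimate vs. exact, bounded count.
approx = await my_async_table.estimated_document_count()
exact = await my_async_table.count_documents({}, upper_bound=10_000)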
def find(self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: type[ROW2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AsyncTableFindCursor[ROW, ROW2]
-
Find rows on the table matching the provided filters and according to sorting criteria including vector similarity.
The returned TableFindCursor object, representing the stream of results, can be iterated over, or consumed and manipulated in several other ways (see the examples below and the
TableFindCursor
documentation for details). Since the amount of returned items can be large, TableFindCursor is a lazy object, that fetches new data while it is being read using the Data API pagination mechanism.Invoking
.to_list()
on a TableFindCursor will cause it to consume all rows and materialize the entire result set as a list. This is not recommended if the amount of results is very large.Args
filter
- a dictionary expressing which condition the returned rows
must satisfy. The filter can use operators, such as "$eq" for equality,
and require columns to compare with literal values. Simple examples
are
{}
(zero filter, not recommended for large tables),{"match_no": 123}
(a shorthand for{"match_no": {"$eq": 123}}
, or{"match_no": 123, "round": "C"}
(multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. projection
- a prescription on which columns to return for the matching rows.
The projection can take the form
{"column1": True, "column2": True}
.{"*": True}
(i.e. return the whole row), or the complementary form that excludes columns:{"column1": False, "column2": False}
. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. row_type
- this parameter acts a formal specifier for the type checker.
If omitted, the resulting cursor is implicitly an
AsyncTableFindCursor[ROW, ROW]
, i.e. maintains the same type for the items it returns as that for the rows in the table. Strictly typed code may want to specify this parameter especially when a projection is given. skip
- if provided, it is a number of rows that would be obtained first in the response and are instead skipped.
limit
- a maximum amount of rows to get from the table. The returned cursor will stop yielding rows when either this number is reached or there really are no more matches in the table.
include_similarity
- a boolean to request the numeric value of the
similarity to be returned as an added "$similarity" key in each returned
row. It can be used meaningfully only in a vector search (see
sort
). include_sort_vector
- a boolean to request the search query vector.
If set to True (and if the search is a vector search), calling
the
get_sort_vector
method on the returned cursor will yield the vector used for the ANN search. sort
- this dictionary parameter controls the order in which the rows
are returned. The sort parameter can express either a vector search or
a regular (ascending/descending, even hierarchical) sorting.
* For a vector search the parameter takes the form
{"vector_column": qv}
, with the query vectorqv
of the appropriate type (list of floats or DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form{"vectorize_enabled_column": "query text"}
is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are{"score": SortMode.ASCENDING}
(equivalently{"score": +1}
),{"score": +1, "when": -1}
. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications and limitations on the amount of items returned. Consult the Data API documentation for more details on this topic. request_timeout_ms
- a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply.
timeout_ms
- an alias for
request_timeout_ms
.
Returns
a TableFindCursor object, that can be iterated over (and manipulated in several ways), that if needed handles pagination under the hood as the rows are consumed.
Note
As the rows are retrieved in chunks progressively, while the cursor is being iterated over, it is possible that the actual results obtained will reflect changes occurring to the table contents in real time.
Examples
>>> # NOTE: may require slight adaptation to an async context.
>>>
>>> # Iterate over results:
>>> async def loop1():
...     async for row in my_async_table.find({"match_id": "challenge6"}):
...         print(f"(R:{row['round']}): winner {row['winner']}")
...
>>> asyncio.run(loop1())
(R:1): winner Donna
(R:2): winner Erick
(R:3): winner Fiona
>>>
>>> # Optimize bandwidth using a projection:
>>> proj = {"round": True, "winner": True}
>>> async def loop2():
...     async for row in my_async_table.find(
...         {"match_id": "challenge6"},
...         projection=proj,
...     ):
...         print(f"(R:{row['round']}): winner {row['winner']}")
...
>>> asyncio.run(loop2())
(R:1): winner Donna
(R:2): winner Erick
(R:3): winner Fiona
>>>
>>> # Filter on the partitioning:
>>> asyncio.run(
...     my_async_table.find({"match_id": "challenge6"}).to_list()
... )
[{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ...
>>>
>>> # Filter on primary key:
>>> asyncio.run(
...     my_async_table.find(
...         {"match_id": "challenge6", "round": 1}
...     ).to_list()
... )
[{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ...
>>>
>>> # Filter on a regular indexed column:
>>> asyncio.run(my_async_table.find({"winner": "Caio Gozer"}).to_list())
[{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ...
>>>
>>> # Non-equality filter on a regular indexed column:
>>> asyncio.run(my_async_table.find({"score": {"$gte": 15}}).to_list())
[{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193...
>>>
>>> # Filter on a regular non-indexed column:
>>> # (not recommended performance-wise)
>>> asyncio.run(my_async_table.find(
...     {"when": {
...         "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z")
...     }}
... ).to_list())
The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ...
[{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193...
>>>
>>> # Empty filter (not recommended performance-wise):
>>> asyncio.run(my_async_table.find({}).to_list())
The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ...
[{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193...
>>>
>>> # Filter on the primary key and a regular non-indexed column:
>>> # (not recommended performance-wise)
>>> asyncio.run(my_async_table.find(
...     {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"}
... ).to_list())
The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ...
[{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ...
>>>
>>> # Filter on a regular non-indexed column (and incomplete primary key)
>>> # (not recommended performance-wise)
>>> asyncio.run(my_async_table.find(
...     {"round": 3, "winner": "Caio Gozer"}
... ).to_list())
The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ...
[{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ...
>>>
>>> # Vector search with "sort" (on an appropriately-indexed vector column):
>>> asyncio.run(my_async_table.find(
...     {},
...     sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])},
...     projection={"winner": True},
...     limit=3,
... ).to_list())
[{'winner': 'Donna'}, {'winner': 'Victor'}]
>>>
>>> # Hybrid search with vector sort and non-vector filtering:
>>> asyncio.run(my_async_table.find(
...     {"match_id": "fight4"},
...     sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])},
...     projection={"winner": True},
... ).to_list())
[{'winner': 'Victor'}]
>>>
>>> # Return the numeric value of the vector similarity
>>> # (also demonstrating that one can pass a plain list for a vector):
>>> asyncio.run(my_async_table.find(
...     {},
...     sort={"m_vector": [0.2, 0.3, 0.4]},
...     projection={"winner": True},
...     limit=3,
...     include_similarity=True,
... ).to_list())
[{'winner': 'Donna', '$similarity': 0.515}, {'winner': 'Victor', ...
>>>
>>> # Non-vector sorting on a 'partitionSort' column:
>>> asyncio.run(my_async_table.find(
...     {"match_id": "fight5"},
...     sort={"round": SortMode.DESCENDING},
...     projection={"winner": True},
... ).to_list())
[{'winner': 'Caio Gozer'}, {'winner': 'Betta Vigo'}, ...
>>>
>>> # Using skip and limit:
>>> asyncio.run(my_async_table.find(
...     {"match_id": "fight5"},
...     sort={"round": SortMode.DESCENDING},
...     projection={"winner": True},
...     skip=1,
...     limit=2,
... ).to_list())
The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING...
[{'winner': 'Betta Vigo'}, {'winner': 'Adam Zuul'}]
>>>
>>> # Non-vector sorting on a regular column:
>>> # (not recommended performance-wise)
>>> asyncio.run(my_async_table.find(
...     {"match_id": "fight5"},
...     sort={"winner": SortMode.ASCENDING},
...     projection={"winner": True},
... ).to_list())
The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING...
[{'winner': 'Adam Zuul'}, {'winner': 'Betta Vigo'}, ...
>>>
>>> # Using .map() on a cursor:
>>> winner_cursor = my_async_table.find(
...     {"match_id": "fight5"},
...     sort={"round": SortMode.DESCENDING},
...     projection={"winner": True},
...     limit=5,
... )
>>> print("/".join(asyncio.run(
...     winner_cursor.map(lambda row: row["winner"].upper()).to_list())
... ))
CAIO GOZER/BETTA VIGO/ADAM ZUUL
>>>
>>> # Some other examples of cursor manipulation
>>> matches_async_cursor = my_async_table.find(
...     sort={"m_vector": DataAPIVector([-0.1, 0.15, 0.3])}
... )
>>> asyncio.run(matches_async_cursor.has_next())
True
>>> asyncio.run(matches_async_cursor.__anext__())
{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193...
>>> matches_async_cursor.consumed
1
>>> matches_async_cursor.rewind()
>>> matches_async_cursor.consumed
0
>>> asyncio.run(matches_async_cursor.has_next())
True
>>> matches_async_cursor.close()
>>>
>>> async def try_consume():
...     try:
...         await matches_async_cursor.__anext__()
...     except StopAsyncIteration:
...         print("StopAsyncIteration triggered.")
...
>>> asyncio.run(try_consume())
StopAsyncIteration triggered.
Expand source code
def find(
    self,
    filter: FilterType | None = None,
    *,
    projection: ProjectionType | None = None,
    row_type: type[ROW2] | None = None,
    skip: int | None = None,
    limit: int | None = None,
    include_similarity: bool | None = None,
    include_sort_vector: bool | None = None,
    sort: SortType | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> AsyncTableFindCursor[ROW, ROW2]:
    """
    Find rows on the table matching the provided filters
    and according to sorting criteria including vector similarity.
    """
    # lazy-import here to avoid circular import issues
    from astrapy.cursors import AsyncTableFindCursor

    _request_timeout_ms, _rt_label = _first_valid_timeout(
        (request_timeout_ms, "request_timeout_ms"),
        (timeout_ms, "timeout_ms"),
        (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"),
    )
    return (
        AsyncTableFindCursor(
            table=self,
            request_timeout_ms=_request_timeout_ms,
            overall_timeout_ms=None,
            request_timeout_label=_rt_label,
        )
        .filter(filter)
        .project(projection)
        .skip(skip)
        .limit(limit)
        .sort(sort)
        .include_similarity(include_similarity)
        .include_sort_vector(include_sort_vector)
    )
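Given the cursor's laziness noted above, aggregating over a large result set is best done by iterating rather than calling .to_list(). A sketch using the example table's score column:

# Sketch: lazy iteration keeps memory bounded, since pages are fetched
# on demand; contrast with .to_list(), which materializes everything.
total_score = 0
async for row in my_async_table.find(
    {"match_id": "fight5"},
    projection={"score": True},
):
    total_score += row["score"] or 0  # 'score' may be null in some rows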
async def find_one(self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> Optional[~ROW]
-
Run a search according to the given filtering and sorting criteria and return the top row matching it, or nothing if there are none.
The parameters are analogous to some of the parameters to the
find
method (which has a few more that do not make sense in this case, such aslimit
).Args
filter
- a dictionary expressing which condition the returned row
must satisfy. The filter can use operators, such as "$eq" for equality,
and require columns to compare with literal values. Simple examples
are
{}
(zero filter),{"match_no": 123}
(a shorthand for{"match_no": {"$eq": 123}}
, or{"match_no": 123, "round": "C"}
(multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. projection
- a prescription on which columns to return for the matching row.
The projection can take the form
{"column1": True, "column2": True}
.{"*": True}
(i.e. return the whole row), or the complementary form that excludes columns:{"column1": False, "column2": False}
. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. include_similarity
- a boolean to request the numeric value of the
similarity to be returned as an added "$similarity" key in the returned
row. It can be used meaningfully only in a vector search (see
sort
). sort
- this dictionary parameter controls the sorting order, hence determines
which row is being returned.
The sort parameter can express either a vector search or
a regular (ascending/descending, even hierarchical) sorting.
* For a vector search the parameter takes the form
{"vector_column": qv}
, with the query vectorqv
of the appropriate type (list of floats or DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form{"vectorize_enabled_column": "query text"}
is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are{"score": SortMode.ASCENDING}
(equivalently{"score": +1}
),{"score": +1, "when": -1}
. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications. Consult the Data API documentation for more details on this topic. general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
a dictionary expressing the result if a row is found, otherwise None.
Examples
>>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.constants import SortMode >>> from astrapy.data_types import DataAPITimestamp, DataAPIVector >>> >>> # Filter on the partitioning: >>> asyncio.run(my_async_table.find_one({"match_id": "challenge6"})) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # A find with no matches: >>> str(asyncio.run(my_async_table.find_one({"match_id": "not_real"}))) 'None' >>> >>> # Optimize bandwidth using a projection: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "challenge6"}, ... projection={"round": True, "winner": True}, ... )) {'round': 1, 'winner': 'Donna'} >>> >>> # Filter on primary key: >>> asyncio.run( ... my_async_table.find_one({"match_id": "challenge6", "round": 1}) ... ) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> asyncio.run(my_async_table.find_one({"winner": "Caio Gozer"})) {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> asyncio.run(my_async_table.find_one({"score": {"$gte": 15}})) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... )) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter: >>> asyncio.run(my_async_table.find_one({})) The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... )) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> asyncio.run( ... my_async_table.find_one({"round": 3, "winner": "Caio Gozer"}) ... ) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> asyncio.run(my_async_table.find_one( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... )) {'winner': 'Donna'} >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... )) {'winner': 'Victor'} >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> asyncio.run(my_async_table.find_one( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... include_similarity=True, ... )) {'winner': 'Donna', '$similarity': 0.515} >>> >>> # Non-vector sorting on a 'partitionSort' column: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ...
projection={"winner": True}, ... )) {'winner': 'Caio Gozer'} >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... )) The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... {'winner': 'Adam Zuul'}
Expand source code
async def find_one( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> ROW | None: """ Run a search according to the given filtering and sorting criteria and return the top row matching it, or nothing if there are none. The parameters are analogous to some of the parameters to the `find` method (which has a few more that do not make sense in this case, such as `limit`). Args: filter: a dictionary expressing which condition the returned row must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are `{}` (zero filter), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`, or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. projection: a prescription on which columns to return for the matching row. The projection can take the form `{"column1": True, "column2": True}`. `{"*": True}` (i.e. return the whole row), or the complementary form that excludes columns: `{"column1": False, "column2": False}`. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned row. It can be used meaningfully only in a vector search (see `sort`). sort: this dictionary parameter controls the sorting order, hence determines which row is being returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form `{"vector_column": qv}`, with the query vector `qv` of the appropriate type (list of floats or DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form `{"vectorize_enabled_column": "query text"}` is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are `{"score": SortMode.ASCENDING}` (equivalently `{"score": +1}`), `{"score": +1, "when": -1}`. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications. Consult the Data API documentation for more details on this topic. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary expressing the result if a row is found, otherwise None. Examples: >>> # NOTE: may require slight adaptation to an async context. 
>>> >>> from astrapy.constants import SortMode >>> from astrapy.data_types import DataAPITimestamp, DataAPIVector >>> >>> # Filter on the partitioning: >>> asyncio.run(my_async_table.find_one({"match_id": "challenge6"})) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # A find with no matches: >>> str(asyncio.run(my_async_table.find_one({"match_id": "not_real"}))) 'None' >>> >>> # Optimize bandwidth using a projection: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "challenge6"}, ... projection={"round": True, "winner": True}, ... )) {'round': 1, 'winner': 'Donna'} >>> >>> # Filter on primary key: >>> asyncio.run( ... my_async_table.find_one({"match_id": "challenge6", "round": 1}) ... ) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> asyncio.run(my_async_table.find_one({"winner": "Caio Gozer"})) {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> asyncio.run(my_async_table.find_one({"score": {"$gte": 15}})) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... )) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter: >>> asyncio.run(my_async_table.find_one({})) The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... )) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> asyncio.run( ... my_async_table.find_one({"round": 3, "winner": "Caio Gozer"}) ... ) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> asyncio.run(my_async_table.find_one( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... )) {'winner': 'Donna'} >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... )) {'winner': 'Victor'} >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> asyncio.run(my_async_table.find_one( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... include_similarity=True, ... )) {'winner': 'Donna', '$similarity': 0.515} >>> >>> # Non-vector sorting on a 'partitionSort' column: >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ...
)) {'winner': 'Caio Gozer'} >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> asyncio.run(my_async_table.find_one( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... )) The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... {'winner': 'Adam Zuul'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) fo_options = ( None if include_similarity is None else {"includeSimilarity": include_similarity} ) fo_payload = self._converter_agent.preprocess_payload( { "findOne": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "options": fo_options, "sort": sort, }.items() if v is not None } } ) fo_response = await self._api_commander.async_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) if "document" not in (fo_response.get("data") or {}): raise UnexpectedDataAPIResponseException( text="Response from findOne API command missing 'document'.", raw_response=fo_response, ) if "projectionSchema" not in (fo_response.get("status") or {}): raise UnexpectedDataAPIResponseException( text="Response from findOne API command missing 'projectionSchema'.", raw_response=fo_response, ) doc_response = fo_response["data"]["document"] if doc_response is None: return None return self._converter_agent.postprocess_row( fo_response["data"]["document"], columns_dict=fo_response["status"]["projectionSchema"], similarity_pseudocolumn="$similarity" if include_similarity else None, )
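The doctest examples above wrap every call in asyncio.run for display purposes; in real asynchronous code the method would normally be awaited inside a coroutine. A minimal sketch, assuming the same my_async_table and schema as in the examples (the helper function name is illustrative):

import asyncio

from astrapy.data_types import DataAPIVector

async def closest_winner() -> None:
    # vector search for the single best-matching row
    # ('my_async_table' and its schema as in the examples above)
    row = await my_async_table.find_one(
        {},
        sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])},
        projection={"winner": True},
    )
    if row is not None:
        print(row["winner"])

asyncio.run(closest_winner())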
async def info(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> TableInfo
-
Return information on the table. This should not be confused with the table definition (i.e. the schema).
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
database_admin_timeout_ms
. timeout_ms
- an alias for
database_admin_timeout_ms
.
Returns
A TableInfo object for inspection.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> # Note: output reformatted for clarity. >>> asyncio.run(my_async_table.info()) TableInfo( database_info=AstraDBDatabaseInfo(id=..., name=..., ...), keyspace='default_keyspace', name='games', full_name='default_keyspace.games' )
Expand source code
async def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInfo: """ Return information on the table. This should not be confused with the table definition (i.e. the schema). Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A TableInfo object for inspection. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Note: output reformatted for clarity. >>> asyncio.run(my_async_table.info()) TableInfo( database_info=AstraDBDatabaseInfo(id=..., name=..., ...), keyspace='default_keyspace', name='games', full_name='default_keyspace.games' ) """ db_info = await self.database.info( database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return TableInfo( database_info=db_info, keyspace=self.keyspace, name=self.name, full_name=self.full_name, )
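A short usage sketch for the returned TableInfo, reading the attributes shown in the example output above (helper function name illustrative):

import asyncio

async def describe_table() -> None:
    # 'my_async_table' as in the examples above
    info = await my_async_table.info()
    print(info.full_name)           # e.g. 'default_keyspace.games'
    print(info.database_info.name)  # the owning database's name

asyncio.run(describe_table())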
async def insert_many(self, rows: Iterable[ROW], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> TableInsertManyResult
-
Insert a number of rows into the table, with implied overwrite in case of primary key collision.
Inserting rows whose primary keys correspond to entries already stored in the table has the effect of an in-place update: the rows are overwritten. However, if the rows being inserted are partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e.
None
,{}
or analogous.Args
rows
- an iterable of dictionaries, each expressing a row to insert. Each row must at least fully specify the primary key column values, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in each row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null.
ordered
- if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster.
chunk_size
- how many rows to include in each single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default.
concurrency
- maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the whole operation, which may consist of several API requests. If not provided, this object's defaults apply.
request_timeout_ms
- a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply.
timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
a TableInsertManyResult object, whose attributes are the primary keys of the inserted rows, both in the form of dictionaries and of tuples.
Examples
>>> # NOTE: may require slight adaptation to an async context. >>> >>> # Insert complete and partial rows at once (concurrently) >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = asyncio.run(my_async_table.insert_many( ... [ ... { ... "match_id": "fight4", ... "round": 1, ... "winner": "Victor", ... "score": 18, ... "when": DataAPITimestamp.from_string( ... "2024-11-28T11:30:00Z", ... ), ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID('019353e3-00b4-83f9-a127-222222222222'), ... ]), ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... }, ... {"match_id": "fight5", "round": 1, "winner": "Adam"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio"}, ... { ... "match_id": "challenge6", ... "round": 1, ... "winner": "Donna", ... "m_vector": [0.9, -0.1, -0.3], ... }, ... {"match_id": "challenge6", "round": 2, "winner": "Erick"}, ... {"match_id": "challenge6", "round": 3, "winner": "Fiona"}, ... {"match_id": "tournamentA", "round": 1, "winner": "Gael"}, ... {"match_id": "tournamentA", "round": 2, "winner": "Hanna"}, ... { ... "match_id": "tournamentA", ... "round": 3, ... "winner": "Ian", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... {"match_id": "fight7", "round": 1, "winner": "Joy"}, ... {"match_id": "fight7", "round": 2, "winner": "Kevin"}, ... {"match_id": "fight7", "round": 3, "winner": "Lauretta"}, ... ], ... concurrency=10, ... chunk_size=3, ... )) >>> insert_result.inserted_ids [{'match_id': 'fight4', 'round': 1}, {'match_id': 'fight5', ... >>> insert_result.inserted_id_tuples [('fight4', 1), ('fight5', 1), ('fight5', 2), ('fight5', 3), ... >>> >>> # Ordered insertion >>> # (would stop on first failure; predictable end result on DB) >>> asyncio.run(my_async_table.insert_many( ... [ ... {"match_id": "fight5", "round": 1, "winner": "Adam0"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta0"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio0"}, ... {"match_id": "fight5", "round": 1, "winner": "Adam Zuul"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta Vigo"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"}, ... ], ... ordered=True, ... )) TableInsertManyResult(inserted_ids=[{'match_id': 'fight5', 'round': 1}, ...
Note
Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the row sequence is important.
Note
If some of the rows are unsuitable for insertion, for instance have the wrong data type for a column or lack the primary key, the Data API validation check will fail for those specific requests that contain the faulty rows. Depending on concurrency and the value of the
ordered
parameter, a number of rows in general could have been successfully inserted. It is possible to capture such a scenario, and inspect which rows actually got inserted, by catching an error of typeTableInsertManyException
: itspartial_result
attribute is precisely aTableInsertManyResult
, encoding details on the successful writes.Expand source code
async def insert_many( self, rows: Iterable[ROW], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInsertManyResult: """ Insert a number of rows into the table, with implied overwrite in case of primary key collision. Inserting rows whose primary key correspond to entries alredy stored in the table has the effect of an in-place update: the rows are overwritten. However, if the rows being inserted are partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. `None`, `{}` or analogous. Args: rows: an iterable of dictionaries, each expressing a row to insert. Each row must at least fully specify the primary key column values, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in each row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null. ordered: if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions re to be preferred as they complete much faster. chunk_size: how many rows to include in each single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default. concurrency: maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions. general_method_timeout_ms: a timeout, in milliseconds, to impose on the whole operation, which may consist of several API requests. If not provided, this object's defaults apply. request_timeout_ms: a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a TableInsertManyResult object, whose attributes are the primary key of the inserted rows both in the form of dictionaries and of tuples. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # Insert complete and partial rows at once (concurrently) >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = asyncio.run(my_async_table.insert_many( ... [ ... { ... "match_id": "fight4", ... "round": 1, ... "winner": "Victor", ... "score": 18, ... "when": DataAPITimestamp.from_string( ... "2024-11-28T11:30:00Z", ... ), ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID('019353e3-00b4-83f9-a127-222222222222'), ... ]), ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... }, ... {"match_id": "fight5", "round": 1, "winner": "Adam"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio"}, ... { ... "match_id": "challenge6", ... "round": 1, ... "winner": "Donna", ... "m_vector": [0.9, -0.1, -0.3], ... }, ... {"match_id": "challenge6", "round": 2, "winner": "Erick"}, ... {"match_id": "challenge6", "round": 3, "winner": "Fiona"}, ... {"match_id": "tournamentA", "round": 1, "winner": "Gael"}, ... 
{"match_id": "tournamentA", "round": 2, "winner": "Hanna"}, ... { ... "match_id": "tournamentA", ... "round": 3, ... "winner": "Ian", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... {"match_id": "fight7", "round": 1, "winner": "Joy"}, ... {"match_id": "fight7", "round": 2, "winner": "Kevin"}, ... {"match_id": "fight7", "round": 3, "winner": "Lauretta"}, ... ], ... concurrency=10, ... chunk_size=3, ... )) >>> insert_result.inserted_ids [{'match_id': 'fight4', 'round': 1}, {'match_id': 'fight5', ... >>> insert_result.inserted_id_tuples [('fight4', 1), ('fight5', 1), ('fight5', 2), ('fight5', 3), ... >>> >>> # Ordered insertion >>> # (would stop on first failure; predictable end result on DB) >>> asyncio.run(my_async_table.insert_many( ... [ ... {"match_id": "fight5", "round": 1, "winner": "Adam0"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta0"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio0"}, ... {"match_id": "fight5", "round": 1, "winner": "Adam Zuul"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta Vigo"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"}, ... ], ... ordered=True, ... )) TableInsertManyResult(inserted_ids=[{'match_id': 'fight5', 'round': 1}, ... Note: Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the row sequence is important. Note: If some of the rows are unsuitable for insertion, for instance have the wrong data type for a column or lack the primary key, the Data API validation check will fail for those specific requests that contain the faulty rows. Depending on concurrency and the value of the `ordered` parameter, a number of rows in general could have been successfully inserted. It is possible to capture such a scenario, and inspect which rows actually got inserted, by catching an error of type `astrapy.exceptions.TableInsertManyException`: its `partial_result` attribute is precisely a `TableInsertManyResult`, encoding details on the successful writes. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) if concurrency is None: if ordered: _concurrency = 1 else: _concurrency = DEFAULT_INSERT_MANY_CONCURRENCY else: _concurrency = concurrency if _concurrency > 1 and ordered: raise ValueError("Cannot run ordered insert_many concurrently.") if chunk_size is None: _chunk_size = DEFAULT_INSERT_MANY_CHUNK_SIZE else: _chunk_size = chunk_size _rows = list(rows) logger.info(f"inserting {len(_rows)} rows in '{self.name}'") raw_results: list[dict[str, Any]] = [] timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) if ordered: options = {"ordered": True} inserted_ids: list[Any] = [] inserted_id_tuples: list[Any] = [] for i in range(0, len(_rows), _chunk_size): im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": _rows[i : i + _chunk_size], "options": options, }, }, ) logger.info(f"insertMany(chunk) on '{self.name}'") chunk_response = await self._api_commander.async_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") # accumulate the results in this call chunk_inserted_ids, chunk_inserted_ids_tuples = ( self._prepare_keys_from_status(chunk_response.get("status")) ) inserted_ids += chunk_inserted_ids inserted_id_tuples += chunk_inserted_ids_tuples raw_results += [chunk_response] # if errors, quit early if chunk_response.get("errors", []): partial_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) raise TableInsertManyException.from_response( command=None, raw_response=chunk_response, partial_result=partial_result, ) # return full_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) logger.info(f"finished inserting {len(_rows)} rows in '{self.name}'") return full_result else: # unordered: concurrent or not, do all of them and parse the results options = {"ordered": False} sem = asyncio.Semaphore(_concurrency) async def concurrent_insert_chunk( row_chunk: list[ROW], ) -> dict[str, Any]: async with sem: im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": row_chunk, "options": options, }, }, ) logger.info(f"insertMany(chunk) on '{self.name}'") im_response = await self._api_commander.async_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") return im_response if _concurrency > 1: tasks = [ asyncio.create_task( concurrent_insert_chunk(_rows[i : i + _chunk_size]) ) for i in range(0, len(_rows), _chunk_size) ] raw_results = await asyncio.gather(*tasks) else: raw_results = [ await concurrent_insert_chunk(_rows[i : i + _chunk_size]) for i in range(0, len(_rows), _chunk_size) ] # recast raw_results. 
Each response has its schema: unfold appropriately ids_and_tuples_per_chunk = [ self._prepare_keys_from_status(chunk_response.get("status")) for chunk_response in raw_results ] inserted_ids = [ inserted_id for chunk_ids, _ in ids_and_tuples_per_chunk for inserted_id in chunk_ids ] inserted_id_tuples = [ inserted_id_tuple for _, chunk_id_tuples in ids_and_tuples_per_chunk for inserted_id_tuple in chunk_id_tuples ] # check-raise if any( [chunk_response.get("errors", []) for chunk_response in raw_results] ): partial_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) raise TableInsertManyException.from_responses( commands=[None for _ in raw_results], raw_responses=raw_results, partial_result=partial_result, ) # return full_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) logger.info(f"finished inserting {len(_rows)} rows in '{self.name}'") return full_result
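The note above describes how to recover from a partial failure; the following sketch shows one way to do it. The helper function name and the row contents are illustrative, while TableInsertManyException and its partial_result attribute are as documented above:

import asyncio

from astrapy.exceptions import TableInsertManyException

async def insert_with_recovery(rows: list) -> None:
    # 'my_async_table' is assumed as in the examples above
    try:
        result = await my_async_table.insert_many(rows, ordered=False)
        print(f"all {len(result.inserted_ids)} rows written")
    except TableInsertManyException as err:
        # 'partial_result' is a TableInsertManyResult restricted to the
        # rows that were actually written before/despite the failure
        written = err.partial_result.inserted_id_tuples
        print(f"only {len(written)} rows written: {written}")

# e.g.: asyncio.run(insert_with_recovery(some_rows)), with 'some_rows'
# shaped as in the insert_many examples above.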
async def insert_one(self, row: ROW, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> TableInsertOneResult
-
Insert a single row in the table, with implied overwrite in case of primary key collision.
Inserting a row whose primary key corresponds to an entry already stored in the table has the effect of an in-place update: the row is overwritten. However, if the row being inserted is partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e.
None
,{}
or analogous.Args
row
- a dictionary expressing the row to insert. The primary key must be specified in full, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in the row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
a TableInsertOneResult object, whose attributes are the primary key of the inserted row both in the form of a dictionary and of a tuple.
Examples
>>> # NOTE: may require slight adaptation to an async context. >>> >>> # a full-row insert using astrapy's datatypes >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... "score": 18, ... "when": DataAPITimestamp.from_string("2024-11-28T11:30:00Z"), ... "winner": "Victor", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... )) >>> insert_result.inserted_id {'match_id': 'mtch_0', 'round': 1} >>> insert_result.inserted_id_tuple ('mtch_0', 1) >>> >>> # a partial-row (which in this case overwrites some of the values) >>> asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "winner": "Victor Vector", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID("0193539a-2880-8875-9f07-222222222222"), ... ]), ... }, ... )) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 1} ... >>> >>> # another insertion demonstrating standard-library datatypes in values >>> import datetime >>> >>> asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 2, ... "winner": "Angela", ... "score": 25, ... "when": datetime.datetime( ... 2024, 7, 13, 12, 55, 30, 889, ... tzinfo=datetime.timezone.utc, ... ), ... "fighters": { ... UUID("019353cb-8e01-8276-a190-333333333333"), ... }, ... "m_vector": [0.4, -0.6, 0.2], ... }, ... )) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 2}, ...
Expand source code
async def insert_one( self, row: ROW, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInsertOneResult: """ Insert a single row in the table, with implied overwrite in case of primary key collision. Inserting a row whose primary key correspond to an entry alredy stored in the table has the effect of an in-place update: the row is overwritten. However, if the row being inserted is partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. `None`, `{}` or analogous. Args: row: a dictionary expressing the row to insert. The primary key must be specified in full, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in the row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a TableInsertOneResult object, whose attributes are the primary key of the inserted row both in the form of a dictionary and of a tuple. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> # a full-row insert using astrapy's datatypes >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... "score": 18, ... "when": DataAPITimestamp.from_string("2024-11-28T11:30:00Z"), ... "winner": "Victor", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... )) >>> insert_result.inserted_id {'match_id': 'mtch_0', 'round': 1} >>> insert_result.inserted_id_tuple ('mtch_0', 1) >>> >>> # a partial-row (which in this case overwrites some of the values) >>> asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "winner": "Victor Vector", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID("0193539a-2880-8875-9f07-222222222222"), ... ]), ... }, ... )) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 1} ... >>> >>> # another insertion demonstrating standard-library datatypes in values >>> import datetime >>> >>> asyncio.run(my_async_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 2, ... "winner": "Angela", ... "score": 25, ... "when": datetime.datetime( ... 2024, 7, 13, 12, 55, 30, 889, ... tzinfo=datetime.timezone.utc, ... ), ... "fighters": { ... UUID("019353cb-8e01-8276-a190-333333333333"), ... }, ... "m_vector": [0.4, -0.6, 0.2], ... }, ... )) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 2}, ... 
""" _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) io_payload = self._converter_agent.preprocess_payload( {"insertOne": {"document": row}} ) logger.info(f"insertOne on '{self.name}'") io_response = await self._api_commander.async_request( payload=io_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished insertOne on '{self.name}'") if "insertedIds" in io_response.get("status", {}): if not io_response["status"]["insertedIds"]: raise UnexpectedDataAPIResponseException( text="Response from insertOne API command has empty 'insertedIds'.", raw_response=io_response, ) if not io_response["status"]["primaryKeySchema"]: raise UnexpectedDataAPIResponseException( text="Response from insertOne API command has empty 'primaryKeySchema'.", raw_response=io_response, ) inserted_id_list = io_response["status"]["insertedIds"][0] inserted_id_tuple, inserted_id = self._converter_agent.postprocess_key( inserted_id_list, primary_key_schema_dict=io_response["status"]["primaryKeySchema"], ) return TableInsertOneResult( raw_results=[io_response], inserted_id=inserted_id, inserted_id_tuple=inserted_id_tuple, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from insertOne API command.", raw_response=io_response, )
async def list_index_names(self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
List the names of all indexes existing on this table.
Args
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Returns
a list of the index names as strings, in no particular order.
Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.list_index_names()) ['m_vector_index', 'winner_index', 'score_index']
Expand source code
async def list_index_names( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all indexes existing on this table. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of the index names as strings, in no particular order. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> asyncio.run(my_async_table.list_index_names()) ['m_vector_index', 'winner_index', 'score_index'] """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) li_payload: dict[str, Any] = {"listIndexes": {"options": {}}} logger.info("listIndexes") li_response = await self._api_commander.async_request( payload=li_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "indexes" not in li_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listIndexes API command.", raw_response=li_response, ) else: logger.info("finished listIndexes") return li_response["status"]["indexes"] # type: ignore[no-any-return]
async def list_indexes(self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[TableIndexDescriptor]
-
List the full definitions of all indexes existing on this table.
Args
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Returns
a list of
TableIndexDescriptor
objects in no particular order, each providing the details of an index present on the table.Example
>>> # NOTE: may require slight adaptation to an async context. >>> >>> indexes = asyncio.run(my_async_table.list_indexes()) >>> indexes [TableIndexDescriptor(name='m_vector_index', definition=...)...] >>> # (Note: shortened output above) >>> indexes[1].definition.column 'winner' >>> indexes[1].definition.options.case_sensitive False
Expand source code
async def list_indexes( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[TableIndexDescriptor]: """ List the full definitions of all indexes existing on this table. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of `astrapy.info.TableIndexDescriptor` objects in no particular order, each providing the details of an index present on the table. Example: >>> # NOTE: may require slight adaptation to an async context. >>> >>> indexes = asyncio.run(my_async_table.list_indexes()) >>> indexes [TableIndexDescriptor(name='m_vector_index', definition=...)...] >>> # (Note: shortened output above) >>> indexes[1].definition.column 'winner' >>> indexes[1].definition.options.case_sensitive False """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) li_payload: dict[str, Any] = {"listIndexes": {"options": {"explain": True}}} logger.info("listIndexes") li_response = await self._api_commander.async_request( payload=li_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "indexes" not in li_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listIndexes API command.", raw_response=li_response, ) else: logger.info("finished listIndexes") return [ TableIndexDescriptor.coerce(index_object) for index_object in li_response["status"]["indexes"] ]
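As a usage sketch, the returned descriptors can be used, for instance, to compute which columns are indexed and thus anticipate the MISSING_INDEX warnings mentioned in the find and find_one examples (helper function name illustrative):

import asyncio

async def indexed_columns() -> set:
    # 'my_async_table' as in the examples above
    indexes = await my_async_table.list_indexes()
    return {index.definition.column for index in indexes}

print(asyncio.run(indexed_columns()))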
def to_sync(self: AsyncTable[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> Table[ROW]
-
Create a Table from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this table in the copy (the database is converted into a synchronous object).
Args
embedding_api_key
- optional API key(s) for interacting with the table.
If an embedding service is configured, and this parameter is not None,
each Data API call will include the necessary embedding-related headers
as specified by this parameter. If a string is passed, it translates
into the one "embedding api key" header
(i.e.
EmbeddingAPIKeyHeaderProvider
). For some vectorize providers/models, if using header-based authentication, specialized subclasses ofEmbeddingHeadersProvider
should be supplied. api_options
- any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
the new copy, a Table instance.
Example
>>> my_async_table.to_sync().find_one( ... {"match_id": "fight4"}, ... projection={"winner": True}, ... ) {"pk": 1, "column": "value"}
Expand source code
def to_sync( self: AsyncTable[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: """ Create a Table from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this table in the copy (the database is converted into a synchronous object). Args: embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, a Table instance. Example: >>> my_async_table.to_sync().find_one( ... {"match_id": "fight4"}, ... projection={"winner": True}, ... ) {"pk": 1, "column": "value"} """ arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Table( database=self.database.to_sync(), name=self.name, keyspace=self.keyspace, api_options=final_api_options, )
async def update_one(self, filter: FilterType, update: dict[str, Any], *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Update a single row on the table, changing some or all of the columns, with the implicit behaviour of inserting a new row if no match is found.
Args
filter
- a predicate expressing the table primary key in full,
i.e. a dictionary defining values for all columns that form the
primary key. An example may be
{"match_id": "fight4", "round": 1}
. update
- the update prescription to apply to the row, expressed
as a dictionary conforming to the Data API syntax. The update
operators for tables are
$set
and$unset
(in particular, setting a column to None has the same effect as the $unset operator). Examples are{"$set": {"round": 12}}
and{"$unset": {"winner": "", "score": ""}}
. Note that the update operation cannot alter the primary key columns. See the Data API documentation for more details. general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Examples
>>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.data_types import DataAPISet >>> >>> # Set a new value for a column >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"winner": "Winona"}}, ... ) >>> >>> # Set a new value for a column while unsetting another column >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"winner": None, "score": 24}}, ... ) >>> >>> # Set a 'set' column to empty >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": DataAPISet()}}, ... ) >>> >>> # Set a 'set' column to empty using None >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": None}}, ... ) >>> >>> # Set a 'set' column to empty using a regular (empty) set >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": set()}}, ... ) >>> >>> # Set a 'set' column to empty using $unset >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$unset": {"fighters": None}}, ... ) >>> >>> # A non-existing primary key creates a new row >>> await my_async_table.update_one( ... {"match_id": "bar_fight", "round": 4}, ... update={"$set": {"score": 8, "winner": "Jack"}}, ... ) >>> >>> # Delete column values for a row (they'll read as None now) >>> await my_async_table.update_one( ... {"match_id": "challenge6", "round": 2}, ... update={"$unset": {"winner": None, "score": None}}, ... )
Note
a row created entirely with update operations (as opposed to insertions) may, correspondingly, be deleted by means of an $unset update on all columns.
Expand source code
async def update_one( self, filter: FilterType, update: dict[str, Any], *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Update a single document on the table, changing some or all of the columns, with the implicit behaviour of inserting a new row if no match is found. Args: filter: a predicate expressing the table primary key in full, i.e. a dictionary defining values for all columns that form the primary key. An example may be `{"match_id": "fight4", "round": 1}`. update: the update prescription to apply to the row, expressed as a dictionary conforming to the Data API syntax. The update operators for tables are `$set` and `$unset` (in particular, setting a column to None has the same effect as the $unset operator). Examples are `{"$set": {"round": 12}}` and `{"$unset": {"winner": "", "score": ""}}`. Note that the update operation cannot alter the primary key columns. See the Data API documentation for more details. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Examples: >>> # NOTE: may require slight adaptation to an async context. >>> >>> from astrapy.data_types import DataAPISet >>> >>> # Set a new value for a column >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"winner": "Winona"}}, ... ) >>> >>> # Set a new value for a column while unsetting another colum >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"winner": None, "score": 24}}, ... ) >>> >>> # Set a 'set' column to empty >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": DataAPISet()}}, ... ) >>> >>> # Set a 'set' column to empty using None >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": None}}, ... ) >>> >>> # Set a 'set' column to empty using a regular (empty) set >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": set()}}, ... ) >>> >>> # Set a 'set' column to empty using $unset >>> await my_async_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$unset": {"fighters": None}}, ... ) >>> >>> # A non-existing primary key creates a new row >>> await my_async_table.update_one( ... {"match_id": "bar_fight", "round": 4}, ... update={"$set": {"score": 8, "winner": "Jack"}}, ... ) >>> >>> # Delete column values for a row (they'll read as None now) >>> await my_async_table.update_one( ... {"match_id": "challenge6", "round": 2}, ... update={"$unset": {"winner": None, "score": None}}, ... ) Note: a row created entirely with update operations (as opposed to insertions) may, correspondingly, be deleted by means of an $unset update on all columns. 
""" _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) uo_payload = { "updateOne": { k: v for k, v in { "filter": filter, "update": self._converter_agent.preprocess_payload(update), }.items() if v is not None } } logger.info(f"updateOne on '{self.name}'") uo_response = await self._api_commander.async_request( payload=uo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished updateOne on '{self.name}'") if "status" in uo_response: # the contents are disregarded and the method just returns: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from updateOne API command.", raw_response=uo_response, )
def with_options(self: AsyncTable[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> AsyncTable[ROW]
-
Create a clone of this table with some changed attributes.
Args
embedding_api_key
- optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new AsyncTable instance.
Example
>>> table_with_api_key_configured = my_async_table.with_options( ... embedding_api_key="secret-key-0123abcd...", ... )
Expand source code
def with_options( self: AsyncTable[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: """ Create a clone of this table with some changed attributes. Args: embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new AsyncTable instance. Example: >>> table_with_api_key_configured = my_async_table.with_options( ... embedding_api_key="secret-key-0123abcd...", ... ) """ return self._copy( embedding_api_key=embedding_api_key, api_options=api_options, )
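As a further illustration, here is a sketch combining both parameters on a clone. It assumes the names documented in the astrapy.api_options and astrapy.authentication submodules of this reference (in particular, that APIOptions accepts a TimeoutOptions instance); the key and timeout values are placeholders:

from astrapy.api_options import APIOptions, TimeoutOptions
from astrapy.authentication import EmbeddingAPIKeyHeaderProvider

# Derive a clone carrying vectorize credentials and a tighter
# per-request timeout; `my_async_table` itself is left unchanged.
tuned_table = my_async_table.with_options(
    embedding_api_key=EmbeddingAPIKeyHeaderProvider("secret-key-0123abcd..."),
    api_options=APIOptions(
        timeout_options=TimeoutOptions(request_timeout_ms=15000),
    ),
)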
class Collection (*, database: Database, name: str, keyspace: str | None, api_options: FullAPIOptions)
-
A Data API collection, the object to interact with the Data API for unstructured (schemaless) data, especially for DDL operations. This class has a synchronous interface.
This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as
get_collection
of Database, wherefrom the Collection inherits its API options such as authentication token and API endpoint.
Args
database
- a Database object, instantiated earlier. This represents the database the collection belongs to.
name
- the collection name. This parameter should match an existing collection on the database.
keyspace
- this is the keyspace to which the collection belongs. If nothing is specified, the database's working keyspace is used.
api_options
- a complete specification of the API Options for this instance.
Examples
>>> from astrapy import DataAPIClient >>> client = DataAPIClient() >>> database = client.get_database( ... "https://01234567-....apps.astra.datastax.com", ... token="AstraCS:..." ... )
>>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = database.create_collection( ... "my_events", ... definition=collection_definition, ... )
>>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ... ) >>> my_collection_1 = database.create_collection( ... "my_events", ... definition=collection_definition_1, ... ) >>>
>>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = database.create_collection( ... "my_events", ... definition=collection_definition_2, ... )
>>> # Get a reference to an existing collection >>> # (no checks are performed on DB) >>> my_collection_3a = database.get_collection("my_events") >>> my_collection_3b = database.my_events >>> my_collection_3c = database["my_events"]
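If the collection lives in a keyspace other than the database's working one, the keyspace can be supplied explicitly when obtaining the reference (a sketch; "other_keyspace" is a placeholder name):

>>> # Get a reference to a collection in an explicitly-chosen keyspace
>>> my_collection_4 = database.get_collection(
...     "my_events",
...     keyspace="other_keyspace",
... )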
Note
creating an instance of Collection does not trigger actual creation of the collection on the database. The latter should have been created beforehand, e.g. through the
create_collection
method of a Database.
Expand source code
class Collection(Generic[DOC]): """ A Data API collection, the object to interact with the Data API for unstructured (schemaless) data, especially for DDL operations. This class has a synchronous interface. This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as `get_collection` of Database, wherefrom the Collection inherits its API options such as authentication token and API endpoint. Args: database: a Database object, instantiated earlier. This represents the database the collection belongs to. name: the collection name. This parameter should match an existing collection on the database. keyspace: this is the keyspace to which the collection belongs. If nothing is specified, the database's working keyspace is used. api_options: a complete specification of the API Options for this instance. Examples: >>> from astrapy import DataAPIClient >>> client = DataAPIClient() >>> database = client.get_database( ... "https://01234567-....apps.astra.datastax.com", ... token="AstraCS:..." ... ) >>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = database.create_collection( ... "my_events", ... definition=collection_definition, ... ) >>> >>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ... ) >>> my_collection_1 = database.create_collection( ... "my_events", ... definition=collection_definition_1, ... ) >>> >>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = database.create_collection( ... "my_events", ... definition=collection_definition_2, ... ) >>> # Get a reference to an existing collection >>> # (no checks are performed on DB) >>> my_collection_3a = database.get_collection("my_events") >>> my_collection_3b = database.my_events >>> my_collection_3c = database["my_events"] Note: creating an instance of Collection does not trigger actual creation of the collection on the database. The latter should have been created beforehand, e.g. through the `create_collection` method of a Database. 
""" def __init__( self, *, database: Database, name: str, keyspace: str | None, api_options: FullAPIOptions, ) -> None: self.api_options = api_options self._name = name _keyspace = keyspace if keyspace is not None else database.keyspace if _keyspace is None: raise ValueError("Attempted to create Collection with 'keyspace' unset.") self._database = database._copy( keyspace=_keyspace, api_options=self.api_options ) self._commander_headers = { **{DEFAULT_DATA_API_AUTH_HEADER: self.api_options.token.get_token()}, **self.api_options.embedding_api_key.get_headers(), **self.api_options.database_additional_headers, } self._api_commander = self._get_api_commander() def __repr__(self) -> str: _db_desc = f'database.api_endpoint="{self.database.api_endpoint}"' return ( f'{self.__class__.__name__}(name="{self.name}", ' f'keyspace="{self.keyspace}", {_db_desc}, ' f"api_options={self.api_options})" ) def __eq__(self, other: Any) -> bool: if isinstance(other, Collection): return all( [ self._name == other._name, self._database == other._database, self.api_options == other.api_options, ] ) else: return False def __call__(self, *pargs: Any, **kwargs: Any) -> None: raise TypeError( f"'{self.__class__.__name__}' object is not callable. If you " f"meant to call the '{self.name}' method on a " f"'{self.database.__class__.__name__}' object " "it is failing because no such method exists." ) def _get_api_commander(self) -> APICommander: """Instantiate a new APICommander based on the properties of this class.""" if self._database.keyspace is None: raise ValueError( "No keyspace specified. Collection requires a keyspace to " "be set, e.g. through the `keyspace` constructor parameter." ) base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self._database.api_options.data_api_url_options.api_path, self._database.api_options.data_api_url_options.api_version, self._database.keyspace, self._name, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" api_commander = APICommander( api_endpoint=self._database.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, handle_decimals_writes=( self.api_options.serdes_options.use_decimals_in_collections ), handle_decimals_reads=( self.api_options.serdes_options.use_decimals_in_collections ), ) return api_commander def _converted_request( self, *, http_method: str = HttpMethod.POST, payload: dict[str, Any] | None = None, additional_path: str | None = None, request_params: dict[str, Any] = {}, raise_api_errors: bool = True, timeout_context: _TimeoutContext, ) -> dict[str, Any]: converted_payload = preprocess_collection_payload( payload, options=self.api_options.serdes_options ) raw_response_json = self._api_commander.request( http_method=http_method, payload=converted_payload, additional_path=additional_path, request_params=request_params, raise_api_errors=raise_api_errors, timeout_context=timeout_context, ) response_json = postprocess_collection_response( raw_response_json, options=self.api_options.serdes_options ) return response_json def _copy( self: Collection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Collection( database=self.database, 
name=self.name, keyspace=self.keyspace, api_options=final_api_options, ) def with_options( self: Collection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: """ Create a clone of this collection with some changed attributes. Args: embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new Collection instance. Example: >>> collection_with_api_key_configured = my_collection.with_options( ... embedding_api_key="secret-key-0123abcd...", ... ) """ return self._copy( embedding_api_key=embedding_api_key, api_options=api_options, ) def to_async( self: Collection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: """ Create an AsyncCollection from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this collection in the copy (the database is converted into an async object). Args: embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, an AsyncCollection instance. Example: >>> asyncio.run(my_coll.to_async().count_documents({}, upper_bound=100)) 77 """ arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AsyncCollection( database=self.database.to_async(), name=self.name, keyspace=self.keyspace, api_options=final_api_options, ) def options( self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDefinition: """ Get the collection options, i.e. its configuration as read from the database. The method issues a request to the Data API each time it is invoked, without caching mechanisms: this ensures up-to-date information for usages such as real-time collection validation by the application.
Args: collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a CollectionDefinition instance describing the collection. (See also the database `list_collections` method.) Example: >>> my_coll.options() CollectionDefinition(vector=CollectionVectorOptions(dimension=3, metric='cosine')) """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"getting collections in search of '{self.name}'") self_descriptors = [ coll_desc for coll_desc in self.database._list_collections_ctx( keyspace=None, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label, ), ) if coll_desc.name == self.name ] logger.info(f"finished getting collections in search of '{self.name}'") if self_descriptors: return self_descriptors[0].definition else: raise ValueError( f"Collection {self.keyspace}.{self.name} not found.", ) def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInfo: """ Information on the collection (name, location, database), in the form of a CollectionInfo object. Not to be confused with the collection `options` method (related to the collection internal configuration). Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Example: >>> my_coll.info().database_info.region 'eu-west-1' >>> my_coll.info().full_name 'default_keyspace.my_v_collection' Note: the returned CollectionInfo wraps, among other things, the database information: as such, calling this method triggers the same-named method of a Database object (which, in turn, performs a HTTP request to the DevOps API). See the documentation for `Database.info()` for more details. """ return CollectionInfo( database_info=self.database.info( database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ), keyspace=self.keyspace, name=self.name, full_name=self.full_name, ) @property def database(self) -> Database: """ a Database object, the database this collection belongs to. Example: >>> my_coll.database.name 'the_application_database' """ return self._database @property def keyspace(self) -> str: """ The keyspace this collection is in. Example: >>> my_coll.keyspace 'default_keyspace' """ _keyspace = self.database.keyspace if _keyspace is None: raise ValueError("The collection's DB is set with keyspace=None") return _keyspace @property def name(self) -> str: """ The name of this collection. Example: >>> my_coll.name 'my_v_collection' """ return self._name @property def full_name(self) -> str: """ The fully-qualified collection name within the database, in the form "keyspace.collection_name". 
Example: >>> my_coll.full_name 'default_keyspace.my_v_collection' """ return f"{self.keyspace}.{self.name}" def insert_one( self, document: DOC, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInsertOneResult: """ Insert a single document in the collection in an atomic operation. Args: document: the dictionary expressing the document to insert. The `_id` field of the document can be left out, in which case it will be created automatically. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionInsertOneResult object. Examples: >>> my_coll.count_documents({}, upper_bound=10) 0 >>> my_coll.insert_one( ... { ... "age": 30, ... "name": "Smith", ... "food": ["pear", "peach"], ... "likes_fruit": True, ... }, ... ) CollectionInsertOneResult(raw_results=..., inserted_id='ed4587a4-...-...-...') >>> my_coll.insert_one({"_id": "user-123", "age": 50, "name": "Maccio"}) CollectionInsertOneResult(raw_results=..., inserted_id='user-123') >>> my_coll.count_documents({}, upper_bound=10) 2 >>> my_coll.insert_one({"tag": "v", "$vector": [10, 11]}) CollectionInsertOneResult(...) Note: If an `_id` is explicitly provided, which corresponds to a document that exists already in the collection, an error is raised and the insertion fails. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) io_payload = {"insertOne": {"document": document}} logger.info(f"insertOne on '{self.name}'") io_response = self._converted_request( payload=io_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished insertOne on '{self.name}'") if "insertedIds" in io_response.get("status", {}): if io_response["status"]["insertedIds"]: inserted_id = io_response["status"]["insertedIds"][0] return CollectionInsertOneResult( raw_results=[io_response], inserted_id=inserted_id, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from insert_one API command.", raw_response=io_response, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from insert_one API command.", raw_response=io_response, ) def insert_many( self, documents: Iterable[DOC], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInsertManyResult: """ Insert a list of documents into the collection. This is not an atomic operation. Args: documents: an iterable of dictionaries, each a document to insert. Documents may specify their `_id` field or leave it out, in which case it will be added automatically. ordered: if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster. chunk_size: how many documents to include in a single API request. 
Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default. concurrency: maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionInsertManyResult object. Examples: >>> my_coll.count_documents({}, upper_bound=10) 0 >>> my_coll.insert_many( ... [{"a": 10}, {"a": 5}, {"b": [True, False, False]}], ... ordered=True, ... ) CollectionInsertManyResult(raw_results=..., inserted_ids=['184bb06f-...', '...', '...']) >>> my_coll.count_documents({}, upper_bound=100) 3 >>> my_coll.insert_many( ... [{"seq": i} for i in range(50)], ... concurrency=5, ... ) CollectionInsertManyResult(raw_results=..., inserted_ids=[... ...]) >>> my_coll.count_documents({}, upper_bound=100) 53 >>> my_coll.insert_many( ... [ ... {"tag": "a", "$vector": [1, 2]}, ... {"tag": "b", "$vector": [3, 4]}, ... ] ... ) CollectionInsertManyResult(...) Note: Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the document sequence is important. Note: A failure mode for this command is related to certain faulty documents found among those to insert: a document may have an `_id` already present on the collection, or its vector dimension may not match the collection setting. For an ordered insertion, the method will raise an exception at the first such faulty document -- nevertheless, all documents processed until then will end up being written to the database. For unordered insertions, if the error stems from faulty documents the insertion proceeds until exhausting the input documents: then, an exception is raised -- and all insertable documents will have been written to the database, including those "after" the troublesome ones. If, on the other hand, there are errors not related to individual documents (such as a network connectivity error), the whole `insert_many` operation will stop midway, an exception will be raised, and only a certain amount of the input documents will have made their way to the database.
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) if concurrency is None: if ordered: _concurrency = 1 else: _concurrency = DEFAULT_INSERT_MANY_CONCURRENCY else: _concurrency = concurrency if _concurrency > 1 and ordered: raise ValueError("Cannot run ordered insert_many concurrently.") if chunk_size is None: _chunk_size = DEFAULT_INSERT_MANY_CHUNK_SIZE else: _chunk_size = chunk_size _documents = list(documents) logger.info(f"inserting {len(_documents)} documents in '{self.name}'") raw_results: list[dict[str, Any]] = [] timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) if ordered: options = {"ordered": True} inserted_ids: list[Any] = [] for i in range(0, len(_documents), _chunk_size): im_payload = { "insertMany": { "documents": _documents[i : i + _chunk_size], "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") chunk_response = self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") # accumulate the results in this call chunk_inserted_ids = (chunk_response.get("status") or {}).get( "insertedIds", [] ) inserted_ids += chunk_inserted_ids raw_results += [chunk_response] # if errors, quit early if chunk_response.get("errors", []): partial_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) raise CollectionInsertManyException.from_response( command=None, raw_response=chunk_response, partial_result=partial_result, ) # return full_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) logger.info( f"finished inserting {len(_documents)} documents in '{self.name}'" ) return full_result else: # unordered: concurrent or not, do all of them and parse the results options = {"ordered": False} if _concurrency > 1: with ThreadPoolExecutor(max_workers=_concurrency) as executor: def _chunk_insertor( document_chunk: list[dict[str, Any]], ) -> dict[str, Any]: im_payload = { "insertMany": { "documents": document_chunk, "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") im_response = self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") return im_response raw_results = list( executor.map( _chunk_insertor, ( _documents[i : i + _chunk_size] for i in range(0, len(_documents), _chunk_size) ), ) ) else: for i in range(0, len(_documents), _chunk_size): im_payload = { "insertMany": { "documents": _documents[i : i + _chunk_size], "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") im_response = self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") 
raw_results.append(im_response) # recast raw_results inserted_ids = [ inserted_id for chunk_response in raw_results for inserted_id in (chunk_response.get("status") or {}).get( "insertedIds", [] ) ] # check-raise if any( [chunk_response.get("errors", []) for chunk_response in raw_results] ): partial_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) raise CollectionInsertManyException.from_responses( commands=[None for _ in raw_results], raw_responses=raw_results, partial_result=partial_result, ) # return full_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) logger.info( f"finished inserting {len(_documents)} documents in '{self.name}'" ) return full_result @overload def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionFindCursor[DOC, DOC]: ... @overload def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: type[DOC2], skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionFindCursor[DOC, DOC2]: ... def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: type[DOC2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionFindCursor[DOC, DOC2]: """ Find documents on the collection, matching a certain provided filter. The method returns a Cursor that can then be iterated over. Depending on the method call pattern, the iteration over all documents can reflect collection mutations that occurred since the `find` method was called, or not. In cases where the cursor reflects mutations in real-time, it will iterate over documents in an approximate way (i.e. exhibiting occasional skipped or duplicate documents). This happens when making use of the `sort` option in a non-vector-search manner. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list.
The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. document_type: this parameter acts as a formal specifier for the type checker. If omitted, the resulting cursor is implicitly a `CollectionFindCursor[DOC, DOC]`, i.e. maintains the same type for the items it returns as that for the documents in the collection. Strictly typed code may want to specify this parameter especially when a projection is given. skip: with this integer parameter, what would be the first `skip` documents returned by the query are discarded, and the results start from the (skip+1)-th document. This parameter can be used only in conjunction with an explicit `sort` criterion of the ascending/descending type (i.e. it cannot be used when not sorting, nor with vector-based ANN search). limit: this (integer) parameter sets a limit over how many documents are returned. Once `limit` is reached (or the cursor is exhausted for lack of matching documents), nothing more is returned. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": ...}. include_sort_vector: a boolean to request the search query vector. If set to True (and if the invocation is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search. sort: with this dictionary parameter one can control the order the documents are returned. See the Note about sorting, as well as the one about upper bounds, for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. request_timeout_ms: a timeout, in milliseconds, for each single one of the underlying HTTP requests used to fetch documents as the cursor is iterated over. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `request_timeout_ms`. Returns: a Cursor object representing iterations over the matching documents (see the Cursor object for how to use it. The simplest thing is to run a for loop: `for document in collection.find(...):`). Examples: >>> filter = {"seq": {"$exists": True}} >>> for doc in my_coll.find(filter, projection={"seq": True}, limit=5): ... print(doc["seq"]) ... 37 35 10 36 27 >>> cursor1 = my_coll.find( ... {}, ... limit=4, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) >>> [doc["_id"] for doc in cursor1] ['97e85f81-...', '1581efe4-...', '...', '...'] >>> cursor2 = my_coll.find({}, limit=3) >>> cursor2.distinct("seq") [37, 35, 10] >>> my_coll.insert_many([ ... {"tag": "A", "$vector": [4, 5]}, ... {"tag": "B", "$vector": [3, 4]}, ... {"tag": "C", "$vector": [3, 2]}, ... {"tag": "D", "$vector": [4, 1]}, ... {"tag": "E", "$vector": [2, 5]}, ... ]) >>> ann_tags = [ ... document["tag"] ... for document in my_coll.find( ... {}, ... sort={"$vector": [3, 3]}, ... limit=3, ... ) ... ] >>> ann_tags ['A', 'B', 'C'] >>> # (assuming the collection has metric VectorMetric.COSINE) >>> cursor = my_coll.find( ... sort={"$vector": [3, 3]}, ... limit=3, ... include_sort_vector=True, ... ) >>> cursor.get_sort_vector() [3.0, 3.0] >>> matches = list(cursor) >>> cursor.get_sort_vector() [3.0, 3.0] Note: The following are example values for the `sort` parameter.
When no particular order is required: sort={} # (default when parameter not provided) When sorting by a certain value in ascending/descending order: sort={"field": SortMode.ASCENDING} sort={"field": SortMode.DESCENDING} When sorting first by "field" and then by "subfield" (while modern Python versions preserve the order of dictionaries, it is suggested for clarity to employ a `collections.OrderedDict` in these cases): sort={ "field": SortMode.ASCENDING, "subfield": SortMode.ASCENDING, } When running a vector similarity (ANN) search: sort={"$vector": [0.4, 0.15, -0.5]} Note: Some combinations of arguments impose an implicit upper bound on the number of documents that are returned by the Data API. More specifically: (a) Vector ANN searches cannot return more than a number of documents that at the time of writing is set to 1000 items. (b) When using a sort criterion of the ascending/descending type, the Data API will return a smaller number of documents, set to 20 at the time of writing, and stop there. The returned documents are the top results across the whole collection according to the requested criterion. These provisions should be kept in mind even when subsequently running a command such as `.distinct()` on a cursor. Note: When not specifying sorting criteria at all (by vector or otherwise), the cursor can scroll through an arbitrary number of documents as the Data API and the client periodically exchange new chunks of documents. It should be noted that the behavior of the cursor in the case documents have been added/removed after the `find` was started depends on database internals and it is not guaranteed, nor excluded, that such "real-time" changes in the data would be picked up by the cursor. """ # lazy-import here to avoid circular import issues from astrapy.cursors import CollectionFindCursor _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) return ( CollectionFindCursor( collection=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=None, request_timeout_label=_rt_label, ) .filter(filter) .project(projection) .skip(skip) .limit(limit) .sort(sort) .include_similarity(include_similarity) .include_sort_vector(include_sort_vector) ) def find_one( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Run a search, returning the first document in the collection that matches provided filters, if any is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. 
For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": ...}. sort: with this dictionary parameter one can control the order the documents are returned. See the Note about sorting for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary expressing the required document, otherwise None. Examples: >>> my_coll.find_one({}) {'_id': '68d1e515-...', 'seq': 37} >>> my_coll.find_one({"seq": 10}) {'_id': 'd560e217-...', 'seq': 10} >>> my_coll.find_one({"seq": 1011}) >>> # (returns None for no matches) >>> my_coll.find_one({}, projection={"seq": False}) {'_id': '68d1e515-...'} >>> my_coll.find_one( ... {}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) {'_id': '97e85f81-...', 'seq': 69} >>> my_coll.find_one({}, sort={"$vector": [1, 0]}, projection={"*": True}) {'_id': '...', 'tag': 'D', '$vector': [4.0, 1.0]} Note: See the `find` method for more details on the accepted parameters (whereas `skip` and `limit` are not valid parameters for `find_one`). """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) fo_options = ( None if include_similarity is None else {"includeSimilarity": include_similarity} ) fo_payload = { "findOne": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "options": fo_options, "sort": sort, }.items() if v is not None } } fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) if "document" not in (fo_response.get("data") or {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findOne API command.", raw_response=fo_response, ) doc_response = fo_response["data"]["document"] if doc_response is None: return None return fo_response["data"]["document"] # type: ignore[no-any-return] def distinct( self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[Any]: """ Return a list of the unique values of `key` across the documents in the collection that match the provided filter. Args: key: the name of the field whose value is inspected across documents. 
Keys can use dot-notation to descend to deeper document levels. Example of acceptable `key` values: "field" "field.subfield" "field.3" "field.3.subfield" If lists are encountered and no numeric index is specified, all items in the list are visited. filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method, being based on `find` (see) may entail successive HTTP API requests, depending on the amount of involved documents. request_timeout_ms: a timeout, in milliseconds, for each API request. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a list of all different values for `key` found across the documents that match the filter. The result list has no repeated items. Example: >>> my_coll.insert_many( ... [ ... {"name": "Marco", "food": ["apple", "orange"], "city": "Helsinki"}, ... {"name": "Emma", "food": {"likes_fruit": True, "allergies": []}}, ... ] ... ) CollectionInsertManyResult(raw_results=..., inserted_ids=['c5b99f37-...', 'd6416321-...']) >>> my_coll.distinct("name") ['Marco', 'Emma'] >>> my_coll.distinct("city") ['Helsinki'] >>> my_coll.distinct("food") ['apple', 'orange', {'likes_fruit': True, 'allergies': []}] >>> my_coll.distinct("food.1") ['orange'] >>> my_coll.distinct("food.allergies") [] >>> my_coll.distinct("food.likes_fruit") [True] Note: It must be kept in mind that `distinct` is a client-side operation, which effectively browses all required documents using the logic of the `find` method and collects the unique values found for `key`. As such, there may be performance, latency and ultimately billing implications if the amount of matching documents is large. Note: For details on the behaviour of "distinct" in conjunction with real-time changes in the collection contents, see the Note of the `find` command. """ # lazy-import here to avoid circular import issues from astrapy.cursors import CollectionFindCursor _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) # preparing cursor: _extractor = _create_document_key_extractor(key) _key = _reduce_distinct_key_to_safe(key) if _key == "": raise ValueError( "The 'key' parameter for distinct cannot be empty " "or start with a list index." 
) # relaxing the type hint (limited to within this method body) f_cursor: CollectionFindCursor[dict[str, Any], dict[str, Any]] = ( CollectionFindCursor( collection=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=_general_method_timeout_ms, request_timeout_label=_rt_label, overall_timeout_label=_gmt_label, ) # type: ignore[assignment] .filter(filter) .project({_key: True}) ) # consuming it: _item_hashes = set() distinct_items: list[Any] = [] logger.info(f"running distinct() on '{self.name}'") for document in f_cursor: for item in _extractor(document): _item_hash = _hash_document( item, options=self.api_options.serdes_options ) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) logger.info(f"finished running distinct() on '{self.name}'") return distinct_items def count_documents( self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Count the documents in the collection matching the specified filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. upper_bound: a required ceiling on the result of the count operation. If the actual number of documents exceeds this value, an exception will be raised. Furthermore, if the actual number of documents exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: the exact count of matching documents. Example: >>> my_coll.insert_many([{"seq": i} for i in range(20)]) CollectionInsertManyResult(...) >>> my_coll.count_documents({}, upper_bound=100) 20 >>> my_coll.count_documents({"seq":{"$gt": 15}}, upper_bound=100) 4 >>> my_coll.count_documents({}, upper_bound=10) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyDocumentsToCountException Note: Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller expectations. Moreover, indiscriminate usage of count operations for sizeable amounts of documents (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the amount of documents it will count, and that an exception will be thrown by this method if this limit is encountered. 
""" _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) cd_payload = {"countDocuments": {"filter": filter}} logger.info(f"countDocuments on '{self.name}'") cd_response = self._converted_request( payload=cd_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished countDocuments on '{self.name}'") if "count" in cd_response.get("status", {}): count: int = cd_response["status"]["count"] if cd_response["status"].get("moreData", False): raise TooManyDocumentsToCountException( text=f"Document count exceeds {count}, the maximum allowed by the server", server_max_count_exceeded=True, ) else: if count > upper_bound: raise TooManyDocumentsToCountException( text="Document count exceeds required upper bound", server_max_count_exceeded=False, ) else: return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from countDocuments API command.", raw_response=cd_response, ) def estimated_document_count( self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Query the API server for an estimate of the document count in the collection. Contrary to `count_documents`, this method has no filtering parameters. Args: general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a server-provided estimate count of the documents in the collection. Example: >>> my_coll.estimated_document_count() 35700 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) ed_payload: dict[str, Any] = {"estimatedDocumentCount": {}} logger.info(f"estimatedDocumentCount on '{self.name}'") ed_response = self._converted_request( payload=ed_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished estimatedDocumentCount on '{self.name}'") if "count" in ed_response.get("status", {}): count: int = ed_response["status"]["count"] return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from estimatedDocumentCount API command.", raw_response=ed_response, ) def find_one_and_replace( self, filter: FilterType, replacement: DOC, *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = ReturnDocument.BEFORE, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document on the collection and replace it entirely with a new one, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. replacement: the new document to write into the collection. 
projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. return_document: a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before". general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: A document (or a projection thereof, as required), either the one before the replace operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no replacement was inserted (depending on the `return_document` parameter). Example: >>> my_coll.insert_one({"_id": "rule1", "text": "all animals are equal"}) CollectionInsertOneResult(...) >>> my_coll.find_one_and_replace( ... {"_id": "rule1"}, ... {"text": "some animals are more equal!"}, ... ) {'_id': 'rule1', 'text': 'all animals are equal'} >>> my_coll.find_one_and_replace( ... {"text": "some animals are more equal!"}, ... {"text": "and the pigs are the rulers"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) {'_id': 'rule1', 'text': 'and the pigs are the rulers'} >>> my_coll.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma^2"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) >>> # (returns None for no matches) >>> my_coll.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma"}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... projection={"_id": False}, ... 
) {'text': 'F=ma'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "returnDocument": return_document, "upsert": upsert, } fo_payload = { "findOneAndReplace": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "replacement": replacement, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"findOneAndReplace on '{self.name}'") fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndReplace on '{self.name}'") if "document" in fo_response.get("data", {}): ret_document = fo_response.get("data", {}).get("document") if ret_document is None: return None else: return ret_document # type: ignore[no-any-return] else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_replace API command.", raw_response=fo_response, ) def replace_one( self, filter: FilterType, replacement: DOC, *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Replace a single document on the collection with a new one, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. replacement: the new document to write into the collection. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the replace operation. Example: >>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) 
>>> my_coll.replace_one({"Marco": {"$exists": True}}, {"Buda": "Pest"}) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}) >>> my_coll.find_one({"Buda": "Pest"}) {'_id': '8424905a-...', 'Buda': 'Pest'} >>> my_coll.replace_one({"Mirco": {"$exists": True}}, {"Oh": "yeah?"}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.replace_one({"Mirco": {"$exists": True}}, {"Oh": "yeah?"}, upsert=True) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '931b47d6-...'}) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "upsert": upsert, } fo_payload = { "findOneAndReplace": { k: v for k, v in { "filter": filter, "replacement": replacement, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"findOneAndReplace on '{self.name}'") fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndReplace on '{self.name}'") if "document" in fo_response.get("data", {}): fo_status = fo_response.get("status") or {} _update_info = _prepare_update_info([fo_status]) return CollectionUpdateResult( raw_results=[fo_response], update_info=_update_info, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_replace API command.", raw_response=fo_response, ) def find_one_and_update( self, filter: FilterType, update: dict[str, Any], *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = ReturnDocument.BEFORE, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document on the collection and update it as requested, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. 
The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the updated one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. return_document: a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before". general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: A document (or a projection thereof, as required), either the one before the update operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no update was applied (depending on the `return_document` parameter). Example: >>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) >>> my_coll.find_one_and_update( ... {"Marco": {"$exists": True}}, ... {"$set": {"title": "Mr."}}, ... ) {'_id': 'a80106f2-...', 'Marco': 'Polo'} >>> my_coll.find_one_and_update( ... {"title": "Mr."}, ... {"$inc": {"rank": 3}}, ... projection=["title", "rank"], ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) {'_id': 'a80106f2-...', 'title': 'Mr.', 'rank': 3} >>> my_coll.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) >>> # (returns None for no matches) >>> my_coll.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... 
) {'_id': 'cb4ef2ab-...', 'name': 'Johnny', 'rank': 0} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "returnDocument": return_document, "upsert": upsert, } fo_payload = { "findOneAndUpdate": { k: v for k, v in { "filter": filter, "update": update, "options": options, "sort": sort, "projection": normalize_optional_projection(projection), }.items() if v is not None } } logger.info(f"findOneAndUpdate on '{self.name}'") fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndUpdate on '{self.name}'") if "document" in fo_response.get("data", {}): ret_document = fo_response.get("data", {}).get("document") if ret_document is None: return None else: return ret_document # type: ignore[no-any-return] else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_update API command.", raw_response=fo_response, ) def update_one( self, filter: FilterType, update: dict[str, Any], *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Update a single document on the collection as requested, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value"}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the updated one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the update operation. Example: >>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) 
>>> my_coll.update_one({"Marco": {"$exists": True}}, {"$inc": {"rank": 3}}) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}) >>> my_coll.update_one({"Mirko": {"$exists": True}}, {"$inc": {"rank": 3}}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.update_one({"Mirko": {"$exists": True}}, {"$inc": {"rank": 3}}, upsert=True) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '2a45ff60-...'}) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "upsert": upsert, } uo_payload = { "updateOne": { k: v for k, v in { "filter": filter, "update": update, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"updateOne on '{self.name}'") uo_response = self._converted_request( payload=uo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished updateOne on '{self.name}'") if "status" in uo_response: uo_status = uo_response["status"] _update_info = _prepare_update_info([uo_status]) return CollectionUpdateResult( raw_results=[uo_response], update_info=_update_info, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from updateOne API command.", raw_response=uo_response, ) def update_many( self, filter: FilterType, update: dict[str, Any], *, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Apply an update operation to all documents matching a condition, optionally inserting one documents in absence of matches. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the documents, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. upsert: this parameter controls the behavior in absence of matches. If True, a single new document (resulting from applying `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the update operation. Example: >>> my_coll.insert_many([{"c": "red"}, {"c": "green"}, {"c": "blue"}]) CollectionInsertManyResult(...) 
>>> my_coll.update_many({"c": {"$ne": "green"}}, {"$set": {"nongreen": True}}) CollectionUpdateResult(raw_results=..., update_info={'n': 2, 'updatedExisting': True, 'ok': 1.0, 'nModified': 2}) >>> my_coll.update_many({"c": "orange"}, {"$set": {"is_also_fruit": True}}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.update_many( ... {"c": "orange"}, ... {"$set": {"is_also_fruit": True}}, ... upsert=True, ... ) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '46643050-...'}) Note: Similarly to the case of `find` (see its docstring for more details), running this command while, at the same time, another process is inserting new documents which match the filter of the `update_many` can result in an unpredictable fraction of these documents being updated. In other words, it cannot be easily predicted whether a given newly-inserted document will be picked up by the update_many command or not. """ _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) api_options = { "upsert": upsert, } page_state_options: dict[str, str] = {} um_responses: list[dict[str, Any]] = [] um_statuses: list[dict[str, Any]] = [] must_proceed = True logger.info(f"starting update_many on '{self.name}'") timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) while must_proceed: options = {**api_options, **page_state_options} this_um_payload = { "updateMany": { k: v for k, v in { "filter": filter, "update": update, "options": options, }.items() if v is not None } } logger.info(f"updateMany on '{self.name}'") this_um_response = self._converted_request( payload=this_um_payload, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished updateMany on '{self.name}'") this_um_status = this_um_response.get("status") or {} # # if errors, quit early if this_um_response.get("errors", []): partial_update_info = _prepare_update_info(um_statuses) partial_result = CollectionUpdateResult( raw_results=um_responses, update_info=partial_update_info, ) all_um_responses = um_responses + [this_um_response] raise CollectionUpdateManyException.from_responses( commands=[None for _ in all_um_responses], raw_responses=all_um_responses, partial_result=partial_result, ) else: if "status" not in this_um_response: raise UnexpectedDataAPIResponseException( text="Faulty response from update_many API command.", raw_response=this_um_response, ) um_responses.append(this_um_response) um_statuses.append(this_um_status) next_page_state = this_um_status.get("nextPageState") if next_page_state is not None: must_proceed = True page_state_options = {"pageState": next_page_state} else: must_proceed = False page_state_options = {} update_info = _prepare_update_info(um_statuses) logger.info(f"finished update_many on '{self.name}'") return CollectionUpdateResult( raw_results=um_responses, update_info=update_info, ) def find_one_and_delete( self, filter: FilterType, *, projection: ProjectionType | None = None, 
sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document in the collection and delete it. The deleted document, however, is the return value of the method. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the deleted one. See the `find` method for more on sorting. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: Either the document (or a projection thereof, as requested), or None if no matches were found in the first place. Example: >>> my_coll.insert_many( ... [ ... {"species": "swan", "class": "Aves"}, ... {"species": "frog", "class": "Amphibia"}, ... ], ... ) CollectionInsertManyResult(...) >>> my_coll.find_one_and_delete( ... {"species": {"$ne": "frog"}}, ... projection=["species"], ... 
) {'_id': '5997fb48-...', 'species': 'swan'} >>> my_coll.find_one_and_delete({"species": {"$ne": "frog"}}) >>> # (returns None for no matches) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _projection = normalize_optional_projection(projection) fo_payload = { "findOneAndDelete": { k: v for k, v in { "filter": filter, "sort": sort, "projection": _projection, }.items() if v is not None } } logger.info(f"findOneAndDelete on '{self.name}'") fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndDelete on '{self.name}'") if "document" in fo_response.get("data", {}): document = fo_response["data"]["document"] return document # type: ignore[no-any-return] else: deleted_count = fo_response.get("status", {}).get("deletedCount") if deleted_count == 0: return None else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_delete API command.", raw_response=fo_response, ) def delete_one( self, filter: FilterType, *, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDeleteResult: """ Delete one document matching a provided filter. This method never deletes more than a single document, regardless of the number of matches to the provided filters. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the deleted one. See the `find` method for more on sorting. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionDeleteResult object summarizing the outcome of the delete operation. Example: >>> my_coll.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) CollectionInsertManyResult(...) >>> my_coll.delete_one({"seq": 1}) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> my_coll.distinct("seq") [0, 2] >>> my_coll.delete_one( ... {"seq": {"$exists": True}}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... 
) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> my_coll.distinct("seq") [0] >>> my_coll.delete_one({"seq": 2}) CollectionDeleteResult(raw_results=..., deleted_count=0) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) do_payload = { "deleteOne": { k: v for k, v in { "filter": filter, "sort": sort, }.items() if v is not None } } logger.info(f"deleteOne on '{self.name}'") do_response = self._converted_request( payload=do_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteOne on '{self.name}'") if "deletedCount" in do_response.get("status", {}): deleted_count = do_response["status"]["deletedCount"] return CollectionDeleteResult( deleted_count=deleted_count, raw_results=[do_response], ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from delete_one API command.", raw_response=do_response, ) def delete_many( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDeleteResult: """ Delete all documents matching a provided filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. Passing an empty filter, `{}`, completely erases all contents of the collection. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionDeleteResult object summarizing the outcome of the delete operation. Example: >>> my_coll.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) CollectionInsertManyResult(...) >>> my_coll.delete_many({"seq": {"$lte": 1}}) CollectionDeleteResult(raw_results=..., deleted_count=2) >>> my_coll.distinct("seq") [2] >>> my_coll.delete_many({"seq": {"$lte": 1}}) CollectionDeleteResult(raw_results=..., deleted_count=0) Note: This operation is in general not atomic. Depending on the amount of matching documents, it can keep running (in a blocking way) for a macroscopic time. In that case, new documents that are meanwhile inserted (e.g. from another process/application) will be deleted during the execution of this method call until the collection is devoid of matches. An exception is the `filter={}` case, whereby the operation is atomic. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) dm_responses: list[dict[str, Any]] = [] deleted_count = 0 must_proceed = True timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) this_dm_payload = {"deleteMany": {"filter": filter}} logger.info(f"starting delete_many on '{self.name}'") while must_proceed: logger.info(f"deleteMany on '{self.name}'") this_dm_response = self._converted_request( payload=this_dm_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished deleteMany on '{self.name}'") # if errors, quit early if this_dm_response.get("errors", []): partial_result = CollectionDeleteResult( deleted_count=deleted_count, raw_results=dm_responses, ) all_dm_responses = dm_responses + [this_dm_response] raise CollectionDeleteManyException.from_responses( commands=[None for _ in all_dm_responses], raw_responses=all_dm_responses, partial_result=partial_result, ) else: this_dc = this_dm_response.get("status", {}).get("deletedCount") if this_dc is None: raise UnexpectedDataAPIResponseException( text="Faulty response from delete_many API command.", raw_response=this_dm_response, ) dm_responses.append(this_dm_response) deleted_count += this_dc must_proceed = this_dm_response.get("status", {}).get("moreData", False) logger.info(f"finished delete_many on '{self.name}'") return CollectionDeleteResult( deleted_count=deleted_count, raw_results=dm_responses, ) def drop( self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop the collection, i.e. delete it from the database along with all the documents it contains. Args: collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Example: >>> my_coll.find_one({}) {'_id': '...', 'a': 100} >>> my_coll.drop() >>> my_coll.find_one({}) Traceback (most recent call last): ... ... astrapy.exceptions.DataAPIResponseException: Collection does not exist, collection name: my_collection Note: Use with caution. Note: Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense as the underlying actual collection is no more. It is responsibility of the developer to design a correct flow which avoids using a deceased collection any further. 
""" logger.info(f"dropping collection '{self.name}' (self)") self.database.drop_collection( self.name, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished dropping collection '{self.name}' (self)") def command( self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this collection with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> my_coll.command({"countDocuments": {}}) {'status': {'count': 123}} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _cmd_desc: str if body: _cmd_desc = ",".join(sorted(body.keys())) else: _cmd_desc = "(none)" logger.info(f"command={_cmd_desc} on '{self.name}'") command_result = self._api_commander.request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on '{self.name}'") return command_result
Ancestors
- typing.Generic
Instance variables
var database : Database
-
a Database object, the database this collection belongs to.
Example
>>> my_coll.database.name 'the_application_database'
Expand source code
@property def database(self) -> Database: """ a Database object, the database this collection belongs to. Example: >>> my_coll.database.name 'the_application_database' """ return self._database
var full_name : str
-
The fully-qualified collection name within the database, in the form "keyspace.collection_name".
Example
>>> my_coll.full_name 'default_keyspace.my_v_collection'
Expand source code
@property def full_name(self) -> str: """ The fully-qualified collection name within the database, in the form "keyspace.collection_name". Example: >>> my_coll.full_name 'default_keyspace.my_v_collection' """ return f"{self.keyspace}.{self.name}"
var keyspace : str
-
The keyspace this collection is in.
Example
>>> my_coll.keyspace 'default_keyspace'
Expand source code
@property def keyspace(self) -> str: """ The keyspace this collection is in. Example: >>> my_coll.keyspace 'default_keyspace' """ _keyspace = self.database.keyspace if _keyspace is None: raise ValueError("The collection's DB is set with keyspace=None") return _keyspace
var name : str
-
The name of this collection.
Example
>>> my_coll.name 'my_v_collection'
Expand source code
@property def name(self) -> str: """ The name of this collection. Example: >>> my_coll.name 'my_v_collection' """ return self._name
Methods
def command(self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Send a POST request to the Data API for this collection with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload.
Args
body
- a JSON-serializable dictionary, the payload of the request.
raise_api_errors
- if True, responses with a nonempty 'errors' field result in an astrapy exception being raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a dictionary with the response of the HTTP request.
Example
>>> my_coll.command({"countDocuments": {}}) {'status': {'count': 123}}
Expand source code
def command( self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this collection with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> my_coll.command({"countDocuments": {}}) {'status': {'count': 123}} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _cmd_desc: str if body: _cmd_desc = ",".join(sorted(body.keys())) else: _cmd_desc = "(none)" logger.info(f"command={_cmd_desc} on '{self.name}'") command_result = self._api_commander.request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on '{self.name}'") return command_result
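As a hedged sketch of what command enables, the snippet below issues a raw findOne payload and unpacks the response the way the class's own helpers do; the command name and field layout follow the Data API conventions visible in the source above, and my_coll is an assumed existing Collection:
# Send an arbitrary Data API payload and read the 'data.document' field,
# mirroring how the library's own methods unpack responses.
response = my_coll.command({"findOne": {"filter": {"_id": "rule1"}}})
document = response.get("data", {}).get("document")  # None if nothing matched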
def count_documents(self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> int
-
Count the documents in the collection matching the specified filter.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
upper_bound
- a required ceiling on the result of the count operation. If the actual number of documents exceeds this value, an exception will be raised. Furthermore, if the actual number of documents exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
the exact count of matching documents.
Example
>>> my_coll.insert_many([{"seq": i} for i in range(20)]) CollectionInsertManyResult(...) >>> my_coll.count_documents({}, upper_bound=100) 20 >>> my_coll.count_documents({"seq":{"$gt": 15}}, upper_bound=100) 4 >>> my_coll.count_documents({}, upper_bound=10) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyDocumentsToCountException
Note
Count operations are expensive: for this reason, the best practice is to provide a reasonable upper_bound according to the caller's expectations. Moreover, indiscriminate usage of count operations for sizeable amounts of documents (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the amount of documents it will count, and that an exception will be thrown by this method if this limit is encountered.
Expand source code
def count_documents( self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Count the documents in the collection matching the specified filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. upper_bound: a required ceiling on the result of the count operation. If the actual number of documents exceeds this value, an exception will be raised. Furthermore, if the actual number of documents exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: the exact count of matching documents. Example: >>> my_coll.insert_many([{"seq": i} for i in range(20)]) CollectionInsertManyResult(...) >>> my_coll.count_documents({}, upper_bound=100) 20 >>> my_coll.count_documents({"seq":{"$gt": 15}}, upper_bound=100) 4 >>> my_coll.count_documents({}, upper_bound=10) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyDocumentsToCountException Note: Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller expectations. Moreover, indiscriminate usage of count operations for sizeable amounts of documents (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the amount of documents it will count, and that an exception will be thrown by this method if this limit is encountered. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) cd_payload = {"countDocuments": {"filter": filter}} logger.info(f"countDocuments on '{self.name}'") cd_response = self._converted_request( payload=cd_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished countDocuments on '{self.name}'") if "count" in cd_response.get("status", {}): count: int = cd_response["status"]["count"] if cd_response["status"].get("moreData", False): raise TooManyDocumentsToCountException( text=f"Document count exceeds {count}, the maximum allowed by the server", server_max_count_exceeded=True, ) else: if count > upper_bound: raise TooManyDocumentsToCountException( text="Document count exceeds required upper bound", server_max_count_exceeded=False, ) else: return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from countDocuments API command.", raw_response=cd_response, )
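Because exceeding upper_bound (or the server-side hard limit) raises instead of returning a value, callers typically guard the call. A minimal sketch, where my_coll and the filter contents are assumptions:
from astrapy.exceptions import TooManyDocumentsToCountException

try:
    matching = my_coll.count_documents({"active": True}, upper_bound=1000)
except TooManyDocumentsToCountException:
    # more matches than we were prepared to count exactly
    matching = None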
def delete_many(self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionDeleteResult
-
Delete all documents matching a provided filter.
Args
filter
- a predicate expressed as a dictionary according to the
Data API filter syntax. Examples are:
{}
{"name": "John"}
{"price": {"$lt": 100}}
{"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}
See the Data API documentation for the full set of operators.
Passing an empty filter, {}, completely erases all contents of the collection.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead.
request_timeout_ms
- a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a CollectionDeleteResult object summarizing the outcome of the delete operation.
Example
>>> my_coll.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) CollectionInsertManyResult(...) >>> my_coll.delete_many({"seq": {"$lte": 1}}) CollectionDeleteResult(raw_results=..., deleted_count=2) >>> my_coll.distinct("seq") [2] >>> my_coll.delete_many({"seq": {"$lte": 1}}) CollectionDeleteResult(raw_results=..., deleted_count=0)
Note
This operation is in general not atomic. Depending on the amount of matching documents, it can keep running (in a blocking way) for a macroscopic time. In that case, new documents that are meanwhile inserted (e.g. from another process/application) will be deleted during the execution of this method call until the collection is devoid of matches. An exception is the filter={} case, whereby the operation is atomic.
Expand source code
def delete_many( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDeleteResult: """ Delete all documents matching a provided filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. Passing an empty filter, `{}`, completely erases all contents of the collection. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionDeleteResult object summarizing the outcome of the delete operation. Example: >>> my_coll.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) CollectionInsertManyResult(...) >>> my_coll.delete_many({"seq": {"$lte": 1}}) CollectionDeleteResult(raw_results=..., deleted_count=2) >>> my_coll.distinct("seq") [2] >>> my_coll.delete_many({"seq": {"$lte": 1}}) CollectionDeleteResult(raw_results=..., deleted_count=0) Note: This operation is in general not atomic. Depending on the amount of matching documents, it can keep running (in a blocking way) for a macroscopic time. In that case, new documents that are meanwhile inserted (e.g. from another process/application) will be deleted during the execution of this method call until the collection is devoid of matches. An exception is the `filter={}` case, whereby the operation is atomic. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) dm_responses: list[dict[str, Any]] = [] deleted_count = 0 must_proceed = True timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) this_dm_payload = {"deleteMany": {"filter": filter}} logger.info(f"starting delete_many on '{self.name}'") while must_proceed: logger.info(f"deleteMany on '{self.name}'") this_dm_response = self._converted_request( payload=this_dm_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished deleteMany on '{self.name}'") # if errors, quit early if this_dm_response.get("errors", []): partial_result = CollectionDeleteResult( deleted_count=deleted_count, raw_results=dm_responses, ) all_dm_responses = dm_responses + [this_dm_response] raise CollectionDeleteManyException.from_responses( commands=[None for _ in all_dm_responses], raw_responses=all_dm_responses, partial_result=partial_result, ) else: this_dc = this_dm_response.get("status", {}).get("deletedCount") if this_dc is None: raise UnexpectedDataAPIResponseException( text="Faulty response from delete_many API command.", raw_response=this_dm_response, ) dm_responses.append(this_dm_response) deleted_count += this_dc must_proceed = this_dm_response.get("status", {}).get("moreData", False) logger.info(f"finished delete_many on '{self.name}'") return CollectionDeleteResult( deleted_count=deleted_count, raw_results=dm_responses, )
def delete_one(self, filter: FilterType, *, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionDeleteResult
-
Delete one document matching a provided filter. This method never deletes more than a single document, regardless of the number of matches to the provided filters.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
sort
- with this dictionary parameter one can control the sorting
order of the documents matching the filter, effectively
determining what document will come first and hence be the
deleted one. See the find method for more on sorting.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a CollectionDeleteResult object summarizing the outcome of the delete operation.
Example
>>> my_coll.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) CollectionInsertManyResult(...) >>> my_coll.delete_one({"seq": 1}) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> my_coll.distinct("seq") [0, 2] >>> my_coll.delete_one( ... {"seq": {"$exists": True}}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> my_coll.distinct("seq") [0] >>> my_coll.delete_one({"seq": 2}) CollectionDeleteResult(raw_results=..., deleted_count=0)
Expand source code
def delete_one( self, filter: FilterType, *, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDeleteResult: """ Delete one document matching a provided filter. This method never deletes more than a single document, regardless of the number of matches to the provided filters. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the deleted one. See the `find` method for more on sorting. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionDeleteResult object summarizing the outcome of the delete operation. Example: >>> my_coll.insert_many([{"seq": 1}, {"seq": 0}, {"seq": 2}]) CollectionInsertManyResult(...) >>> my_coll.delete_one({"seq": 1}) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> my_coll.distinct("seq") [0, 2] >>> my_coll.delete_one( ... {"seq": {"$exists": True}}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) CollectionDeleteResult(raw_results=..., deleted_count=1) >>> my_coll.distinct("seq") [0] >>> my_coll.delete_one({"seq": 2}) CollectionDeleteResult(raw_results=..., deleted_count=0) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) do_payload = { "deleteOne": { k: v for k, v in { "filter": filter, "sort": sort, }.items() if v is not None } } logger.info(f"deleteOne on '{self.name}'") do_response = self._converted_request( payload=do_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteOne on '{self.name}'") if "deletedCount" in do_response.get("status", {}): deleted_count = do_response["status"]["deletedCount"] return CollectionDeleteResult( deleted_count=deleted_count, raw_results=[do_response], ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from delete_one API command.", raw_response=do_response, )
def distinct(self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[typing.Any]
-
Return a list of the unique values of key across the documents in the collection that match the provided filter.
Args
key
- the name of the field whose value is inspected across documents.
Keys can use dot-notation to descend to deeper document levels.
Example of acceptable key values: "field" "field.subfield" "field.3" "field.3.subfield" If lists are encountered and no numeric index is specified, all items in the list are visited.
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole
requested operation (which may involve multiple API requests).
This method, being based on find (see), may entail successive HTTP API requests, depending on the amount of involved documents.
request_timeout_ms
- a timeout, in milliseconds, for each API request.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a list of all different values for key found across the documents that match the filter. The result list has no repeated items.
Example
>>> my_coll.insert_many( ... [ ... {"name": "Marco", "food": ["apple", "orange"], "city": "Helsinki"}, ... {"name": "Emma", "food": {"likes_fruit": True, "allergies": []}}, ... ] ... ) CollectionInsertManyResult(raw_results=..., inserted_ids=['c5b99f37-...', 'd6416321-...']) >>> my_coll.distinct("name") ['Marco', 'Emma'] >>> my_coll.distinct("city") ['Helsinki'] >>> my_coll.distinct("food") ['apple', 'orange', {'likes_fruit': True, 'allergies': []}] >>> my_coll.distinct("food.1") ['orange'] >>> my_coll.distinct("food.allergies") [] >>> my_coll.distinct("food.likes_fruit") [True]
Note
It must be kept in mind that distinct is a client-side operation, which effectively browses all required documents using the logic of the find method and collects the unique values found for key. As such, there may be performance, latency and ultimately billing implications if the amount of matching documents is large.
Note
For details on the behaviour of "distinct" in conjunction with real-time changes in the collection contents, see the Note of the find command.
Expand source code
def distinct( self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[Any]: """ Return a list of the unique values of `key` across the documents in the collection that match the provided filter. Args: key: the name of the field whose value is inspected across documents. Keys can use dot-notation to descend to deeper document levels. Example of acceptable `key` values: "field" "field.subfield" "field.3" "field.3.subfield" If lists are encountered and no numeric index is specified, all items in the list are visited. filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method, being based on `find` (see) may entail successive HTTP API requests, depending on the amount of involved documents. request_timeout_ms: a timeout, in milliseconds, for each API request. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a list of all different values for `key` found across the documents that match the filter. The result list has no repeated items. Example: >>> my_coll.insert_many( ... [ ... {"name": "Marco", "food": ["apple", "orange"], "city": "Helsinki"}, ... {"name": "Emma", "food": {"likes_fruit": True, "allergies": []}}, ... ] ... ) CollectionInsertManyResult(raw_results=..., inserted_ids=['c5b99f37-...', 'd6416321-...']) >>> my_coll.distinct("name") ['Marco', 'Emma'] >>> my_coll.distinct("city") ['Helsinki'] >>> my_coll.distinct("food") ['apple', 'orange', {'likes_fruit': True, 'allergies': []}] >>> my_coll.distinct("food.1") ['orange'] >>> my_coll.distinct("food.allergies") [] >>> my_coll.distinct("food.likes_fruit") [True] Note: It must be kept in mind that `distinct` is a client-side operation, which effectively browses all required documents using the logic of the `find` method and collects the unique values found for `key`. As such, there may be performance, latency and ultimately billing implications if the amount of matching documents is large. Note: For details on the behaviour of "distinct" in conjunction with real-time changes in the collection contents, see the Note of the `find` command. """ # lazy-import here to avoid circular import issues from astrapy.cursors import CollectionFindCursor _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) # preparing cursor: _extractor = _create_document_key_extractor(key) _key = _reduce_distinct_key_to_safe(key) if _key == "": raise ValueError( "The 'key' parameter for distinct cannot be empty " "or start with a list index." 
) # relaxing the type hint (limited to within this method body) f_cursor: CollectionFindCursor[dict[str, Any], dict[str, Any]] = ( CollectionFindCursor( collection=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=_general_method_timeout_ms, request_timeout_label=_rt_label, overall_timeout_label=_gmt_label, ) # type: ignore[assignment] .filter(filter) .project({_key: True}) ) # consuming it: _item_hashes = set() distinct_items: list[Any] = [] logger.info(f"running distinct() on '{self.name}'") for document in f_cursor: for item in _extractor(document): _item_hash = _hash_document( item, options=self.api_options.serdes_options ) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) logger.info(f"finished running distinct() on '{self.name}'") return distinct_items
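Since distinct is client-side, its cost profile is essentially that of a projected find plus in-memory deduplication. A rough equivalent for a simple, hashable, top-level key (the real method is more general: it handles dot-notation keys and unhashable values via document hashing; my_coll and the key "city" are assumptions):
# Approximate client-side equivalent of my_coll.distinct("city").
seen = set()
uniques = []
for doc in my_coll.find({}, projection={"city": True}):
    value = doc.get("city")
    if value is not None and value not in seen:
        seen.add(value)
        uniques.append(value)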
def drop(self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop the collection, i.e. delete it from the database along with all the documents it contains.
Args
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `collection_admin_timeout_ms`.
timeout_ms
- an alias for `collection_admin_timeout_ms`.
Example
>>> my_coll.find_one({})
{'_id': '...', 'a': 100}
>>> my_coll.drop()
>>> my_coll.find_one({})
Traceback (most recent call last):
  ...
astrapy.exceptions.DataAPIResponseException: Collection does not exist, collection name: my_collection
Note
Use with caution.
Note
Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense, as the underlying collection no longer exists. It is the responsibility of the developer to design a correct flow which avoids using the dropped collection any further.
Expand source code
def drop(
    self,
    *,
    collection_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Drop the collection, i.e. delete it from the database along with
    all the documents it contains.

    Args:
        collection_admin_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request,
            hence all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `collection_admin_timeout_ms`.
        timeout_ms: an alias for `collection_admin_timeout_ms`.

    Example:
        >>> my_coll.find_one({})
        {'_id': '...', 'a': 100}
        >>> my_coll.drop()
        >>> my_coll.find_one({})
        Traceback (most recent call last):
          ...
        astrapy.exceptions.DataAPIResponseException: Collection does not exist, collection name: my_collection

    Note:
        Use with caution.

    Note:
        Once the method succeeds, methods on this object can still be
        invoked: however, this hardly makes sense, as the underlying
        collection no longer exists. It is the responsibility of the
        developer to design a correct flow which avoids using the
        dropped collection any further.
    """

    logger.info(f"dropping collection '{self.name}' (self)")
    self.database.drop_collection(
        self.name,
        collection_admin_timeout_ms=collection_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    logger.info(f"finished dropping collection '{self.name}' (self)")
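Since a dropped collection's handle remains callable but useless, one way to keep flows correct is to re-obtain a fresh object whenever the collection is re-created. A sketch of such a drop-and-recreate step (`reset_collection` is a hypothetical helper, and `my_coll.database` is the parent Database object):

>>> def reset_collection(db, name):
...     db.get_collection(name).drop()   # assumes the collection exists
...     return db.create_collection(name)
...
>>> my_coll = reset_collection(my_coll.database, "my_collection")

Rebinding the name (`my_coll = ...`) makes it impossible to accidentally keep using the dropped handle.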
def estimated_document_count(self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) -> int
-
Query the API server for an estimate of the document count in the collection.
Contrary to `count_documents`, this method has no filtering parameters.

Args

general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a server-provided estimate count of the documents in the collection.
Example
>>> my_coll.estimated_document_count()
35700
Expand source code
def estimated_document_count(
    self,
    *,
    general_method_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> int:
    """
    Query the API server for an estimate of the document count in the collection.

    Contrary to `count_documents`, this method has no filtering parameters.

    Args:
        general_method_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request,
            hence all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `general_method_timeout_ms`.
        timeout_ms: an alias for `general_method_timeout_ms`.

    Returns:
        a server-provided estimate count of the documents in the collection.

    Example:
        >>> my_coll.estimated_document_count()
        35700
    """

    _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm(
        timeout_options=self.api_options.timeout_options,
        general_method_timeout_ms=general_method_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    ed_payload: dict[str, Any] = {"estimatedDocumentCount": {}}
    logger.info(f"estimatedDocumentCount on '{self.name}'")
    ed_response = self._converted_request(
        payload=ed_payload,
        timeout_context=_TimeoutContext(
            request_ms=_request_timeout_ms, label=_rt_label
        ),
    )
    logger.info(f"finished estimatedDocumentCount on '{self.name}'")
    if "count" in ed_response.get("status", {}):
        count: int = ed_response["status"]["count"]
        return count
    else:
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from estimatedDocumentCount API command.",
            raw_response=ed_response,
        )
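When an exact figure is not needed, the server-side estimate avoids both the document scan and the `upper_bound` bookkeeping that `count_documents` requires. A sketch combining the two (assuming `my_coll` as in the examples above): use the cheap estimate first, and fall back to an exact count only for small collections:

>>> n_docs = my_coll.estimated_document_count()
>>> if n_docs < 1000:
...     n_docs = my_coll.count_documents({}, upper_bound=1000)

The estimate may lag behind recent writes, so treat it as indicative rather than exact.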
def find(self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: type[DOC2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) -> CollectionFindCursor[DOC, DOC2]
-
Find documents on the collection, matching a certain provided filter.
The method returns a Cursor that can then be iterated over. Depending on the method call pattern, the iteration over all documents may or may not reflect collection mutations that occurred since the `find` method was called. In cases where the cursor reflects mutations in real time, it will iterate over the documents in an approximate way (i.e. exhibiting occasional skipped or duplicate documents). This happens when making use of the `sort` option in a non-vector-search manner.

Args

filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
document_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting cursor is implicitly a `CollectionFindCursor[DOC, DOC]`, i.e. it maintains the same type for the items it returns as that of the documents in the collection. Strictly typed code may want to specify this parameter, especially when a projection is given.
skip
- with this integer parameter, what would be the first `skip` documents returned by the query are discarded, and the results start from the (skip+1)-th document. This parameter can be used only in conjunction with an explicit `sort` criterion of the ascending/descending type (i.e. it cannot be used when not sorting, nor with vector-based ANN search).
limit
- this (integer) parameter sets a limit over how many documents are returned. Once `limit` is reached (or the cursor is exhausted for lack of matching documents), nothing more is returned.
include_similarity
- a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned document. Can only be used for vector ANN search, i.e. when the `sort` parameter has the shape {"$vector": …}.
include_sort_vector
- a boolean to request the search query vector. If set to True (and if the invocation is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search.
sort
- with this dictionary parameter one can control the order in which the documents are returned. See the Note about sorting, as well as the one about upper bounds, for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
request_timeout_ms
- a timeout, in milliseconds, for each single one of the underlying HTTP requests used to fetch documents as the cursor is iterated over. If not passed, the collection-level setting is used instead.
timeout_ms
- an alias for `request_timeout_ms`.

Returns

a Cursor object representing iterations over the matching documents (see the Cursor object for how to use it; the simplest thing is to run a for loop: `for document in collection.find(...):`).
Examples
>>> filter = {"seq": {"$exists": True}} >>> for doc in my_coll.find(filter, projection={"seq": True}, limit=5): ... print(doc["seq"]) ... 37 35 10 36 27 >>> cursor1 = my_coll.find( ... {}, ... limit=4, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) >>> [doc["_id"] for doc in cursor1] ['97e85f81-...', '1581efe4-...', '...', '...'] >>> cursor2 = my_coll.find({}, limit=3) >>> cursor2.distinct("seq") [37, 35, 10]
>>> my_coll.insert_many([
...     {"tag": "A", "$vector": [4, 5]},
...     {"tag": "B", "$vector": [3, 4]},
...     {"tag": "C", "$vector": [3, 2]},
...     {"tag": "D", "$vector": [4, 1]},
...     {"tag": "E", "$vector": [2, 5]},
... ])
>>> ann_tags = [
...     document["tag"]
...     for document in my_coll.find(
...         {},
...         sort={"$vector": [3, 3]},
...         limit=3,
...     )
... ]
>>> ann_tags
['A', 'B', 'C']
>>> # (assuming the collection has metric VectorMetric.COSINE)
>>> cursor = my_coll.find(
...     sort={"$vector": [3, 3]},
...     limit=3,
...     include_sort_vector=True,
... )
>>> cursor.get_sort_vector()
[3.0, 3.0]
>>> matches = list(cursor)
>>> cursor.get_sort_vector()
[3.0, 3.0]
Note

The following are example values for the `sort` parameter. When no particular order is required:
    sort={}  # (default when parameter not provided)
When sorting by a certain value in ascending/descending order:
    sort={"field": SortMode.ASCENDING}
    sort={"field": SortMode.DESCENDING}
When sorting first by "field" and then by "subfield" (while modern Python versions preserve the order of dictionaries, it is suggested for clarity to employ a `collections.OrderedDict` in these cases):
    sort={
        "field": SortMode.ASCENDING,
        "subfield": SortMode.ASCENDING,
    }
When running a vector similarity (ANN) search:
    sort={"$vector": [0.4, 0.15, -0.5]}

Note

Some combinations of arguments impose an implicit upper bound on the number of documents that are returned by the Data API. More specifically: (a) vector ANN searches cannot return more than a number of documents that at the time of writing is set to 1000 items; (b) when using a sort criterion of the ascending/descending type, the Data API will return a smaller number of documents, set to 20 at the time of writing, and stop there. The returned documents are the top results across the whole collection according to the requested criterion. These provisions should be kept in mind even when subsequently running a command such as `.distinct()` on a cursor.

Note

When not specifying sorting criteria at all (by vector or otherwise), the cursor can scroll through an arbitrary number of documents as the Data API and the client periodically exchange new chunks of documents. It should be noted that the behavior of the cursor in the case documents have been added/removed after the `find` was started depends on database internals, and it is not guaranteed, nor excluded, that such "real-time" changes in the data would be picked up by the cursor.

Expand source code
def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, document_type: type[DOC2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionFindCursor[DOC, DOC2]: """ Find documents on the collection, matching a certain provided filter. The method returns a Cursor that can then be iterated over. Depending on the method call pattern, the iteration over all documents can reflect collection mutations occurred since the `find` method was called, or not. In cases where the cursor reflects mutations in real-time, it will iterate over cursors in an approximate way (i.e. exhibiting occasional skipped or duplicate documents). This happens when making use of the `sort` option in a non-vector-search manner. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting cursor is implicitly a `CollectionFindCursor[DOC, DOC]`, i.e. maintains the same type for the items it returns as that for the documents in the table. Strictly typed code may want to specify this parameter especially when a projection is given. skip: with this integer parameter, what would be the first `skip` documents returned by the query are discarded, and the results start from the (skip+1)-th document. This parameter can be used only in conjunction with an explicit `sort` criterion of the ascending/descending type (i.e. it cannot be used when not sorting, nor with vector-based ANN search). limit: this (integer) parameter sets a limit over how many documents are returned. Once `limit` is reached (or the cursor is exhausted for lack of matching documents), nothing more is returned. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": ...}. include_sort_vector: a boolean to request the search query vector. 
If set to True (and if the invocation is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search. sort: with this dictionary parameter one can control the order the documents are returned. See the Note about sorting, as well as the one about upper bounds, for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. request_timeout_ms: a timeout, in milliseconds, for each single one of the underlying HTTP requests used to fetch documents as the cursor is iterated over. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `request_timeout_ms`. Returns: a Cursor object representing iterations over the matching documents (see the Cursor object for how to use it. The simplest thing is to run a for loop: `for document in collection.sort(...):`). Examples: >>> filter = {"seq": {"$exists": True}} >>> for doc in my_coll.find(filter, projection={"seq": True}, limit=5): ... print(doc["seq"]) ... 37 35 10 36 27 >>> cursor1 = my_coll.find( ... {}, ... limit=4, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) >>> [doc["_id"] for doc in cursor1] ['97e85f81-...', '1581efe4-...', '...', '...'] >>> cursor2 = my_coll.find({}, limit=3) >>> cursor2.distinct("seq") [37, 35, 10] >>> my_coll.insert_many([ ... {"tag": "A", "$vector": [4, 5]}, ... {"tag": "B", "$vector": [3, 4]}, ... {"tag": "C", "$vector": [3, 2]}, ... {"tag": "D", "$vector": [4, 1]}, ... {"tag": "E", "$vector": [2, 5]}, ... ]) >>> ann_tags = [ ... document["tag"] ... for document in my_coll.find( ... {}, ... sort={"$vector": [3, 3]}, ... limit=3, ... ) ... ] >>> ann_tags ['A', 'B', 'C'] >>> # (assuming the collection has metric VectorMetric.COSINE) >>> cursor = my_coll.find( ... sort={"$vector": [3, 3]}, ... limit=3, ... include_sort_vector=True, ... ) >>> cursor.get_sort_vector() [3.0, 3.0] >>> matches = list(cursor) >>> cursor.get_sort_vector() [3.0, 3.0] Note: The following are example values for the `sort` parameter. When no particular order is required: sort={} # (default when parameter not provided) When sorting by a certain value in ascending/descending order: sort={"field": SortMode.ASCENDING} sort={"field": SortMode.DESCENDING} When sorting first by "field" and then by "subfield" (while modern Python versions preserve the order of dictionaries, it is suggested for clarity to employ a `collections.OrderedDict` in these cases): sort={ "field": SortMode.ASCENDING, "subfield": SortMode.ASCENDING, } When running a vector similarity (ANN) search: sort={"$vector": [0.4, 0.15, -0.5]} Note: Some combinations of arguments impose an implicit upper bound on the number of documents that are returned by the Data API. More specifically: (a) Vector ANN searches cannot return more than a number of documents that at the time of writing is set to 1000 items. (b) When using a sort criterion of the ascending/descending type, the Data API will return a smaller number of documents, set to 20 at the time of writing, and stop there. The returned documents are the top results across the whole collection according to the requested criterion. These provisions should be kept in mind even when subsequently running a command such as `.distinct()` on a cursor. Note: When not specifying sorting criteria at all (by vector or otherwise), the cursor can scroll through an arbitrary number of documents as the Data API and the client periodically exchange new chunks of documents. 
It should be noted that the behavior of the cursor in the case documents have been added/removed after the `find` was started depends on database internals and it is not guaranteed, nor excluded, that such "real-time" changes in the data would be picked up by the cursor. """ # lazy-import here to avoid circular import issues from astrapy.cursors import CollectionFindCursor _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) return ( CollectionFindCursor( collection=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=None, request_timeout_label=_rt_label, ) .filter(filter) .project(projection) .skip(skip) .limit(limit) .sort(sort) .include_similarity(include_similarity) .include_sort_vector(include_sort_vector) )
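Because the cursor fetches documents in successive chunks as it is consumed, a `find` over a large result set can be streamed without loading everything at once. A minimal sketch (assuming `my_coll` as in the examples above; the "year" and "title" fields are illustrative):

>>> title_cursor = my_coll.find(
...     {"year": {"$gte": 2020}},
...     projection={"title": True},
... )
>>> titles = [doc["title"] for doc in title_cursor]

The projection keeps each transferred document small, which matters when the match count is large.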
def find_one(self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) -> Optional[~DOC]
-
Run a search, returning the first document in the collection that matches provided filters, if any is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
include_similarity
- a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned document. Can only be used for vector ANN search, i.e. when the `sort` parameter has the shape {"$vector": …}.
sort
- with this dictionary parameter one can control the order in which the documents are returned. See the Note about sorting for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a dictionary expressing the required document, otherwise None.
Examples
>>> my_coll.find_one({})
{'_id': '68d1e515-...', 'seq': 37}
>>> my_coll.find_one({"seq": 10})
{'_id': 'd560e217-...', 'seq': 10}
>>> my_coll.find_one({"seq": 1011})
>>> # (returns None for no matches)
>>> my_coll.find_one({}, projection={"seq": False})
{'_id': '68d1e515-...'}
>>> my_coll.find_one(
...     {},
...     sort={"seq": astrapy.constants.SortMode.DESCENDING},
... )
{'_id': '97e85f81-...', 'seq': 69}
>>> my_coll.find_one({}, sort={"$vector": [1, 0]}, projection={"*": True})
{'_id': '...', 'tag': 'D', '$vector': [4.0, 1.0]}
Note
See the `find` method for more details on the accepted parameters (whereas `skip` and `limit` are not valid parameters for `find_one`).

Expand source code
def find_one( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Run a search, returning the first document in the collection that matches provided filters, if any is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned document. Can only be used for vector ANN search, i.e. when either `vector` is supplied or the `sort` parameter has the shape {"$vector": ...}. sort: with this dictionary parameter one can control the order the documents are returned. See the Note about sorting for details. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary expressing the required document, otherwise None. Examples: >>> my_coll.find_one({}) {'_id': '68d1e515-...', 'seq': 37} >>> my_coll.find_one({"seq": 10}) {'_id': 'd560e217-...', 'seq': 10} >>> my_coll.find_one({"seq": 1011}) >>> # (returns None for no matches) >>> my_coll.find_one({}, projection={"seq": False}) {'_id': '68d1e515-...'} >>> my_coll.find_one( ... {}, ... sort={"seq": astrapy.constants.SortMode.DESCENDING}, ... ) {'_id': '97e85f81-...', 'seq': 69} >>> my_coll.find_one({}, sort={"$vector": [1, 0]}, projection={"*": True}) {'_id': '...', 'tag': 'D', '$vector': [4.0, 1.0]} Note: See the `find` method for more details on the accepted parameters (whereas `skip` and `limit` are not valid parameters for `find_one`). 
""" _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) fo_options = ( None if include_similarity is None else {"includeSimilarity": include_similarity} ) fo_payload = { "findOne": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "options": fo_options, "sort": sort, }.items() if v is not None } } fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) if "document" not in (fo_response.get("data") or {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findOne API command.", raw_response=fo_response, ) doc_response = fo_response["data"]["document"] if doc_response is None: return None return fo_response["data"]["document"] # type: ignore[no-any-return]
def find_one_and_delete(self, filter: FilterType, *, projection: ProjectionType | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) -> Optional[~DOC]
-
Find a document in the collection and delete it. The deleted document, however, is the return value of the method.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the deleted one. See the `find` method for more on sorting.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
Either the document (or a projection thereof, as requested), or None if no matches were found in the first place.
Example
>>> my_coll.insert_many(
...     [
...         {"species": "swan", "class": "Aves"},
...         {"species": "frog", "class": "Amphibia"},
...     ],
... )
CollectionInsertManyResult(...)
>>> my_coll.find_one_and_delete(
...     {"species": {"$ne": "frog"}},
...     projection=["species"],
... )
{'_id': '5997fb48-...', 'species': 'swan'}
>>> my_coll.find_one_and_delete({"species": {"$ne": "frog"}})
>>> # (returns None for no matches)
Expand source code
def find_one_and_delete( self, filter: FilterType, *, projection: ProjectionType | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document in the collection and delete it. The deleted document, however, is the return value of the method. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the deleted one. See the `find` method for more on sorting. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: Either the document (or a projection thereof, as requested), or None if no matches were found in the first place. Example: >>> my_coll.insert_many( ... [ ... {"species": "swan", "class": "Aves"}, ... {"species": "frog", "class": "Amphibia"}, ... ], ... ) CollectionInsertManyResult(...) >>> my_coll.find_one_and_delete( ... {"species": {"$ne": "frog"}}, ... projection=["species"], ... 
) {'_id': '5997fb48-...', 'species': 'swan'} >>> my_coll.find_one_and_delete({"species": {"$ne": "frog"}}) >>> # (returns None for no matches) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _projection = normalize_optional_projection(projection) fo_payload = { "findOneAndDelete": { k: v for k, v in { "filter": filter, "sort": sort, "projection": _projection, }.items() if v is not None } } logger.info(f"findOneAndDelete on '{self.name}'") fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndDelete on '{self.name}'") if "document" in fo_response.get("data", {}): document = fo_response["data"]["document"] return document # type: ignore[no-any-return] else: deleted_count = fo_response.get("status", {}).get("deletedCount") if deleted_count == 0: return None else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_delete API command.", raw_response=fo_response, )
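Since the match and the deletion happen in a single API command, this method can act as a simple single-consumer queue pop. A sketch (assuming `my_coll` as in the examples above; "state", "created_at" and the `handle_task` function are illustrative):

>>> task = my_coll.find_one_and_delete(
...     {"state": "pending"},
...     sort={"created_at": astrapy.constants.SortMode.ASCENDING},
... )
>>> if task is not None:
...     handle_task(task)   # hypothetical processing function

A None result simply means the queue was empty at the time of the call.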
def find_one_and_replace(self, filter: FilterType, replacement: DOC, *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = 'before', general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) -> Optional[~DOC]
-
Find a document on the collection and replace it entirely with a new one, optionally inserting a new one if no match is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
replacement
- the new document to write into the collection.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
upsert
- this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
return_document
- a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on the database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before".
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
A document (or a projection thereof, as required), either the one before the replace operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no replacement was inserted (depending on the `return_document` parameter).

Example
>>> my_coll.insert_one({"_id": "rule1", "text": "all animals are equal"}) CollectionInsertOneResult(...) >>> my_coll.find_one_and_replace( ... {"_id": "rule1"}, ... {"text": "some animals are more equal!"}, ... ) {'_id': 'rule1', 'text': 'all animals are equal'} >>> my_coll.find_one_and_replace( ... {"text": "some animals are more equal!"}, ... {"text": "and the pigs are the rulers"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) {'_id': 'rule1', 'text': 'and the pigs are the rulers'} >>> my_coll.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma^2"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) >>> # (returns None for no matches) >>> my_coll.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma"}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... projection={"_id": False}, ... ) {'text': 'F=ma'}
Expand source code
def find_one_and_replace( self, filter: FilterType, replacement: DOC, *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = ReturnDocument.BEFORE, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document on the collection and replace it entirely with a new one, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. replacement: the new document to write into the collection. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. return_document: a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before". general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: A document (or a projection thereof, as required), either the one before the replace operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no replacement was inserted (depending on the `return_document` parameter). Example: >>> my_coll.insert_one({"_id": "rule1", "text": "all animals are equal"}) CollectionInsertOneResult(...) >>> my_coll.find_one_and_replace( ... {"_id": "rule1"}, ... {"text": "some animals are more equal!"}, ... 
) {'_id': 'rule1', 'text': 'all animals are equal'} >>> my_coll.find_one_and_replace( ... {"text": "some animals are more equal!"}, ... {"text": "and the pigs are the rulers"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) {'_id': 'rule1', 'text': 'and the pigs are the rulers'} >>> my_coll.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma^2"}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) >>> # (returns None for no matches) >>> my_coll.find_one_and_replace( ... {"_id": "rule2"}, ... {"text": "F=ma"}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... projection={"_id": False}, ... ) {'text': 'F=ma'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "returnDocument": return_document, "upsert": upsert, } fo_payload = { "findOneAndReplace": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "replacement": replacement, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"findOneAndReplace on '{self.name}'") fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndReplace on '{self.name}'") if "document" in fo_response.get("data", {}): ret_document = fo_response.get("data", {}).get("document") if ret_document is None: return None else: return ret_document # type: ignore[no-any-return] else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_replace API command.", raw_response=fo_response, )
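With `upsert=True` and `return_document` set to AFTER, the method behaves like an idempotent "put by filter": the document ends up in the requested state whether or not it existed before. A sketch (assuming `my_coll` as in the examples above; the "settings" document is illustrative):

>>> saved = my_coll.find_one_and_replace(
...     {"_id": "settings"},
...     {"_id": "settings", "theme": "dark"},
...     upsert=True,
...     return_document=astrapy.constants.ReturnDocument.AFTER,
... )

The returned `saved` is the post-replace document, so callers can use it directly without a follow-up `find_one`.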
def find_one_and_update(self, filter: FilterType, update: dict[str, Any], *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = 'before', general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) -> Optional[~DOC]
-
Find a document on the collection and update it as requested, optionally inserting a new one if no match is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators.
update
- the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value"}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax.
projection
- it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the updated one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
upsert
- this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
return_document
- a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on the database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before".
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
A document (or a projection thereof, as required), either the one before the update operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no update was applied (depending on the `return_document` parameter).

Example
>>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) >>> my_coll.find_one_and_update( ... {"Marco": {"$exists": True}}, ... {"$set": {"title": "Mr."}}, ... ) {'_id': 'a80106f2-...', 'Marco': 'Polo'} >>> my_coll.find_one_and_update( ... {"title": "Mr."}, ... {"$inc": {"rank": 3}}, ... projection=["title", "rank"], ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) {'_id': 'a80106f2-...', 'title': 'Mr.', 'rank': 3} >>> my_coll.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) >>> # (returns None for no matches) >>> my_coll.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) {'_id': 'cb4ef2ab-...', 'name': 'Johnny', 'rank': 0}
Expand source code
def find_one_and_update( self, filter: FilterType, update: dict[str, Any], *, projection: ProjectionType | None = None, sort: SortType | None = None, upsert: bool = False, return_document: str = ReturnDocument.BEFORE, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> DOC | None: """ Find a document on the collection and update it as requested, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. projection: it controls which parts of the document are returned. It can be an allow-list: `{"f1": True, "f2": True}`, or a deny-list: `{"fx": False, "fy": False}`, but not a mixture (except for the `_id` and other special fields, which can be associated to both True or False independently of the rest of the specification). The special star-projections `{"*": True}` and `{"*": False}` have the effect of returning the whole document and `{}` respectively. For lists in documents, slice directives can be passed to select portions of the list: for instance, `{"array": {"$slice": 2}}`, `{"array": {"$slice": -2}}`, `{"array": {"$slice": [4, 2]}}` or `{"array": {"$slice": [-4, 2]}}`. An iterable over strings will be treated implicitly as an allow-list. The default projection (used if this parameter is not passed) does not necessarily include "special" fields such as `$vector` or `$vectorize`. See the Data API documentation for more on projections. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. return_document: a flag controlling what document is returned: if set to `ReturnDocument.BEFORE`, or the string "before", the document found on database is returned; if set to `ReturnDocument.AFTER`, or the string "after", the new document is returned. The default is "before". general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: A document (or a projection thereof, as required), either the one before the replace operation or the one after that. Alternatively, the method returns None to represent that no matching document was found, or that no update was applied (depending on the `return_document` parameter). 
Example: >>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) >>> my_coll.find_one_and_update( ... {"Marco": {"$exists": True}}, ... {"$set": {"title": "Mr."}}, ... ) {'_id': 'a80106f2-...', 'Marco': 'Polo'} >>> my_coll.find_one_and_update( ... {"title": "Mr."}, ... {"$inc": {"rank": 3}}, ... projection=["title", "rank"], ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) {'_id': 'a80106f2-...', 'title': 'Mr.', 'rank': 3} >>> my_coll.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) >>> # (returns None for no matches) >>> my_coll.find_one_and_update( ... {"name": "Johnny"}, ... {"$set": {"rank": 0}}, ... upsert=True, ... return_document=astrapy.constants.ReturnDocument.AFTER, ... ) {'_id': 'cb4ef2ab-...', 'name': 'Johnny', 'rank': 0} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "returnDocument": return_document, "upsert": upsert, } fo_payload = { "findOneAndUpdate": { k: v for k, v in { "filter": filter, "update": update, "options": options, "sort": sort, "projection": normalize_optional_projection(projection), }.items() if v is not None } } logger.info(f"findOneAndUpdate on '{self.name}'") fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndUpdate on '{self.name}'") if "document" in fo_response.get("data", {}): ret_document = fo_response.get("data", {}).get("document") if ret_document is None: return None else: return ret_document # type: ignore[no-any-return] else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_update API command.", raw_response=fo_response, )
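Combining `$inc`, `upsert=True` and `ReturnDocument.AFTER` yields a server-side counter in a single round trip: the first call creates the document, later calls increment it atomically. A sketch (assuming `my_coll` as in the examples above; "page_views" is an illustrative field):

>>> counter_doc = my_coll.find_one_and_update(
...     {"_id": "homepage"},
...     {"$inc": {"page_views": 1}},
...     upsert=True,
...     return_document=astrapy.constants.ReturnDocument.AFTER,
... )
>>> counter_doc["page_views"]   # the new, post-increment value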
def info(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) -> CollectionInfo
-
Information on the collection (name, location, database), in the form of a CollectionInfo object.
Not to be confused with the collection `options` method (related to the collection internal configuration).

Args

database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `database_admin_timeout_ms`.
timeout_ms
- an alias for `database_admin_timeout_ms`.
Example
>>> my_coll.info().database_info.region
'eu-west-1'
>>> my_coll.info().full_name
'default_keyspace.my_v_collection'
Note
the returned CollectionInfo wraps, among other things, the database information: as such, calling this method triggers the same-named method of a Database object (which, in turn, performs a HTTP request to the DevOps API). See the documentation for `Database.info()` for more details.

Expand source code
def info(
    self,
    *,
    database_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> CollectionInfo:
    """
    Information on the collection (name, location, database), in the
    form of a CollectionInfo object.

    Not to be confused with the collection `options` method
    (related to the collection internal configuration).

    Args:
        database_admin_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying DevOps API request. If not provided, this
            object's defaults apply. (This method issues a single API
            request, hence all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `database_admin_timeout_ms`.
        timeout_ms: an alias for `database_admin_timeout_ms`.

    Example:
        >>> my_coll.info().database_info.region
        'eu-west-1'
        >>> my_coll.info().full_name
        'default_keyspace.my_v_collection'

    Note:
        the returned CollectionInfo wraps, among other things, the database
        information: as such, calling this method triggers the same-named
        method of a Database object (which, in turn, performs a HTTP request
        to the DevOps API). See the documentation for `Database.info()`
        for more details.
    """

    return CollectionInfo(
        database_info=self.database.info(
            database_admin_timeout_ms=database_admin_timeout_ms,
            request_timeout_ms=request_timeout_ms,
            timeout_ms=timeout_ms,
        ),
        keyspace=self.keyspace,
        name=self.name,
        full_name=self.full_name,
    )
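Since every `info()` invocation triggers a DevOps API request, code needing several attributes should call it once and keep the resulting CollectionInfo around. A sketch (assuming `my_coll` as in the examples above):

>>> coll_info = my_coll.info()
>>> region = coll_info.database_info.region
>>> label = coll_info.full_name + " @ " + region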
def insert_many(self, documents: Iterable[DOC], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None) -> CollectionInsertManyResult
-
Insert a list of documents into the collection. This is not an atomic operation.
Args
documents
- an iterable of dictionaries, each a document to insert. Documents may specify their `_id` field or leave it out, in which case it will be added automatically.
ordered
- if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster.
chunk_size
- how many documents to include in a single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default.
concurrency
- maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). If not passed, the collection-level setting is used instead.
request_timeout_ms
- a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a CollectionInsertManyResult object.
Examples
>>> my_coll.count_documents({}, upper_bound=10)
0
>>> my_coll.insert_many(
...     [{"a": 10}, {"a": 5}, {"b": [True, False, False]}],
...     ordered=True,
... )
CollectionInsertManyResult(raw_results=..., inserted_ids=['184bb06f-...', '...', '...'])
>>> my_coll.count_documents({}, upper_bound=100)
3
>>> my_coll.insert_many(
...     [{"seq": i} for i in range(50)],
...     concurrency=5,
... )
CollectionInsertManyResult(raw_results=..., inserted_ids=[... ...])
>>> my_coll.count_documents({}, upper_bound=100)
53
>>> my_coll.insert_many(
...     [
...         {"tag": "a", "$vector": [1, 2]},
...         {"tag": "b", "$vector": [3, 4]},
...     ]
... )
CollectionInsertManyResult(...)
Note
Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the document sequence is important.
Note
A failure mode for this command is related to certain faulty documents found among those to insert: a document may have an `_id` already present on the collection, or its vector dimension may not match the collection setting.
For an ordered insertion, the method will raise an exception at the first such faulty document; nevertheless, all documents processed until then will end up being written to the database.
For unordered insertions, if the error stems from faulty documents, the insertion proceeds until exhausting the input documents: then an exception is raised, and all insertable documents will have been written to the database, including those "after" the troublesome ones.
If, on the other hand, there are errors not related to individual documents (such as a network connectivity error), the whole `insert_many` operation will stop midway, an exception will be raised, and only a certain amount of the input documents will have made their way to the database.
Expand source code
def insert_many( self, documents: Iterable[DOC], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, request_timeout_ms: int | None = None, general_method_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInsertManyResult: """ Insert a list of documents into the collection. This is not an atomic operation. Args: documents: an iterable of dictionaries, each a document to insert. Documents may specify their `_id` field or leave it out, in which case it will be added automatically. ordered: if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster. chunk_size: how many documents to include in a single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default. concurrency: maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionInsertManyResult object. Examples: >>> my_coll.count_documents({}, upper_bound=10) 0 >>> my_coll.insert_many( ... [{"a": 10}, {"a": 5}, {"b": [True, False, False]}], ... ordered=True, ... ) CollectionInsertManyResult(raw_results=..., inserted_ids=['184bb06f-...', '...', '...']) >>> my_coll.count_documents({}, upper_bound=100) 3 >>> my_coll.insert_many( ... [{"seq": i} for i in range(50)], ... concurrency=5, ... ) CollectionInsertManyResult(raw_results=..., inserted_ids=[... ...]) >>> my_coll.count_documents({}, upper_bound=100) 53 >>> my_coll.insert_many( ... [ ... {"tag": "a", "$vector": [1, 2]}, ... {"tag": "b", "$vector": [3, 4]}, ... ] ... ) CollectionInsertManyResult(...) Note: Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the document sequence is important. Note: A failure mode for this command is related to certain faulty documents found among those to insert: a document may have the an `_id` already present on the collection, or its vector dimension may not match the collection setting. For an ordered insertion, the method will raise an exception at the first such faulty document -- nevertheless, all documents processed until then will end up being written to the database. For unordered insertions, if the error stems from faulty documents the insertion proceeds until exhausting the input documents: then, an exception is raised -- and all insertable documents will have been written to the database, including those "after" the troublesome ones. If, on the other hand, there are errors not related to individual documents (such as a network connectivity error), the whole `insert_many` operation will stop in mid-way, an exception will be raised, and only a certain amount of the input documents will have made their way to the database. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) if concurrency is None: if ordered: _concurrency = 1 else: _concurrency = DEFAULT_INSERT_MANY_CONCURRENCY else: _concurrency = concurrency if _concurrency > 1 and ordered: raise ValueError("Cannot run ordered insert_many concurrently.") if chunk_size is None: _chunk_size = DEFAULT_INSERT_MANY_CHUNK_SIZE else: _chunk_size = chunk_size _documents = list(documents) logger.info(f"inserting {len(_documents)} documents in '{self.name}'") raw_results: list[dict[str, Any]] = [] timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) if ordered: options = {"ordered": True} inserted_ids: list[Any] = [] for i in range(0, len(_documents), _chunk_size): im_payload = { "insertMany": { "documents": _documents[i : i + _chunk_size], "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") chunk_response = self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") # accumulate the results in this call chunk_inserted_ids = (chunk_response.get("status") or {}).get( "insertedIds", [] ) inserted_ids += chunk_inserted_ids raw_results += [chunk_response] # if errors, quit early if chunk_response.get("errors", []): partial_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) raise CollectionInsertManyException.from_response( command=None, raw_response=chunk_response, partial_result=partial_result, ) # return full_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) logger.info( f"finished inserting {len(_documents)} documents in '{self.name}'" ) return full_result else: # unordered: concurrent or not, do all of them and parse the results options = {"ordered": False} if _concurrency > 1: with ThreadPoolExecutor(max_workers=_concurrency) as executor: def _chunk_insertor( document_chunk: list[dict[str, Any]], ) -> dict[str, Any]: im_payload = { "insertMany": { "documents": document_chunk, "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") im_response = self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") return im_response raw_results = list( executor.map( _chunk_insertor, ( _documents[i : i + _chunk_size] for i in range(0, len(_documents), _chunk_size) ), ) ) else: for i in range(0, len(_documents), _chunk_size): im_payload = { "insertMany": { "documents": _documents[i : i + _chunk_size], "options": options, }, } logger.info(f"insertMany(chunk) on '{self.name}'") im_response = self._converted_request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") 
raw_results.append(im_response) # recast raw_results inserted_ids = [ inserted_id for chunk_response in raw_results for inserted_id in (chunk_response.get("status") or {}).get( "insertedIds", [] ) ] # check-raise if any( [chunk_response.get("errors", []) for chunk_response in raw_results] ): partial_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) raise CollectionInsertManyException.from_responses( commands=[None for _ in raw_results], raw_responses=raw_results, partial_result=partial_result, ) # return full_result = CollectionInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, ) logger.info( f"finished inserting {len(_documents)} documents in '{self.name}'" ) return full_result
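Given the failure modes described in the notes above, large unordered insertions are often wrapped so that the partially-written ids can be recovered. A minimal sketch, under the assumption that the exception class is importable from astrapy.exceptions and exposes the partial result it was constructed with:

# Hedged sketch: the import path and the `partial_result` attribute are
# assumptions based on this module's source, not a documented contract.
from astrapy.exceptions import CollectionInsertManyException

documents = [{"seq": i} for i in range(1000)]
try:
    result = my_coll.insert_many(documents, concurrency=5, chunk_size=50)
    print(f"all {len(result.inserted_ids)} documents written")
except CollectionInsertManyException as exc:
    # only some chunks reached the database before the error surfaced
    print(f"partial write: {len(exc.partial_result.inserted_ids)} documents")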
def insert_one(self, document: DOC, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionInsertOneResult
-
Insert a single document in the collection in an atomic operation.
Args
document
- the dictionary expressing the document to insert. The `_id` field of the document can be left out, in which case it will be created automatically.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a CollectionInsertOneResult object.
Examples
>>> my_coll.count_documents({}, upper_bound=10)
0
>>> my_coll.insert_one(
...     {
...         "age": 30,
...         "name": "Smith",
...         "food": ["pear", "peach"],
...         "likes_fruit": True,
...     },
... )
CollectionInsertOneResult(raw_results=..., inserted_id='ed4587a4-...-...-...')
>>> my_coll.insert_one({"_id": "user-123", "age": 50, "name": "Maccio"})
CollectionInsertOneResult(raw_results=..., inserted_id='user-123')
>>> my_coll.count_documents({}, upper_bound=10)
2
>>> my_coll.insert_one({"tag": "v", "$vector": [10, 11]})
CollectionInsertOneResult(...)
Note
If an `_id` is explicitly provided and corresponds to a document already present in the collection, an error is raised and the insertion fails.
Expand source code
def insert_one( self, document: DOC, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionInsertOneResult: """ Insert a single document in the collection in an atomic operation. Args: document: the dictionary expressing the document to insert. The `_id` field of the document can be left out, in which case it will be created automatically. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionInsertOneResult object. Examples: >>> my_coll.count_documents({}, upper_bound=10) 0 >>> my_coll.insert_one( ... { ... "age": 30, ... "name": "Smith", ... "food": ["pear", "peach"], ... "likes_fruit": True, ... }, ... ) CollectionInsertOneResult(raw_results=..., inserted_id='ed4587a4-...-...-...') >>> my_coll.insert_one({"_id": "user-123", "age": 50, "name": "Maccio"}) CollectionInsertOneResult(raw_results=..., inserted_id='user-123') >>> my_coll.count_documents({}, upper_bound=10) 2 >>> my_coll.insert_one({"tag": "v", "$vector": [10, 11]}) CollectionInsertOneResult(...) Note: If an `_id` is explicitly provided, which corresponds to a document that exists already in the collection, an error is raised and the insertion fails. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) io_payload = {"insertOne": {"document": document}} logger.info(f"insertOne on '{self.name}'") io_response = self._converted_request( payload=io_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished insertOne on '{self.name}'") if "insertedIds" in io_response.get("status", {}): if io_response["status"]["insertedIds"]: inserted_id = io_response["status"]["insertedIds"][0] return CollectionInsertOneResult( raw_results=[io_response], inserted_id=inserted_id, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from insert_one API command.", raw_response=io_response, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from insert_one API command.", raw_response=io_response, )
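Per the note above, inserting an explicit duplicate `_id` raises an error. A sketch of one way to guard against that (the exact exception class is an assumption; astrapy.exceptions is listed among this package's sub-modules):

# Hedged sketch: DataAPIResponseException as the raised type is an
# assumption; adjust to the exception your astrapy version actually raises.
from astrapy.exceptions import DataAPIResponseException

try:
    my_coll.insert_one({"_id": "user-123", "age": 51})
except DataAPIResponseException:
    # "user-123" exists already: fall back to an update instead
    my_coll.update_one({"_id": "user-123"}, {"$set": {"age": 51}})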
def options(self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionDefinition
-
Get the collection options, i.e. its configuration as read from the database.
The method issues a request to the Data API each time it is invoked, without caching mechanisms: this ensures up-to-date information for usages such as real-time collection validation by the application.
Args
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `collection_admin_timeout_ms`.
timeout_ms
- an alias for `collection_admin_timeout_ms`.
Returns
a CollectionDefinition instance describing the collection. (See also the database `list_collections` method.)
Example
>>> my_coll.options()
CollectionDefinition(vector=CollectionVectorOptions(dimension=3, metric='cosine'))
Expand source code
def options( self, *, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionDefinition: """ Get the collection options, i.e. its configuration as read from the database. The method issues a request to the Data API each time is invoked, without caching mechanisms: this ensures up-to-date information for usages such as real-time collection validation by the application. Args: collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a CollectionDefinition instance describing the collection. (See also the database `list_collections` method.) Example: >>> my_coll.options() CollectionDefinition(vector=CollectionVectorOptions(dimension=3, metric='cosine')) """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"getting collections in search of '{self.name}'") self_descriptors = [ coll_desc for coll_desc in self.database._list_collections_ctx( keyspace=None, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label, ), ) if coll_desc.name == self.name ] logger.info(f"finished getting collections in search of '{self.name}'") if self_descriptors: return self_descriptors[0].definition else: raise ValueError( f"Collection {self.keyspace}.{self.name} not found.", )
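Because `options` always reflects the live server-side configuration, it lends itself to runtime validation. A sketch, with attribute names assumed from the CollectionDefinition repr in the example above:

# Hedged sketch: `definition.vector.dimension` follows the repr shown above.
definition = my_coll.options()
candidate = [0.1, 0.2, 0.3]
if definition.vector is not None and definition.vector.dimension == len(candidate):
    my_coll.insert_one({"tag": "validated", "$vector": candidate})
else:
    print("dimension mismatch: skipping insert")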
def replace_one(self, filter: FilterType, replacement: DOC, *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionUpdateResult
-
Replace a single document on the collection with a new one, optionally inserting a new one if no match is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {}, {"name": "John"}, {"price": {"$lt": 100}}, {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}. See the Data API documentation for the full set of operators.
replacement
- the new document to write into the collection.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
upsert
- this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a CollectionUpdateResult object summarizing the outcome of the replace operation.
Example
>>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) >>> my_coll.replace_one({"Marco": {"$exists": True}}, {"Buda": "Pest"}) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}) >>> my_coll.find_one({"Buda": "Pest"}) {'_id': '8424905a-...', 'Buda': 'Pest'} >>> my_coll.replace_one({"Mirco": {"$exists": True}}, {"Oh": "yeah?"}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.replace_one({"Mirco": {"$exists": True}}, {"Oh": "yeah?"}, upsert=True) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '931b47d6-...'})
Expand source code
def replace_one( self, filter: FilterType, replacement: DOC, *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Replace a single document on the collection with a new one, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. replacement: the new document to write into the collection. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, `replacement` is inserted as a new document if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the replace operation. Example: >>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) >>> my_coll.replace_one({"Marco": {"$exists": True}}, {"Buda": "Pest"}) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}) >>> my_coll.find_one({"Buda": "Pest"}) {'_id': '8424905a-...', 'Buda': 'Pest'} >>> my_coll.replace_one({"Mirco": {"$exists": True}}, {"Oh": "yeah?"}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.replace_one({"Mirco": {"$exists": True}}, {"Oh": "yeah?"}, upsert=True) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '931b47d6-...'}) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "upsert": upsert, } fo_payload = { "findOneAndReplace": { k: v for k, v in { "filter": filter, "replacement": replacement, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"findOneAndReplace on '{self.name}'") fo_response = self._converted_request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished findOneAndReplace on '{self.name}'") if "document" in fo_response.get("data", {}): fo_status = fo_response.get("status") or {} _update_info = _prepare_update_info([fo_status]) return CollectionUpdateResult( raw_results=[fo_response], update_info=_update_info, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from find_one_and_replace API command.", raw_response=fo_response, )
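Combining `sort` and `upsert` enables a "replace the closest match" pattern. A sketch, assuming a vector-enabled collection of dimension 2 as in other examples here:

# Hedged sketch: with an empty filter, every document is a candidate and
# the ANN sort selects the single document to be replaced.
result = my_coll.replace_one(
    {},
    {"tag": "nearest", "$vector": [0.9, 0.1]},
    sort={"$vector": [1.0, 0.0]},
)
print(result.update_info.get("nModified", 0), "document(s) replaced")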
def to_async(self: Collection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> AsyncCollection[DOC]
-
Create an AsyncCollection from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this collection in the copy (the database is converted into an async object).
Args
embedding_api_key
- optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `EmbeddingHeadersProvider` should be supplied.
api_options
- any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
the new copy, an AsyncCollection instance.
Example
>>> asyncio.run(my_coll.to_async().count_documents({}, upper_bound=100))
77
Expand source code
def to_async( self: Collection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncCollection[DOC]: """ Create an AsyncCollection from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this collection in the copy (the database is converted into an async object). Args: embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, an AsyncCollection instance. Example: >>> asyncio.run(my_coll.to_async().count_documents({},upper_bound=100)) 77 """ arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AsyncCollection( database=self.database.to_async(), name=self.name, keyspace=self.keyspace, api_options=final_api_options, )
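A slightly fuller conversion sketch for an asyncio program; it assumes the AsyncCollection methods are awaitable mirrors of the synchronous ones documented here:

import asyncio

# Hedged sketch: convert once, then reuse the async collection in a coroutine.
async def bulk_load(documents: list[dict]) -> int:
    async_coll = my_coll.to_async()
    result = await async_coll.insert_many(documents, concurrency=5)
    return len(result.inserted_ids)

total = asyncio.run(bulk_load([{"seq": i} for i in range(20)]))
print(total, "documents inserted")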
def update_many(self, filter: FilterType, update: dict[str, Any], *, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionUpdateResult
-
Apply an update operation to all documents matching a condition, optionally inserting one new document in absence of matches.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {}, {"name": "John"}, {"price": {"$lt": 100}}, {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}. See the Data API documentation for the full set of operators.
update
- the update prescription to apply to the documents, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value"}}, {"$inc": {"counter": 10}}, {"$unset": {"field": ""}}. See the Data API documentation for the full syntax.
upsert
- this parameter controls the behavior in absence of matches. If True, a single new document (resulting from applying `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
general_method_timeout_ms
- a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead.
request_timeout_ms
- a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a CollectionUpdateResult object summarizing the outcome of the update operation.
Example
>>> my_coll.insert_many([{"c": "red"}, {"c": "green"}, {"c": "blue"}]) CollectionInsertManyResult(...) >>> my_coll.update_many({"c": {"$ne": "green"}}, {"$set": {"nongreen": True}}) CollectionUpdateResult(raw_results=..., update_info={'n': 2, 'updatedExisting': True, 'ok': 1.0, 'nModified': 2}) >>> my_coll.update_many({"c": "orange"}, {"$set": {"is_also_fruit": True}}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.update_many( ... {"c": "orange"}, ... {"$set": {"is_also_fruit": True}}, ... upsert=True, ... ) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '46643050-...'})
Note
Similarly to the case of `find` (see its docstring for more details), running this command while, at the same time, another process is inserting new documents which match the filter of the `update_many` can result in an unpredictable fraction of these documents being updated. In other words, it cannot be easily predicted whether a given newly-inserted document will be picked up by the update_many command or not.
Expand source code
def update_many( self, filter: FilterType, update: dict[str, Any], *, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Apply an update operation to all documents matching a condition, optionally inserting one documents in absence of matches. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the documents, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. upsert: this parameter controls the behavior in absence of matches. If True, a single new document (resulting from applying `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method may entail successive HTTP API requests, depending on the amount of involved documents. If not passed, the collection-level setting is used instead. request_timeout_ms: a timeout, in milliseconds, for each API request. If not passed, the collection-level setting is used instead. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the update operation. Example: >>> my_coll.insert_many([{"c": "red"}, {"c": "green"}, {"c": "blue"}]) CollectionInsertManyResult(...) >>> my_coll.update_many({"c": {"$ne": "green"}}, {"$set": {"nongreen": True}}) CollectionUpdateResult(raw_results=..., update_info={'n': 2, 'updatedExisting': True, 'ok': 1.0, 'nModified': 2}) >>> my_coll.update_many({"c": "orange"}, {"$set": {"is_also_fruit": True}}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.update_many( ... {"c": "orange"}, ... {"$set": {"is_also_fruit": True}}, ... upsert=True, ... ) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '46643050-...'}) Note: Similarly to the case of `find` (see its docstring for more details), running this command while, at the same time, another process is inserting new documents which match the filter of the `update_many` can result in an unpredictable fraction of these documents being updated. In other words, it cannot be easily predicted whether a given newly-inserted document will be picked up by the update_many command or not. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) api_options = { "upsert": upsert, } page_state_options: dict[str, str] = {} um_responses: list[dict[str, Any]] = [] um_statuses: list[dict[str, Any]] = [] must_proceed = True logger.info(f"starting update_many on '{self.name}'") timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) while must_proceed: options = {**api_options, **page_state_options} this_um_payload = { "updateMany": { k: v for k, v in { "filter": filter, "update": update, "options": options, }.items() if v is not None } } logger.info(f"updateMany on '{self.name}'") this_um_response = self._converted_request( payload=this_um_payload, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished updateMany on '{self.name}'") this_um_status = this_um_response.get("status") or {} # # if errors, quit early if this_um_response.get("errors", []): partial_update_info = _prepare_update_info(um_statuses) partial_result = CollectionUpdateResult( raw_results=um_responses, update_info=partial_update_info, ) all_um_responses = um_responses + [this_um_response] raise CollectionUpdateManyException.from_responses( commands=[None for _ in all_um_responses], raw_responses=all_um_responses, partial_result=partial_result, ) else: if "status" not in this_um_response: raise UnexpectedDataAPIResponseException( text="Faulty response from update_many API command.", raw_response=this_um_response, ) um_responses.append(this_um_response) um_statuses.append(this_um_status) next_page_state = this_um_status.get("nextPageState") if next_page_state is not None: must_proceed = True page_state_options = {"pageState": next_page_state} else: must_proceed = False page_state_options = {} update_info = _prepare_update_info(um_statuses) logger.info(f"finished update_many on '{self.name}'") return CollectionUpdateResult( raw_results=um_responses, update_info=update_info, )
def update_one(self, filter: FilterType, update: dict[str, Any], *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> CollectionUpdateResult
-
Update a single document on the collection as requested, optionally inserting a new one if no match is found.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {}, {"name": "John"}, {"price": {"$lt": 100}}, {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]}. See the Data API documentation for the full set of operators.
update
- the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value"}}, {"$inc": {"counter": 10}}, {"$unset": {"field": ""}}. See the Data API documentation for the full syntax.
sort
- with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the updated one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`.
upsert
- this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `general_method_timeout_ms`.
timeout_ms
- an alias for `general_method_timeout_ms`.
Returns
a CollectionUpdateResult object summarizing the outcome of the update operation.
Example
>>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) >>> my_coll.update_one({"Marco": {"$exists": True}}, {"$inc": {"rank": 3}}) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}) >>> my_coll.update_one({"Mirko": {"$exists": True}}, {"$inc": {"rank": 3}}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.update_one({"Mirko": {"$exists": True}}, {"$inc": {"rank": 3}}, upsert=True) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '2a45ff60-...'})
Expand source code
def update_one( self, filter: FilterType, update: dict[str, Any], *, sort: SortType | None = None, upsert: bool = False, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> CollectionUpdateResult: """ Update a single document on the collection as requested, optionally inserting a new one if no match is found. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"price": {"$lt": 100}} {"$and": [{"name": "John"}, {"price": {"$lt": 100}}]} See the Data API documentation for the full set of operators. update: the update prescription to apply to the document, expressed as a dictionary as per Data API syntax. Examples are: {"$set": {"field": "value}} {"$inc": {"counter": 10}} {"$unset": {"field": ""}} See the Data API documentation for the full syntax. sort: with this dictionary parameter one can control the sorting order of the documents matching the filter, effectively determining what document will come first and hence be the replaced one. See the `find` method for more on sorting. Vector-based ANN sorting is achieved by providing a "$vector" or a "$vectorize" key in `sort`. upsert: this parameter controls the behavior in absence of matches. If True, a new document (resulting from applying the `update` to an empty document) is inserted if no matches are found on the collection. If False, the operation silently does nothing in case of no matches. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a CollectionUpdateResult object summarizing the outcome of the update operation. Example: >>> my_coll.insert_one({"Marco": "Polo"}) CollectionInsertOneResult(...) 
>>> my_coll.update_one({"Marco": {"$exists": True}}, {"$inc": {"rank": 3}}) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': True, 'ok': 1.0, 'nModified': 1}) >>> my_coll.update_one({"Mirko": {"$exists": True}}, {"$inc": {"rank": 3}}) CollectionUpdateResult(raw_results=..., update_info={'n': 0, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0}) >>> my_coll.update_one({"Mirko": {"$exists": True}}, {"$inc": {"rank": 3}}, upsert=True) CollectionUpdateResult(raw_results=..., update_info={'n': 1, 'updatedExisting': False, 'ok': 1.0, 'nModified': 0, 'upserted': '2a45ff60-...'}) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { "upsert": upsert, } uo_payload = { "updateOne": { k: v for k, v in { "filter": filter, "update": update, "options": options, "sort": sort, }.items() if v is not None } } logger.info(f"updateOne on '{self.name}'") uo_response = self._converted_request( payload=uo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished updateOne on '{self.name}'") if "status" in uo_response: uo_status = uo_response["status"] _update_info = _prepare_update_info([uo_status]) return CollectionUpdateResult( raw_results=[uo_response], update_info=_update_info, ) else: raise UnexpectedDataAPIResponseException( text="Faulty response from updateOne API command.", raw_response=uo_response, )
def with_options(self: Collection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> Collection[DOC]
-
Create a clone of this collection with some changed attributes.
Args
embedding_api_key
- optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `EmbeddingHeadersProvider` should be supplied.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new Collection instance.
Example
>>> collection_with_api_key_configured = my_collection.with_options(
...     embedding_api_key="secret-key-0123abcd...",
... )
Expand source code
def with_options( self: Collection[DOC], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: """ Create a clone of this collection with some changed attributes. Args: embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new Collection instance. Example: >>> collection_with_api_key_configured = my_collection.with_options( ... embedding_api_key="secret-key-0123abcd...", ... ) """ return self._copy( embedding_api_key=embedding_api_key, api_options=api_options, )
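A sketch showing that the clone is independent of the original: the derived collection sends the header-based vectorize key while my_collection is left untouched (the key string is a placeholder, and "$vectorize" requires a collection configured with an embedding service):

# Hedged sketch: requires a vectorize-enabled collection.
scoped_coll = my_collection.with_options(
    embedding_api_key="secret-key-0123abcd...",
)
scoped_coll.insert_one({"$vectorize": "a short text to embed"})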
class DataAPIClient (token: str | TokenProvider | UnsetType = (unset), *, environment: str | UnsetType = (unset), callers: Sequence[CallerType] | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset))
-
A client for using the Data API. This is the entry point, sitting at the top of the conceptual "client -> database -> collection" hierarchy and of the "client -> admin -> database admin" chain as well.
A client is created first, optionally passing it a suitable Access Token. Starting from the client, then:
- databases (Database and AsyncDatabase) are created for working with data
- AstraDBAdmin objects can be created for admin-level work
Args
token
- an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `TokenProvider`. Note that generally one should pass the token later, when spawning Database instances from the client (with the `get_database` method of DataAPIClient); the reason is that the typical tokens are scoped to a single database. However, when performing administrative tasks at the AstraDBAdmin level (such as creating databases), an org-wide token is required; in that case it makes sense to provide it when creating the DataAPIClient instance.
environment
- a string representing the target Data API environment. It can be left unspecified for the default value of `Environment.PROD`; other values include `Environment.OTHER`, `Environment.DSE`.
callers
- a list of caller identities, i.e. applications, or frameworks, on behalf of which Data API and DevOps API calls are performed. These end up in the request user-agent. Each caller identity is a ("caller_name", "caller_version") pair.
api_options
- a specification, complete or partial, of the API Options to override the system defaults. This allows for a deeper configuration than what the named parameters (token, environment, callers) offer. If this is passed alongside these named parameters, those will take precedence.
Example
>>> from astrapy import DataAPIClient
>>> from astrapy.info import CollectionDefinition
>>> my_client = DataAPIClient()
>>> my_db0 = my_client.get_database(
...     "https://01234567-....apps.astra.datastax.com",
...     token="AstraCS:...",
... )
>>> my_coll = my_db0.create_collection(
...     "movies",
...     definition=(
...         CollectionDefinition.builder()
...         .set_vector_dimension(2)
...         .build()
...     ),
... )
>>> my_coll.insert_one({"title": "The Title", "$vector": [0.1, 0.3]})
>>> my_db1 = my_client.get_database("01234567-...")
>>> my_db2 = my_client.get_database("01234567-...", region="us-east1")
>>> my_adm0 = my_client.get_admin()
>>> my_adm1 = my_client.get_admin(token=more_powerful_token_override)
>>> database_list = my_adm0.list_databases()
Expand source code
class DataAPIClient: """ A client for using the Data API. This is the entry point, sitting at the top of the conceptual "client -> database -> collection" hierarchy and of the "client -> admin -> database admin" chain as well. A client is created first, optionally passing it a suitable Access Token. Starting from the client, then: - databases (Database and AsyncDatabase) are created for working with data - AstraDBAdmin objects can be created for admin-level work Args: token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. Note that generally one should pass the token later, when spawning Database instances from the client (with the `get_database`) method of DataAPIClient; the reason is that the typical tokens are scoped to a single database. However, when performing administrative tasks at the AstraDBAdmin level (such as creating databases), an org-wide token is required -- then it makes sense to provide it when creating the DataAPIClient instance. environment: a string representing the target Data API environment. It can be left unspecified for the default value of `Environment.PROD`; other values include `Environment.OTHER`, `Environment.DSE`. callers: a list of caller identities, i.e. applications, or frameworks, on behalf of which Data API and DevOps API calls are performed. These end up in the request user-agent. Each caller identity is a ("caller_name", "caller_version") pair. api_options: a specification - complete or partial - of the API Options to override the system defaults. This allows for a deeper configuration than what the named parameters (token, environment, callers) offer. If this is passed alongside these named parameters, those will take precedence. Example: >>> from astrapy import DataAPIClient >>> from astrapy.info import CollectionDefinition >>> my_client = DataAPIClient() >>> my_db0 = my_client.get_database( ... "https://01234567-....apps.astra.datastax.com", ... token="AstraCS:...", ... ) >>> my_coll = my_db0.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ), ... ) >>> my_coll.insert_one({"title": "The Title", "$vector": [0.1, 0.3]}) >>> my_db1 = my_client.get_database("01234567-...") >>> my_db2 = my_client.get_database("01234567-...", region="us-east1") >>> my_adm0 = my_client.get_admin() >>> my_adm1 = my_client.get_admin(token=more_powerful_token_override) >>> database_list = my_adm0.list_databases() """ def __init__( self, token: str | TokenProvider | UnsetType = _UNSET, *, environment: str | UnsetType = _UNSET, callers: Sequence[CallerType] | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> None: # this parameter bootstraps the defaults, has a special treatment: _environment: str if isinstance(environment, UnsetType): _environment = Environment.PROD.lower() else: _environment = environment.lower() if _environment not in Environment.values: raise InvalidEnvironmentException( f"Unsupported `environment` value: '{_environment}'." 
) arg_api_options = APIOptions( callers=callers, token=token, ) self.api_options = ( defaultAPIOptions(_environment) .with_override(api_options) .with_override(arg_api_options) ) def __repr__(self) -> str: return f"{self.__class__.__name__}({self.api_options})" def __eq__(self, other: Any) -> bool: if isinstance(other, DataAPIClient): return all( [ self.api_options.token == other.api_options.token, self.api_options.environment == other.api_options.environment, self.api_options.callers == other.api_options.callers, ] ) else: return False def __getitem__(self, api_endpoint: str) -> Database: return self.get_database(api_endpoint=api_endpoint) def _copy( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> DataAPIClient: arg_api_options = APIOptions(token=token) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return DataAPIClient( token=token, environment=final_api_options.environment, api_options=final_api_options, ) def with_options( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> DataAPIClient: """ Create a clone of this DataAPIClient with some changed attributes. Args: token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new DataAPIClient instance. Example: >>> other_auth_client = my_client.with_options( ... token="AstraCS:xyz...", ... ) """ return self._copy( token=token, api_options=api_options, ) def get_database( self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = _UNSET, keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Get a Database object from this client, for doing data-related work. Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object. token: if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. keyspace: if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: a Database object with which to work on Data API collections. Example: >>> my_db1 = my_client.get_database( ... "https://01234567-...us-west1.apps.astra.datastax.com", ... ) >>> my_db2 = my_client.get_database( ... "https://01234567-...us-west1.apps.astra.datastax.com", ... token="AstraCS:...", ... keyspace="prod_keyspace", ... ) >>> my_coll = my_db0.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... 
.set_vector_dimension(2) ... .build() ... ), ... ) >>> my_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]}) Note: This method does not perform any admin-level operation through the DevOps API. For actual creation of a database, see the `create_database` method of class AstraDBAdmin. """ # lazy importing here to avoid circular dependency from astrapy import Database arg_api_options = APIOptions(token=token) resulting_api_options = self.api_options.with_override( spawn_api_options ).with_override(arg_api_options) if resulting_api_options.environment in Environment.astra_db_values: parsed_api_endpoint = parse_api_endpoint(api_endpoint) if parsed_api_endpoint is not None: if parsed_api_endpoint.environment != resulting_api_options.environment: raise InvalidEnvironmentException( "Environment mismatch between client and provided " "API endpoint. You can try adding " f'`environment="{parsed_api_endpoint.environment}"` ' "to the DataAPIClient creation statement." ) return Database( api_endpoint=api_endpoint, keyspace=keyspace, api_options=resulting_api_options, ) else: msg = api_endpoint_parsing_error_message(api_endpoint) raise ValueError(msg) else: parsed_generic_api_endpoint = parse_generic_api_url(api_endpoint) if parsed_generic_api_endpoint: return Database( api_endpoint=parsed_generic_api_endpoint, keyspace=keyspace, api_options=resulting_api_options, ) else: msg = generic_api_url_parsing_error_message(api_endpoint) raise ValueError(msg) def get_async_database( self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = _UNSET, keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Get an AsyncDatabase object from this client, for doing data-related work. Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object. token: if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. keyspace: if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: an AsyncDatabase object with which to work on Data API collections. Example: >>> async def create_use_db(cl: DataAPIClient, api_ep: str) -> None: ... async_db = cl.get_async_database(api_ep) ... my_a_coll = await async_db.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ) ... ) ... await my_a_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]}) ... >>> asyncio.run( ... create_use_db( ... my_client, ... "https://01234567-...us-west1.apps.astra.datastax.com", ... ) ... ) Note: This method does not perform any admin-level operation through the DevOps API. For actual creation of a database, see the `create_database` method of class AstraDBAdmin. 
""" return self.get_database( api_endpoint=api_endpoint, token=token, keyspace=keyspace, spawn_api_options=spawn_api_options, ).to_async() def get_database_by_api_endpoint( self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = _UNSET, keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Get a Database object from this client, for doing data-related work. Note: this is an alias for `get_database` (see). Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object. token: if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. keyspace: if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: a Database object with which to work on Data API collections. """ return self.get_database( api_endpoint=api_endpoint, token=token, keyspace=keyspace, spawn_api_options=spawn_api_options, ) def get_async_database_by_api_endpoint( self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = _UNSET, keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Get an AsyncDatabase object from this client, for doing data-related work. Note: this is an alias for `get_async_database` (see). Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object. token: if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. keyspace: if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: an AsyncDatabase object with which to work on Data API collections. """ return self.get_async_database( api_endpoint=api_endpoint, token=token, keyspace=keyspace, spawn_api_options=spawn_api_options, ) def get_admin( self, *, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AstraDBAdmin: """ Get an AstraDBAdmin instance corresponding to this client, for admin work such as managing databases. Args: token: if supplied, is passed to the Astra DB Admin instead of the client token. 
This may be useful when switching to a more powerful, admin-capable permission set. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: An AstraDBAdmin instance, with which to perform management at the database level. Example: >>> my_adm0 = my_client.get_admin() >>> my_adm1 = my_client.get_admin(token=more_powerful_token_override) >>> database_list = my_adm0.list_databases() >>> my_db_admin = my_adm0.create_database( ... "the_other_database", ... cloud_provider="AWS", ... region="eu-west-1", ... ) >>> my_db_admin.list_keyspaces() ['default_keyspace', 'that_other_one'] """ # lazy importing here to avoid circular dependency from astrapy.admin import AstraDBAdmin arg_api_options = APIOptions(token=token) resulting_api_options = self.api_options.with_override( spawn_api_options ).with_override(arg_api_options) if resulting_api_options.environment not in Environment.astra_db_values: raise InvalidEnvironmentException( "Method not supported outside of Astra DB." ) return AstraDBAdmin(api_options=resulting_api_options)
Methods
def get_admin(self, *, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AstraDBAdmin
-
Get an AstraDBAdmin instance corresponding to this client, for admin work such as managing databases.
Args
token
- if supplied, is passed to the Astra DB Admin instead of the client token. This may be useful when switching to a more powerful, admin-capable permission set. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings.
Returns
An AstraDBAdmin instance, with which to perform management at the database level.
Example
>>> my_adm0 = my_client.get_admin() >>> my_adm1 = my_client.get_admin(token=more_powerful_token_override) >>> database_list = my_adm0.list_databases() >>> my_db_admin = my_adm0.create_database( ... "the_other_database", ... cloud_provider="AWS", ... region="eu-west-1", ... ) >>> my_db_admin.list_keyspaces() ['default_keyspace', 'that_other_one']
Expand source code
def get_admin( self, *, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AstraDBAdmin: """ Get an AstraDBAdmin instance corresponding to this client, for admin work such as managing databases. Args: token: if supplied, is passed to the Astra DB Admin instead of the client token. This may be useful when switching to a more powerful, admin-capable permission set. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: An AstraDBAdmin instance, with which to perform management at the database level. Example: >>> my_adm0 = my_client.get_admin() >>> my_adm1 = my_client.get_admin(token=more_powerful_token_override) >>> database_list = my_adm0.list_databases() >>> my_db_admin = my_adm0.create_database( ... "the_other_database", ... cloud_provider="AWS", ... region="eu-west-1", ... ) >>> my_db_admin.list_keyspaces() ['default_keyspace', 'that_other_one'] """ # lazy importing here to avoid circular dependency from astrapy.admin import AstraDBAdmin arg_api_options = APIOptions(token=token) resulting_api_options = self.api_options.with_override( spawn_api_options ).with_override(arg_api_options) if resulting_api_options.environment not in Environment.astra_db_values: raise InvalidEnvironmentException( "Method not supported outside of Astra DB." ) return AstraDBAdmin(api_options=resulting_api_options)
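Beyond a token override, spawn_api_options accepts any partial APIOptions for the spawned admin. The following is a minimal, illustrative sketch (it assumes a TimeoutOptions class in astrapy.api_options exposing a request_timeout_ms setting, and uses a placeholder token value):
>>> from astrapy.api_options import APIOptions, TimeoutOptions
>>> my_admin = my_client.get_admin(
...     token="AstraCS:admin...",
...     spawn_api_options=APIOptions(
...         timeout_options=TimeoutOptions(request_timeout_ms=20000),
...     ),
... )
Since the named token parameter is applied after spawn_api_options, it wins wherever the two overlap.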
def get_async_database(self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = (unset), keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncDatabase
-
Get an AsyncDatabase object from this client, for doing data-related work.
Args
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object.
token
- if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of TokenProvider.
keyspace
- if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings.
Returns
an AsyncDatabase object with which to work on Data API collections.
Example
>>> async def create_use_db(cl: DataAPIClient, api_ep: str) -> None: ... async_db = cl.get_async_database(api_ep) ... my_a_coll = await async_db.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ) ... ) ... await my_a_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]}) ... >>> asyncio.run( ... create_use_db( ... my_client, ... "https://01234567-...us-west1.apps.astra.datastax.com", ... ) ... )
Note
This method does not perform any admin-level operation through the DevOps API. For actual creation of a database, see the
create_database
method of class AstraDBAdmin.
Expand source code
def get_async_database( self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = _UNSET, keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Get an AsyncDatabase object from this client, for doing data-related work. Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object. token: if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. keyspace: if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: an AsyncDatabase object with which to work on Data API collections. Example: >>> async def create_use_db(cl: DataAPIClient, api_ep: str) -> None: ... async_db = cl.get_async_database(api_ep) ... my_a_coll = await async_db.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ) ... ) ... await my_a_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]}) ... >>> asyncio.run( ... create_use_db( ... my_client, ... "https://01234567-...us-west1.apps.astra.datastax.com", ... ) ... ) Note: This method does not perform any admin-level operation through the DevOps API. For actual creation of a database, see the `create_database` method of class AstraDBAdmin. """ return self.get_database( api_endpoint=api_endpoint, token=token, keyspace=keyspace, spawn_api_options=spawn_api_options, ).to_async()
def get_async_database_by_api_endpoint(self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = (unset), keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncDatabase
-
Get an AsyncDatabase object from this client, for doing data-related work.
Note: this is an alias for
get_async_database
(see).
Args
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object.
token
- if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of TokenProvider.
keyspace
- if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings.
Returns
an AsyncDatabase object with which to work on Data API collections.
Expand source code
def get_async_database_by_api_endpoint( self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = _UNSET, keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Get an AsyncDatabase object from this client, for doing data-related work. Note: this is an alias for `get_async_database` (see). Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object. token: if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. keyspace: if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: an AsyncDatabase object with which to work on Data API collections. """ return self.get_async_database( api_endpoint=api_endpoint, token=token, keyspace=keyspace, spawn_api_options=spawn_api_options, )
def get_database(self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = (unset), keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Database
-
Get a Database object from this client, for doing data-related work.
Args
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object.
token
- if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of TokenProvider.
keyspace
- if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings.
Returns
a Database object with which to work on Data API collections.
Example
>>> my_db1 = my_client.get_database( ... "https://01234567-...us-west1.apps.astra.datastax.com", ... ) >>> my_db2 = my_client.get_database( ... "https://01234567-...us-west1.apps.astra.datastax.com", ... token="AstraCS:...", ... keyspace="prod_keyspace", ... ) >>> my_coll = my_db0.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ), ... ) >>> my_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]})
Note
This method does not perform any admin-level operation through the DevOps API. For actual creation of a database, see the
create_database
method of class AstraDBAdmin.
Expand source code
def get_database( self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = _UNSET, keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Get a Database object from this client, for doing data-related work. Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object. token: if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. keyspace: if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: a Database object with which to work on Data API collections. Example: >>> my_db1 = my_client.get_database( ... "https://01234567-...us-west1.apps.astra.datastax.com", ... ) >>> my_db2 = my_client.get_database( ... "https://01234567-...us-west1.apps.astra.datastax.com", ... token="AstraCS:...", ... keyspace="prod_keyspace", ... ) >>> my_coll = my_db0.create_collection( ... "movies", ... definition=( ... CollectionDefinition.builder() ... .set_vector_dimension(2) ... .build() ... ), ... ) >>> my_coll.insert_one({"title": "The Title", "$vector": [0.3, 0.4]}) Note: This method does not perform any admin-level operation through the DevOps API. For actual creation of a database, see the `create_database` method of class AstraDBAdmin. """ # lazy importing here to avoid circular dependency from astrapy import Database arg_api_options = APIOptions(token=token) resulting_api_options = self.api_options.with_override( spawn_api_options ).with_override(arg_api_options) if resulting_api_options.environment in Environment.astra_db_values: parsed_api_endpoint = parse_api_endpoint(api_endpoint) if parsed_api_endpoint is not None: if parsed_api_endpoint.environment != resulting_api_options.environment: raise InvalidEnvironmentException( "Environment mismatch between client and provided " "API endpoint. You can try adding " f'`environment="{parsed_api_endpoint.environment}"` ' "to the DataAPIClient creation statement." ) return Database( api_endpoint=api_endpoint, keyspace=keyspace, api_options=resulting_api_options, ) else: msg = api_endpoint_parsing_error_message(api_endpoint) raise ValueError(msg) else: parsed_generic_api_endpoint = parse_generic_api_url(api_endpoint) if parsed_generic_api_endpoint: return Database( api_endpoint=parsed_generic_api_endpoint, keyspace=keyspace, api_options=resulting_api_options, ) else: msg = generic_api_url_parsing_error_message(api_endpoint) raise ValueError(msg)
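As the source above shows, a mismatch between the client's environment and the one parsed from the API endpoint raises InvalidEnvironmentException; the remedy suggested by the error message is to declare the environment when creating the client. A hedged sketch (the dev-environment endpoint shown is purely illustrative):
>>> from astrapy import DataAPIClient
>>> from astrapy.constants import Environment
>>> dev_client = DataAPIClient("AstraCS:...", environment=Environment.DEV)
>>> dev_db = dev_client.get_database(
...     "https://01234567-...us-west1.apps.astra-dev.datastax.com",
... )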
def get_database_by_api_endpoint(self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = (unset), keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Database
-
Get a Database object from this client, for doing data-related work.
Note: this is an alias for
get_database
(see).
Args
api_endpoint
- the API Endpoint for the target database (e.g. https://<ID>-<REGION>.apps.astra.datastax.com). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object.
token
- if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of TokenProvider.
keyspace
- if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings.
Returns
a Database object with which to work on Data API collections.
Expand source code
def get_database_by_api_endpoint( self, api_endpoint: str, *, token: str | TokenProvider | UnsetType = _UNSET, keyspace: str | None = None, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Get a Database object from this client, for doing data-related work. Note: this is an alias for `get_database` (see). Args: api_endpoint: the API Endpoint for the target database (e.g. `https://<ID>-<REGION>.apps.astra.datastax.com`). The database must exist already for the resulting object to be effectively used; in other words, this invocation does not create the database, just the object instance. Actual admin work can be achieved by using the AstraDBAdmin object. token: if supplied, is passed to the Database instead of the client token. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. keyspace: if provided, it is passed to the Database; otherwise the Database class will apply an environment-specific default. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: a Database object with which to work on Data API collections. """ return self.get_database( api_endpoint=api_endpoint, token=token, keyspace=keyspace, spawn_api_options=spawn_api_options, )
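Being a pure alias, it is used exactly like get_database; for instance (endpoint abridged as in the examples above):
>>> my_db = my_client.get_database_by_api_endpoint(
...     "https://01234567-...us-west1.apps.astra.datastax.com",
... )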
def with_options(self, *, token: str | TokenProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> DataAPIClient
-
Create a clone of this DataAPIClient with some changed attributes.
Args
token
- an Access Token to the database. Example: "AstraCS:xyz...". This can be either a literal token string or a subclass of TokenProvider.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new DataAPIClient instance.
Example
>>> other_auth_client = my_client.with_options( ... token="AstraCS:xyz...", ... )
Expand source code
def with_options( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> DataAPIClient: """ Create a clone of this DataAPIClient with some changed attributes. Args: token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new DataAPIClient instance. Example: >>> other_auth_client = my_client.with_options( ... token="AstraCS:xyz...", ... ) """ return self._copy( token=token, api_options=api_options, )
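A clone can likewise adjust settings other than the token through api_options. A minimal sketch, assuming (as in the earlier sketch) a TimeoutOptions class in astrapy.api_options with a general_method_timeout_ms setting:
>>> from astrapy.api_options import APIOptions, TimeoutOptions
>>> patient_client = my_client.with_options(
...     api_options=APIOptions(
...         timeout_options=TimeoutOptions(general_method_timeout_ms=60000),
...     ),
... )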
class DataAPIDatabaseAdmin (*, api_endpoint: str, api_options: FullAPIOptions, spawner_database: Database | AsyncDatabase | None = None)
-
An "admin" object for non-Astra Data API environments, to perform administrative tasks at the keyspaces level such as creating/listing/dropping keyspaces.
Conforming to the architecture of non-Astra deployments of the Data API, this object works within the one existing database. It is within that database that the keyspace CRUD operations (and possibly other admin operations) are performed. Since non-Astra environments lack the concept of an overall admin (such as the all-databases AstraDBAdmin class), a DataAPIDatabaseAdmin is generally created by invoking the get_database_admin method of the corresponding Database object (which in turn is spawned by a DataAPIClient).
Args
api_endpoint
- the full URI to access the Data API, e.g. "http://localhost:8181".
api_options
- a complete specification of the API Options for this instance.
spawner_database
- either a Database or an AsyncDatabase instance. This represents the database class which spawns this admin object, so that, if required, a keyspace creation can retroactively "use" the new keyspace in the spawner. Used to enable the (Async)Database.get_database_admin().create_keyspace() pattern.
Example
>>> from astrapy import DataAPIClient >>> from astrapy.constants import Environment >>> from astrapy.authentication import UsernamePasswordTokenProvider >>> >>> token_provider = UsernamePasswordTokenProvider("username", "password") >>> endpoint = "http://localhost:8181" >>> >>> client = DataAPIClient( ... token=token_provider, ... environment=Environment.OTHER, ... ) >>> database = client.get_database(endpoint) >>> admin_for_my_db = database.get_database_admin() >>> >>> admin_for_my_db.list_keyspaces() ['keyspace1', 'keyspace2']
Note
a more powerful token may be required than the one sufficient for working in the Database, Collection and Table classes. Check the provided token if "Unauthorized" errors are encountered.
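The spawner_database mechanics described in the Args above enable a create-and-switch pattern; a sketch, reusing the client and endpoint from the example and assuming the Database exposes its current keyspace as the keyspace attribute:
>>> database = client.get_database(endpoint, keyspace="default_keyspace")
>>> database.get_database_admin().create_keyspace(
...     "fresh_keyspace",
...     update_db_keyspace=True,
... )
>>> database.keyspace
'fresh_keyspace'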
Expand source code
class DataAPIDatabaseAdmin(DatabaseAdmin): """ An "admin" object for non-Astra Data API environments, to perform administrative tasks at the keyspaces level such as creating/listing/dropping keyspaces. Conforming to the architecture of non-Astra deployments of the Data API, this object works within the one existing database. It is within that database that the keyspace CRUD operations (and possibly other admin operations) are performed. Since non-Astra environments lack the concept of an overall admin (such as the all-databases AstraDBAdmin class), a `DataAPIDatabaseAdmin` is generally created by invoking the `get_database_admin` method of the corresponding `Database` object (which in turn is spawned by a DataAPIClient). Args: api_endpoint: the full URI to access the Data API, e.g. "http://localhost:8181". api_options: a complete specification of the API Options for this instance. spawner_database: either a Database or an AsyncDatabase instance. This represents the database class which spawns this admin object, so that, if required, a keyspace creation can retroactively "use" the new keyspace in the spawner. Used to enable the (Async)Database.get_database_admin().create_keyspace() pattern. Example: >>> from astrapy import DataAPIClient >>> from astrapy.constants import Environment >>> from astrapy.authentication import UsernamePasswordTokenProvider >>> >>> token_provider = UsernamePasswordTokenProvider("username", "password") >>> endpoint = "http://localhost:8181" >>> >>> client = DataAPIClient( ... token=token_provider, ... environment=Environment.OTHER, ... ) >>> database = client.get_database(endpoint) >>> admin_for_my_db = database.get_database_admin() >>> >>> admin_for_my_db.list_keyspaces() ['keyspace1', 'keyspace2'] Note: a more powerful token may be required than the one sufficient for working in the Database, Collection and Table classes. Check the provided token if "Unauthorized" errors are encountered.
""" def __init__( self, *, api_endpoint: str, api_options: FullAPIOptions, spawner_database: Database | AsyncDatabase | None = None, ) -> None: # lazy import here to avoid circular dependency from astrapy.database import Database self.api_options = api_options self.api_endpoint = api_endpoint if spawner_database is not None: self.spawner_database = spawner_database else: # leaving the keyspace to its per-environment default # (a task for the Database) self.spawner_database = Database( api_endpoint=self.api_endpoint, keyspace=None, api_options=self.api_options, ) # even if Data API, this is admin and must use the Admin additional headers: self._commander_headers = { DEFAULT_DATA_API_AUTH_HEADER: self.api_options.token.get_token(), **self.api_options.admin_additional_headers, } self._api_commander = self._get_api_commander() def __repr__(self) -> str: parts = [ f'api_endpoint="{self.api_endpoint}"', f"api_options={self.api_options}", ] return f"{self.__class__.__name__}({', '.join(parts)})" def __eq__(self, other: Any) -> bool: if isinstance(other, DataAPIDatabaseAdmin): return all( [ self.api_endpoint == other.api_endpoint, self.api_options == other.api_options, ] ) else: return False def _get_api_commander(self) -> APICommander: base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.data_api_url_options.api_path, self.api_options.data_api_url_options.api_version, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" api_commander = APICommander( api_endpoint=self.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, ) return api_commander def _copy( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> DataAPIDatabaseAdmin: arg_api_options = APIOptions( token=token, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return DataAPIDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=final_api_options, spawner_database=self.spawner_database, ) def with_options( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> DataAPIDatabaseAdmin: """ Create a clone of this DataAPIDatabaseAdmin with some changed attributes. Args: token: an access token with enough permission to perform admin tasks. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new DataAPIDatabaseAdmin instance. Example: >>> admin_for_my_other_db = admin_for_my_db.with_options( ... api_endpoint="http://10.1.1.5:8181", ... ) """ return self._copy( token=token, api_options=api_options, ) def list_keyspaces( self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ Query the API for a list of the keyspaces in the database. Args: keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. 
timeout_ms: an alias for `keyspace_admin_timeout_ms`. Returns: A list of the keyspaces, each a string, in no particular order. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'staging_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("getting list of keyspaces") fn_response = self._api_commander.request( payload={"findKeyspaces": {}}, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if "keyspaces" not in fn_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findKeyspaces API command.", raw_response=fn_response, ) else: logger.info("finished getting list of keyspaces") return fn_response["status"]["keyspaces"] # type: ignore[no-any-return] def create_keyspace( self, name: str, *, replication_options: dict[str, Any] | None = None, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any, ) -> None: """ Create a keyspace in the database. Args: name: the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op. replication_options: this dictionary can specify the options about replication of the keyspace (across database nodes). If provided, it must have a structure similar to: `{"class": "SimpleStrategy", "replication_factor": 1}`. update_db_keyspace: if True, the `Database` or `AsyncDatabase` class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns. keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored. 
Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace'] >>> admin_for_my_db.create_keyspace("that_other_one") >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'that_other_one'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { k: v for k, v in { "replication": replication_options, }.items() if v } payload = { "createKeyspace": { **{"name": name}, **({"options": options} if options else {}), } } logger.info("creating keyspace") cn_response = self._api_commander.request( payload=payload, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if (cn_response.get("status") or {}).get("ok") != 1: raise UnexpectedDataAPIResponseException( text="Faulty response from createKeyspace API command.", raw_response=cn_response, ) else: logger.info("finished creating keyspace") if update_db_keyspace: self.spawner_database.use_keyspace(name) def drop_keyspace( self, name: str, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop (delete) a keyspace from the database. Args: name: the keyspace to delete. If it does not exist in this database, an error is raised. keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'that_other_one'] >>> admin_for_my_db.drop_keyspace("that_other_one") >>> admin_for_my_db.list_keyspaces() ['default_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("dropping keyspace") dn_response = self._api_commander.request( payload={"dropKeyspace": {"name": name}}, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if (dn_response.get("status") or {}).get("ok") != 1: raise UnexpectedDataAPIResponseException( text="Faulty response from dropKeyspace API command.", raw_response=dn_response, ) else: logger.info("finished dropping keyspace") async def async_list_keyspaces( self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ Query the API for a list of the keyspaces in the database. Async version of the method, for use in an asyncio context. Args: keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Returns: A list of the keyspaces, each a string, in no particular order. 
Example: >>> asyncio.run(admin_for_my_db.async_list_keyspaces()) ['default_keyspace', 'staging_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("getting list of keyspaces, async") fn_response = await self._api_commander.async_request( payload={"findKeyspaces": {}}, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if "keyspaces" not in fn_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findKeyspaces API command.", raw_response=fn_response, ) else: logger.info("finished getting list of keyspaces, async") return fn_response["status"]["keyspaces"] # type: ignore[no-any-return] async def async_create_keyspace( self, name: str, *, replication_options: dict[str, Any] | None = None, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any, ) -> None: """ Create a keyspace in the database. Async version of the method, for use in an asyncio context. Args: name: the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op. replication_options: this dictionary can specify the options about replication of the keyspace (across database nodes). If provided, it must have a structure similar to: `{"class": "SimpleStrategy", "replication_factor": 1}`. update_db_keyspace: if True, the `Database` or `AsyncDatabase` class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns. keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace'] >>> asyncio.run(admin_for_my_db.async_create_keyspace( ... "that_other_one" ... 
)) >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'that_other_one'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { k: v for k, v in { "replication": replication_options, }.items() if v } payload = { "createKeyspace": { **{"name": name}, **({"options": options} if options else {}), } } logger.info("creating keyspace, async") cn_response = await self._api_commander.async_request( payload=payload, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if (cn_response.get("status") or {}).get("ok") != 1: raise UnexpectedDataAPIResponseException( text="Faulty response from createKeyspace API command.", raw_response=cn_response, ) else: logger.info("finished creating keyspace, async") if update_db_keyspace: self.spawner_database.use_keyspace(name) async def async_drop_keyspace( self, name: str, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop (delete) a keyspace from the database. Async version of the method, for use in an asyncio context. Args: name: the keyspace to delete. If it does not exist in this database, an error is raised. keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> admin_for_my_db.list_keyspaces() ['that_other_one', 'default_keyspace'] >>> asyncio.run(admin_for_my_db.async_drop_keyspace( ... "that_other_one" ... )) >>> admin_for_my_db.list_keyspaces() ['default_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("dropping keyspace, async") dn_response = await self._api_commander.async_request( payload={"dropKeyspace": {"name": name}}, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if (dn_response.get("status") or {}).get("ok") != 1: raise UnexpectedDataAPIResponseException( text="Faulty response from dropKeyspace API command.", raw_response=dn_response, ) else: logger.info("finished dropping keyspace, async") def get_database( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a Database instance for a specific database, to be used when doing data-level work (such as creating/managing collections). Args: keyspace: an optional keyspace to set in the resulting Database. If not set, the keyspace remains unspecified and must be set later with the `use_keyspace` method. token: if supplied, is passed to the Database instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work.
This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: A Database object, ready to be used for working with data and collections. Example: >>> my_db = admin_for_my_db.get_database() >>> my_db.list_collection_names() ['movies', 'another_collection'] Note: creating an instance of Database does not trigger actual creation of the database itself, which should exist beforehand. """ # lazy importing here to avoid circular dependency from astrapy import Database # this multiple-override implements the alias on timeout params resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( token=token, ), ) return Database( api_endpoint=self.api_endpoint, keyspace=keyspace, api_options=resulting_api_options, ) def get_async_database( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create an AsyncDatabase instance for a specific database, to be used when doing data-level work (such as creating/managing collections). Args: keyspace: an optional keyspace to set in the resulting AsyncDatabase. If not set, the keyspace remains unspecified and must be set later with the `use_keyspace` method. token: if supplied, is passed to the AsyncDatabase instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: An AsyncDatabase object, ready to be used for working with data and collections. Note: creating an instance of AsyncDatabase does not trigger actual creation of the database itself, which should exist beforehand. """ return self.get_database( token=token, keyspace=keyspace, spawn_api_options=spawn_api_options, ).to_async() def find_embedding_providers( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> FindEmbeddingProvidersResult: """ Query the API for the full information on available embedding providers. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. 
Returns: A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers Example (output abridged and indented for clarity): >>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... } """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("findEmbeddingProviders") fe_response = self._api_commander.request( payload={"findEmbeddingProviders": {}}, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), ) if "embeddingProviders" not in fe_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findEmbeddingProviders API command.", raw_response=fe_response, ) else: logger.info("finished findEmbeddingProviders") return FindEmbeddingProvidersResult._from_dict(fe_response["status"]) async def async_find_embedding_providers( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> FindEmbeddingProvidersResult: """ Query the API for the full information on available embedding providers. Async version of the method, for use in an asyncio context. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers Example (output abridged and indented for clarity): >>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... } """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("findEmbeddingProviders, async") fe_response = await self._api_commander.async_request( payload={"findEmbeddingProviders": {}}, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), ) if "embeddingProviders" not in fe_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findEmbeddingProviders API command.", raw_response=fe_response, ) else: logger.info("finished findEmbeddingProviders, async") return FindEmbeddingProvidersResult._from_dict(fe_response["status"])
Ancestors
- DatabaseAdmin
- abc.ABC
Methods
async def async_create_keyspace(self, name: str, *, replication_options: dict[str, Any] | None = None, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any) ‑> None
-
Create a keyspace in the database. Async version of the method, for use in an asyncio context.
Args
name
- the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op.
replication_options
- this dictionary can specify the options about replication of the keyspace (across database nodes). If provided, it must have a structure similar to: {"class": "SimpleStrategy", "replication_factor": 1}.
update_db_keyspace
- if True, the Database or AsyncDatabase class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns.
keyspace_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for keyspace_admin_timeout_ms.
timeout_ms
- an alias for keyspace_admin_timeout_ms.
Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored.
Example
>>> admin_for_my_db.list_keyspaces() ['default_keyspace'] >>> asyncio.run(admin_for_my_db.async_create_keyspace( ... "that_other_one" ... )) >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'that_other_one']
Expand source code
async def async_create_keyspace( self, name: str, *, replication_options: dict[str, Any] | None = None, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any, ) -> None: """ Create a keyspace in the database. Async version of the method, for use in an asyncio context. Args: name: the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op. replication_options: this dictionary can specify the options about replication of the keyspace (across database nodes). If provided, it must have a structure similar to: `{"class": "SimpleStrategy", "replication_factor": 1}`. update_db_keyspace: if True, the `Database` or `AsyncDatabase` class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns. keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace'] >>> asyncio.run(admin_for_my_db.async_create_keyspace( ... "that_other_one" ... )) >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'that_other_one'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { k: v for k, v in { "replication": replication_options, }.items() if v } payload = { "createKeyspace": { **{"name": name}, **({"options": options} if options else {}), } } logger.info("creating keyspace, async") cn_response = await self._api_commander.async_request( payload=payload, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if (cn_response.get("status") or {}).get("ok") != 1: raise UnexpectedDataAPIResponseException( text="Faulty response from createKeyspace API command.", raw_response=cn_response, ) else: logger.info("finished creating keyspace, async") if update_db_keyspace: self.spawner_database.use_keyspace(name)
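For illustration, the replication structure quoted in the Args above can be supplied explicitly (the keyspace name here is hypothetical):
>>> asyncio.run(admin_for_my_db.async_create_keyspace(
...     "replicated_keyspace",
...     replication_options={
...         "class": "SimpleStrategy",
...         "replication_factor": 1,
...     },
... ))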
async def async_drop_keyspace(self, name: str, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop (delete) a keyspace from the database. Async version of the method, for use in an asyncio context.
Args
name
- the keyspace to delete. If it does not exist in this database, an error is raised.
keyspace_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for keyspace_admin_timeout_ms.
timeout_ms
- an alias for keyspace_admin_timeout_ms.
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored.
Example
>>> admin_for_my_db.list_keyspaces() ['that_other_one', 'default_keyspace'] >>> asyncio.run(admin_for_my_db.async_drop_keyspace( ... "that_other_one" ... )) >>> admin_for_my_db.list_keyspaces() ['default_keyspace']
Expand source code
async def async_drop_keyspace( self, name: str, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop (delete) a keyspace from the database. Async version of the method, for use in an asyncio context. Args: name: the keyspace to delete. If it does not exist in this database, an error is raised. keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> admin_for_my_db.list_keyspaces() ['that_other_one', 'default_keyspace'] >>> asyncio.run(admin_for_my_db.async_drop_keyspace( ... "that_other_one" ... )) >>> admin_for_my_db.list_keyspaces() ['default_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("dropping keyspace, async") dn_response = await self._api_commander.async_request( payload={"dropKeyspace": {"name": name}}, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if (dn_response.get("status") or {}).get("ok") != 1: raise UnexpectedDataAPIResponseException( text="Faulty response from dropKeyspace API command.", raw_response=dn_response, ) else: logger.info("finished dropping keyspace, async")
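Since the three timeout parameters are aliases for the same single-request limit, any one of them can be used to cap the call; for example, with an illustrative five-second bound:
>>> asyncio.run(admin_for_my_db.async_drop_keyspace(
...     "that_other_one",
...     keyspace_admin_timeout_ms=5000,
... ))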
async def async_find_embedding_providers(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> FindEmbeddingProvidersResult
-
Query the API for the full information on available embedding providers. Async version of the method, for use in an asyncio context.
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
A FindEmbeddingProvidersResult object with the complete information returned by the API about available embedding providers.
Example (output abridged and indented for clarity)
>>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... }
Expand source code
async def async_find_embedding_providers( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> FindEmbeddingProvidersResult: """ Query the API for the full information on available embedding providers. Async version of the method, for use in an asyncio context. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers Example (output abridged and indented for clarity): >>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... } """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("findEmbeddingProviders, async") fe_response = await self._api_commander.async_request( payload={"findEmbeddingProviders": {}}, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), ) if "embeddingProviders" not in fe_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findEmbeddingProviders API command.", raw_response=fe_response, ) else: logger.info("finished findEmbeddingProviders, async") return FindEmbeddingProvidersResult._from_dict(fe_response["status"])
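Building on the result structure shown in the example, the returned providers can be inspected programmatically; a minimal sketch relying only on the attributes displayed above (output abridged):
>>> result = admin_for_my_db.find_embedding_providers()
>>> for provider_name, provider in result.embedding_providers.items():
...     print(provider_name, [model.name for model in provider.models])
openai ['text-embedding-3-small', ...]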
async def async_list_keyspaces(self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
Query the API for a list of the keyspaces in the database. Async version of the method, for use in an asyncio context.
Args
keyspace_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `keyspace_admin_timeout_ms`.
timeout_ms
- an alias for `keyspace_admin_timeout_ms`.
Returns
A list of the keyspaces, each a string, in no particular order.
Example
>>> asyncio.run(admin_for_my_db.async_list_keyspaces())
['default_keyspace', 'staging_keyspace']
Expand source code
async def async_list_keyspaces( self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ Query the API for a list of the keyspaces in the database. Async version of the method, for use in an asyncio context. Args: keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Returns: A list of the keyspaces, each a string, in no particular order. Example: >>> asyncio.run(admin_for_my_db.async_list_keyspaces()) ['default_keyspace', 'staging_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("getting list of keyspaces, async") fn_response = await self._api_commander.async_request( payload={"findKeyspaces": {}}, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if "keyspaces" not in fn_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findKeyspaces API command.", raw_response=fn_response, ) else: logger.info("finished getting list of keyspaces, async") return fn_response["status"]["keyspaces"] # type: ignore[no-any-return]
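Since each of these admin calls issues a single independent request, several of them can be awaited concurrently from an asyncio context; a sketch under the same `admin_for_my_db` assumption:

import asyncio

async def snapshot_admin_state(admin_for_my_db):
    # Two independent single-request calls, awaited concurrently.
    keyspaces, providers_result = await asyncio.gather(
        admin_for_my_db.async_list_keyspaces(),
        admin_for_my_db.async_find_embedding_providers(),
    )
    return keyspaces, providers_result

keyspaces, providers_result = asyncio.run(snapshot_admin_state(admin_for_my_db))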
def create_keyspace(self, name: str, *, replication_options: dict[str, Any] | None = None, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any) ‑> None
-
Create a keyspace in the database.
Args
name
- the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op.
replication_options
- this dictionary can specify the options about replication of the keyspace (across database nodes). If provided, it must have a structure similar to: `{"class": "SimpleStrategy", "replication_factor": 1}`.
update_db_keyspace
- if True, the `Database` or `AsyncDatabase` class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns.
keyspace_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `keyspace_admin_timeout_ms`.
timeout_ms
- an alias for `keyspace_admin_timeout_ms`.
Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored.
Example
>>> admin_for_my_db.list_keyspaces()
['default_keyspace']
>>> admin_for_my_db.create_keyspace("that_other_one")
>>> admin_for_my_db.list_keyspaces()
['default_keyspace', 'that_other_one']
Expand source code
def create_keyspace( self, name: str, *, replication_options: dict[str, Any] | None = None, update_db_keyspace: bool | None = None, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, **kwargs: Any, ) -> None: """ Create a keyspace in the database. Args: name: the keyspace name. If supplying a keyspace that exists already, the method call proceeds as usual, no errors are raised, and the whole invocation is a no-op. replication_options: this dictionary can specify the options about replication of the keyspace (across database nodes). If provided, it must have a structure similar to: `{"class": "SimpleStrategy", "replication_factor": 1}`. update_db_keyspace: if True, the `Database` or `AsyncDatabase` class that spawned this DatabaseAdmin, if any, gets updated to work on the newly-created keyspace starting when this method returns. keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Note: a timeout event is no guarantee at all that the creation request has not reached the API server and is not going to be, in fact, honored. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace'] >>> admin_for_my_db.create_keyspace("that_other_one") >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'that_other_one'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) options = { k: v for k, v in { "replication": replication_options, }.items() if v } payload = { "createKeyspace": { **{"name": name}, **({"options": options} if options else {}), } } logger.info("creating keyspace") cn_response = self._api_commander.request( payload=payload, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if (cn_response.get("status") or {}).get("ok") != 1: raise UnexpectedDataAPIResponseException( text="Faulty response from createKeyspace API command.", raw_response=cn_response, ) else: logger.info("finished creating keyspace") if update_db_keyspace: self.spawner_database.use_keyspace(name)
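A sketch of a fuller invocation (the keyspace name and timeout value are placeholders), combining explicit replication options with `update_db_keyspace` so the spawning Database object switches over as soon as the call returns:

# Create (idempotently) a keyspace with explicit replication settings; the
# spawning Database/AsyncDatabase, if any, is switched to it upon return.
admin_for_my_db.create_keyspace(
    "analytics",
    replication_options={"class": "SimpleStrategy", "replication_factor": 1},
    update_db_keyspace=True,
    keyspace_admin_timeout_ms=30000,  # per-call override of the defaults
)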
def drop_keyspace(self, name: str, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop (delete) a keyspace from the database.
Args
name
- the keyspace to delete. If it does not exist in this database, an error is raised.
keyspace_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `keyspace_admin_timeout_ms`.
timeout_ms
- an alias for `keyspace_admin_timeout_ms`.
Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored.
Example
>>> admin_for_my_db.list_keyspaces()
['default_keyspace', 'that_other_one']
>>> admin_for_my_db.drop_keyspace("that_other_one")
>>> admin_for_my_db.list_keyspaces()
['default_keyspace']
Expand source code
def drop_keyspace( self, name: str, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop (delete) a keyspace from the database. Args: name: the keyspace to delete. If it does not exist in this database, an error is raised. keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Note: a timeout event is no guarantee at all that the deletion request has not reached the API server and is not going to be, in fact, honored. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'that_other_one'] >>> admin_for_my_db.drop_keyspace("that_other_one") >>> admin_for_my_db.list_keyspaces() ['default_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("dropping keyspace") dn_response = self._api_commander.request( payload={"dropKeyspace": {"name": name}}, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if (dn_response.get("status") or {}).get("ok") != 1: raise UnexpectedDataAPIResponseException( text="Faulty response from dropKeyspace API command.", raw_response=dn_response, ) else: logger.info("finished dropping keyspace")
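Because dropping a missing keyspace raises an error, a guarded helper can be convenient; a sketch (the function name is illustrative, and the check-then-drop pair is not atomic):

def drop_keyspace_if_exists(admin_for_my_db, name: str) -> bool:
    # `drop_keyspace` raises on a missing keyspace, hence the lookup first.
    if name in admin_for_my_db.list_keyspaces():
        admin_for_my_db.drop_keyspace(name)
        return True
    return False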
def find_embedding_providers(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> FindEmbeddingProvidersResult
-
Query the API for the full information on available embedding providers.
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `database_admin_timeout_ms`.
timeout_ms
- an alias for `database_admin_timeout_ms`.
Returns

A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers.

Example (output abridged and indented for clarity):

>>> admin_for_my_db.find_embedding_providers()
FindEmbeddingProvidersResult(embedding_providers=..., openai, ...)
>>> admin_for_my_db.find_embedding_providers().embedding_providers
{
    'openai': EmbeddingProvider(
        display_name='OpenAI',
        models=[
            EmbeddingProviderModel(name='text-embedding-3-small'),
            ...
        ]
    ),
    ...
}

Expand source code
def find_embedding_providers( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> FindEmbeddingProvidersResult: """ Query the API for the full information on available embedding providers. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A `FindEmbeddingProvidersResult` object with the complete information returned by the API about available embedding providers Example (output abridged and indented for clarity): >>> admin_for_my_db.find_embedding_providers() FindEmbeddingProvidersResult(embedding_providers=..., openai, ...) >>> admin_for_my_db.find_embedding_providers().embedding_providers { 'openai': EmbeddingProvider( display_name='OpenAI', models=[ EmbeddingProviderModel(name='text-embedding-3-small'), ... ] ), ... } """ _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("findEmbeddingProviders") fe_response = self._api_commander.request( payload={"findEmbeddingProviders": {}}, timeout_context=_TimeoutContext( request_ms=_database_admin_timeout_ms, label=_da_label ), ) if "embeddingProviders" not in fe_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findEmbeddingProviders API command.", raw_response=fe_response, ) else: logger.info("finished findEmbeddingProviders") return FindEmbeddingProvidersResult._from_dict(fe_response["status"])
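As a sketch of how the returned structure can be inspected (the helper name and the example provider/model pair are illustrative):

def model_available(admin_for_my_db, provider: str, model_name: str) -> bool:
    # Look the model up in the API-reported catalog of embedding providers.
    providers = admin_for_my_db.find_embedding_providers().embedding_providers
    return provider in providers and any(
        model.name == model_name for model in providers[provider].models
    )

# e.g.: model_available(admin_for_my_db, "openai", "text-embedding-3-small")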
def get_async_database(self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> AsyncDatabase
-
Create an AsyncDatabase instance for a specific database, to be used when doing data-level work (such as creating/managing collections).
Args
keyspace
- an optional keyspace to set in the resulting AsyncDatabase. If not set, the keyspace remains unspecified and must be set later with the `use_keyspace` method.
token
- if supplied, is passed to the AsyncDatabase instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `TokenProvider`.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the resulting database, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
An AsyncDatabase object, ready to be used for working with data and collections.
Note
creating an instance of AsyncDatabase does not trigger actual creation of the database itself, which should exist beforehand.
Expand source code
def get_async_database( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create an AsyncDatabase instance for a specific database, to be used when doing data-level work (such as creating/managing collections). Args: keyspace: an optional keyspace to set in the resulting AsyncDatabase. If not set, the keyspace remains unspecified and must be set later with the `use_keyspace` method. token: if supplied, is passed to the AsyncDatabase instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: An AsyncDatabase object, ready to be used for working with data and collections. Note: creating an instance of AsyncDatabase does not trigger actual creation of the database itself, which should exist beforehand. """ return self.get_database( token=token, keyspace=keyspace, spawn_api_options=spawn_api_options, ).to_async()
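A short sketch (token and keyspace are placeholders) spawning a least-privilege AsyncDatabase and using it from a synchronous entry point:

import asyncio

# Use a narrower, data-only token for the spawned AsyncDatabase.
async_db = admin_for_my_db.get_async_database(
    keyspace="default_keyspace",
    token="AstraCS:data-only...",
)
collection_names = asyncio.run(async_db.list_collection_names())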
def get_database(self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Database
-
Create a Database instance for a specific database, to be used when doing data-level work (such as creating/managing collections).
Args
keyspace
- an optional keyspace to set in the resulting Database. If not set, the keyspace remains unspecified and must be set later with the `use_keyspace` method.
token
- if supplied, is passed to the Database instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `TokenProvider`.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the resulting database, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
A Database object, ready to be used for working with data and collections.
Example
>>> my_db = admin_for_my_db.get_database()
>>> my_db.list_collection_names()
['movies', 'another_collection']
Note
creating an instance of Database does not trigger actual creation of the database itself, which should exist beforehand.
Expand source code
def get_database( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a Database instance for a specific database, to be used when doing data-level work (such as creating/managing collections). Args: keyspace: an optional keyspace to set in the resulting Database. If not set, the keyspace remains unspecified and must be set later with the `use_keyspace` method. token: if supplied, is passed to the Database instead of the one set for this object. Useful if one wants to work in a least-privilege manner, limiting the permissions for non-admin work. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the database admin. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: A Database object, ready to be used for working with data and collections. Example: >>> my_db = admin_for_my_db.get_database() >>> my_db.list_collection_names() ['movies', 'another_collection'] Note: creating an instance of Database does not trigger actual creation of the database itself, which should exist beforehand. """ # lazy importing here to avoid circular dependency from astrapy import Database # this multiple-override implements the alias on timeout params resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( token=token, ), ) return Database( api_endpoint=self.api_endpoint, keyspace=keyspace, api_options=resulting_api_options, )
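A sketch of the deferred-keyspace pattern mentioned above (the keyspace name is a placeholder): spawn the Database first, then bind a keyspace before issuing data operations:

# Spawn the Database; if no keyspace is set yet, choose one explicitly.
my_db = admin_for_my_db.get_database()
if my_db.keyspace is None:
    my_db.use_keyspace("default_keyspace")
collections = my_db.list_collection_names()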
def list_keyspaces(self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
Query the API for a list of the keyspaces in the database.
Args
keyspace_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for `keyspace_admin_timeout_ms`.
timeout_ms
- an alias for `keyspace_admin_timeout_ms`.
Returns
A list of the keyspaces, each a string, in no particular order.
Example
>>> admin_for_my_db.list_keyspaces()
['default_keyspace', 'staging_keyspace']
Expand source code
def list_keyspaces( self, *, keyspace_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ Query the API for a list of the keyspaces in the database. Args: keyspace_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `keyspace_admin_timeout_ms`. timeout_ms: an alias for `keyspace_admin_timeout_ms`. Returns: A list of the keyspaces, each a string, in no particular order. Example: >>> admin_for_my_db.list_keyspaces() ['default_keyspace', 'staging_keyspace'] """ _keyspace_admin_timeout_ms, _ka_label = _select_singlereq_timeout_ka( timeout_options=self.api_options.timeout_options, keyspace_admin_timeout_ms=keyspace_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("getting list of keyspaces") fn_response = self._api_commander.request( payload={"findKeyspaces": {}}, timeout_context=_TimeoutContext( request_ms=_keyspace_admin_timeout_ms, label=_ka_label ), ) if "keyspaces" not in fn_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findKeyspaces API command.", raw_response=fn_response, ) else: logger.info("finished getting list of keyspaces") return fn_response["status"]["keyspaces"] # type: ignore[no-any-return]
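A sketch of an "ensure it exists" helper built on this method (the helper name is illustrative; note that `create_keyspace` is itself a no-op on an existing keyspace, so the lookup merely avoids a redundant write request):

def ensure_keyspace(admin_for_my_db, name: str) -> None:
    # Skip the createKeyspace request entirely when the keyspace is present.
    if name not in admin_for_my_db.list_keyspaces():
        admin_for_my_db.create_keyspace(name)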
def with_options(self, *, token: str | TokenProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> DataAPIDatabaseAdmin
-
Create a clone of this DataAPIDatabaseAdmin with some changed attributes.
Args
token
- an access token with enough permission to perform admin tasks. This can be either a literal token string or a subclass of `TokenProvider`.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new DataAPIDatabaseAdmin instance.
Example
>>> admin_for_my_other_db = admin_for_my_db.with_options(
...     token="AstraCS:xyz...",
... )
Expand source code
def with_options( self, *, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> DataAPIDatabaseAdmin: """ Create a clone of this DataAPIDatabaseAdmin with some changed attributes. Args: token: an access token with enough permission to perform admin tasks. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new DataAPIDatabaseAdmin instance. Example: >>> admin_for_my_other_db = admin_for_my_db.with_options( ... api_endpoint="http://10.1.1.5:8181", ... ) """ return self._copy( token=token, api_options=api_options, )
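A sketch of the cloning pattern (the token and keyspace name are placeholders): derive a second admin object with elevated credentials, leaving the original untouched:

# Clone with a more privileged token; `admin_for_my_db` keeps its own options.
privileged_admin = admin_for_my_db.with_options(token="AstraCS:admin...")
privileged_admin.create_keyspace("staging_keyspace")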
class Database (*, api_endpoint: str, keyspace: str | None, api_options: FullAPIOptions)
-
A Data API database. This is the object for doing database-level DML, such as creating/deleting collections, and for obtaining Collection objects themselves. This class has a synchronous interface.
This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as `get_database` of AstraDBClient.

On Astra DB, a Database comes with an "API Endpoint", which implies a Database object instance reaches a specific region (relevant point in case of multi-region databases).
A Database is also always set with a "working keyspace" on which all data operations are done (unless otherwise specified).
Args
api_endpoint
- the full "API Endpoint" string used to reach the Data API. Example: "https://<database_id>-<region>.apps.astra.datastax.com"
keyspace
- this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, on Astra DB the name "default_keyspace" is set, while on other environments the keyspace is left unspecified: in this case, most operations are unavailable until a keyspace is set (through an explicit `use_keyspace` invocation or equivalent).
api_options
- a complete specification of the API Options for this instance.
Example
>>> from astrapy import DataAPIClient
>>> my_client = DataAPIClient()
>>> my_db = my_client.get_database(
...     "https://01234567-....apps.astra.datastax.com",
...     token="AstraCS:...",
... )
Note
creating an instance of Database does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class.
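Putting the above together, a minimal end-to-end sketch (the endpoint and token are placeholders):

from astrapy import DataAPIClient

# Reach one specific database (and region) through its API Endpoint.
client = DataAPIClient("AstraCS:...")
my_db = client.get_database("https://01234567-....apps.astra.datastax.com")
print(my_db.keyspace)                  # e.g. 'default_keyspace' on Astra DB
print(my_db.list_collection_names())   # a data-level operation on that keyspace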
Expand source code
class Database: """ A Data API database. This is the object for doing database-level DML, such as creating/deleting collections, and for obtaining Collection objects themselves. This class has a synchronous interface. This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as `get_database` of AstraDBClient. On Astra DB, a Database comes with an "API Endpoint", which implies a Database object instance reaches a specific region (relevant point in case of multi-region databases). A Database is also always set with a "working keyspace" on which all data operations are done (unless otherwise specified). Args: api_endpoint: the full "API Endpoint" string used to reach the Data API. Example: "https://<database_id>-<region>.apps.astra.datastax.com" keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, on Astra DB the name "default_keyspace" is set, while on other environments the keyspace is left unspecified: in this case, most operations are unavailable until a keyspace is set (through an explicit `use_keyspace` invocation or equivalent). api_options: a complete specification of the API Options for this instance. Example: >>> from astrapy import DataAPIClient >>> my_client = astrapy.DataAPIClient() >>> my_db = my_client.get_database( ... "https://01234567-....apps.astra.datastax.com", ... token="AstraCS:...", ... ) Note: creating an instance of Database does not trigger actual creation of the database itself, which should exist beforehand. To create databases, see the AstraDBAdmin class. """ def __init__( self, *, api_endpoint: str, keyspace: str | None, api_options: FullAPIOptions, ) -> None: self.api_options = api_options self.api_endpoint = api_endpoint.strip("/") # enforce defaults if on Astra DB: self._using_keyspace: str | None if ( keyspace is None and self.api_options.environment in Environment.astra_db_values ): self._using_keyspace = DEFAULT_ASTRA_DB_KEYSPACE else: self._using_keyspace = keyspace self._commander_headers = { DEFAULT_DATA_API_AUTH_HEADER: self.api_options.token.get_token(), **self.api_options.database_additional_headers, } self._name: str | None = None self._api_commander = self._get_api_commander(keyspace=self.keyspace) def __getattr__(self, collection_name: str) -> Collection[DefaultDocumentType]: return self.get_collection(name=collection_name) def __getitem__(self, collection_name: str) -> Collection[DefaultDocumentType]: return self.get_collection(name=collection_name) def __repr__(self) -> str: ep_desc = f'api_endpoint="{self.api_endpoint}"' keyspace_desc: str | None if self._using_keyspace is None: keyspace_desc = "keyspace not set" else: keyspace_desc = f'keyspace="{self._using_keyspace}"' api_options_desc = f"api_options={self.api_options}" parts = [ pt for pt in [ep_desc, keyspace_desc, api_options_desc] if pt is not None ] return f"{self.__class__.__name__}({', '.join(parts)})" def __eq__(self, other: Any) -> bool: if isinstance(other, Database): return all( [ self.api_endpoint == other.api_endpoint, self.keyspace == other.keyspace, self.api_options == other.api_options, ] ) else: return False def _get_api_commander(self, keyspace: str | None) -> APICommander | None: """ Instantiate a new APICommander based on the properties of this class and a provided keyspace. If keyspace is None, return None (signaling a "keyspace not set"). 
""" if keyspace is None: return None else: base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.data_api_url_options.api_path, self.api_options.data_api_url_options.api_version, keyspace, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" api_commander = APICommander( api_endpoint=self.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, ) return api_commander def _get_driver_commander(self, keyspace: str | None) -> APICommander: """ Building on _get_api_commander, fall back to class keyspace in creating/returning a commander, and in any case raise an error if not set. """ driver_commander: APICommander | None if keyspace: driver_commander = self._get_api_commander(keyspace=keyspace) else: driver_commander = self._api_commander if driver_commander is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return driver_commander def _copy( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Database: arg_api_options = APIOptions( token=token, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Database( api_endpoint=self.api_endpoint, keyspace=keyspace or self.keyspace, api_options=final_api_options, ) def with_options( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a clone of this database with some changed attributes. Args: keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set. token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new `Database` instance. Example: >>> my_db_2 = my_db.with_options( ... keyspace="the_other_keyspace", ... token="AstraCS:xyz...", ... ) """ return self._copy( keyspace=keyspace, token=token, api_options=api_options, ) def to_async( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create an AsyncDatabase from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this database in the copy. Args: keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set. token: an Access Token to the database. Example: "AstraCS:xyz..." This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. 
Returns: the new copy, an `AsyncDatabase` instance. Example: >>> async_database = my_db.to_async() >>> asyncio.run(async_database.list_collection_names()) """ arg_api_options = APIOptions( token=token, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AsyncDatabase( api_endpoint=self.api_endpoint, keyspace=keyspace or self.keyspace, api_options=final_api_options, ) def use_keyspace(self, keyspace: str) -> None: """ Switch to a new working keyspace for this database. This method changes (mutates) the Database instance. Note that this method does not create the keyspace, which should exist already (created for instance with a `DatabaseAdmin.create_keyspace` call). Args: keyspace: the new keyspace to use as the database working keyspace. Returns: None. Example: >>> my_db.list_collection_names() ['coll_1', 'coll_2'] >>> my_db.use_keyspace("an_empty_keyspace") >>> my_db.list_collection_names() [] """ logger.info(f"switching to keyspace '{keyspace}'") self._using_keyspace = keyspace self._api_commander = self._get_api_commander(keyspace=self.keyspace) def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBDatabaseInfo: """ Additional information on the database as an AstraDBDatabaseInfo instance. Some of the returned properties are dynamic throughout the lifetime of the database (such as raw_info["keyspaces"]). For this reason, each invocation of this method triggers a new request to the DevOps API. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Example: >>> my_db.info().region 'eu-west-1' >>> my_db.info().raw_info['datacenters'][0]['dateCreated'] '2023-01-30T12:34:56Z' Note: see the AstraDBDatabaseInfo documentation for a caveat about the difference between the `region` and the `raw["region"]` attributes. """ if self.api_options.environment not in Environment.astra_db_values: raise InvalidEnvironmentException( "Environments outside of Astra DB are not supported." ) _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("getting database info") database_info = fetch_database_info( self.api_endpoint, keyspace=self.keyspace, request_timeout_ms=_database_admin_timeout_ms, api_options=self.api_options, ) if database_info is not None: logger.info("finished getting database info") return database_info else: raise DevOpsAPIException("Failure while fetching database info.") @property def id(self) -> str: """ The ID of this database. Example: >>> my_db.id '01234567-89ab-cdef-0123-456789abcdef' """ parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is not None: return parsed_api_endpoint.database_id else: raise DevOpsAPIException( "Database is not in a supported environment for this operation." ) @property def region(self) -> str: """ The region where this database is located. 
The region is still well defined in case of multi-region databases, since a Database instance connects to exactly one of the regions (as specified by the API Endpoint). Example: >>> my_db.region 'us-west-2' """ parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is not None: return parsed_api_endpoint.region else: raise DevOpsAPIException( "Database is not in a supported environment for this operation." ) def name(self) -> str: """ The name of this database. Note that this bears no unicity guarantees. Calling this method the first time involves a request to the DevOps API (the resulting database name is then cached). See the `info()` method for more details. Example: >>> my_db.name() 'the_application_database' """ if self._name is None: self._name = self.info().name return self._name @property def keyspace(self) -> str | None: """ The keyspace this database uses as target for all commands when no method-call-specific keyspace is specified. Returns: the working keyspace (a string), or None if not set. Example: >>> my_db.keyspace 'the_keyspace' """ return self._using_keyspace @overload def get_collection( self, name: str, *, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DefaultDocumentType]: ... @overload def get_collection( self, name: str, *, document_type: type[DOC], keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: ... def get_collection( self, name: str, *, document_type: type[Any] = DefaultDocumentType, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: """ Spawn a `Collection` object instance representing a collection on this database. Creating a `Collection` instance does not have any effect on the actual state of the database: in other words, for the created `Collection` instance to be used meaningfully, the collection must exist already (for instance, it should have been created previously by calling the `create_collection` method). Args: name: the name of the collection. document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Collection is implicitly a `Collection[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace containing the collection. If no keyspace is specified, the general setting for this database is used. embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. 
concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: a `Collection` instance, representing the desired collection (but without any form of validation). Example: >>> my_col = my_db.get_collection("my_collection") >>> my_col.count_documents({}, upper_bound=100) 41 Note: The attribute and indexing syntax forms achieve the same effect as this method. In other words, the following are equivalent: my_db.get_collection("coll_name") my_db.coll_name my_db["coll_name"] """ # lazy importing here against circular-import error from astrapy.collection import Collection resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( embedding_api_key=embedding_api_key, ), ) _keyspace = keyspace or self.keyspace if _keyspace is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return Collection( database=self, name=name, keyspace=_keyspace, api_options=resulting_api_options, ) @overload def create_collection( self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DefaultDocumentType]: ... @overload def create_collection( self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, document_type: type[DOC], keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: ... def create_collection( self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, document_type: type[Any] = DefaultDocumentType, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: """ Creates a collection on the database and return the Collection instance that represents it. This is a blocking operation: the method returns when the collection is ready to be used. As opposed to the `get_collection` instance, this method triggers causes the collection to be actually created on DB. Args: name: the name of the collection. definition: a complete collection definition for the table. This can be an instance of `CollectionDefinition` or an equivalent (nested) dictionary, in which case it will be parsed into a `CollectionDefinition`. See the `astrapy.info.CollectionDefinition` class and the `Collection` class for more details and ways to construct this object. document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Collection is implicitly a `Collection[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace where the collection is to be created. If not specified, the general setting for this database is used. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. embedding_api_key: optional API key(s) for interacting with the collection. 
If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: a (synchronous) `Collection` instance, representing the newly-created collection. Example: >>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = database.create_collection( ... "my_events", ... definition=collection_definition, ... ) >>> >>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ... ) >>> my_collection_1 = database.create_collection( ... "my_events", ... definition=collection_definition_1, ... ) >>> >>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = database.create_collection( ... "my_events", ... definition=collection_definition_2, ... 
) """ cc_definition: dict[str, Any] = CollectionDefinition.coerce( definition or {} ).as_dict() # this method has custom code to pick its timeout _collection_admin_timeout_ms: int _ca_label: str if collection_admin_timeout_ms is not None: _collection_admin_timeout_ms = collection_admin_timeout_ms _ca_label = "collection_admin_timeout_ms" else: _collection_admin_timeout_ms = ( self.api_options.timeout_options.collection_admin_timeout_ms ) _ca_label = "collection_admin_timeout_ms" driver_commander = self._get_driver_commander(keyspace=keyspace) cc_payload = { "createCollection": { k: v for k, v in { "name": name, "options": cc_definition, }.items() if v is not None if v != {} } } logger.info(f"createCollection('{name}')") cc_response = driver_commander.request( payload=cc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if cc_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from createCollection API command.", raw_response=cc_response, ) logger.info(f"finished createCollection('{name}')") return self.get_collection( name, document_type=document_type, keyspace=keyspace, embedding_api_key=embedding_api_key, spawn_api_options=spawn_api_options, ) def drop_collection( self, name: str, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop a collection from the database, along with all documents therein. Args: name: the name of the collection to drop. keyspace: the keyspace where the collection resides. If not specified, the database working keyspace is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Example: >>> my_db.list_collection_names() ['a_collection', 'my_v_col', 'another_col'] >>> my_db.drop_collection("my_v_col") >>> my_db.list_collection_names() ['a_collection', 'another_col'] """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace = keyspace or self.keyspace driver_commander = self._get_driver_commander(keyspace=_keyspace) dc_payload = {"deleteCollection": {"name": name}} logger.info(f"deleteCollection('{name}')") dc_response = driver_commander.request( payload=dc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if dc_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteCollection API command.", raw_response=dc_response, ) logger.info(f"finished deleteCollection('{name}')") return dc_response.get("status", {}) # type: ignore[no-any-return] def list_collections( self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[CollectionDescriptor]: """ List all collections in a given keyspace for this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. 
collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a list of CollectionDescriptor instances one for each collection. Example: >>> coll_list = my_db.list_collections() >>> coll_list [CollectionDescriptor(name='my_v_col', options=CollectionDefinition())] >>> for coll_dict in my_db.list_collections(): ... print(coll_dict) ... CollectionDescriptor(name='my_v_col', options=CollectionDefinition()) """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return self._list_collections_ctx( keyspace=keyspace, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) def _list_collections_ctx( self, *, keyspace: str | None, timeout_context: _TimeoutContext, ) -> list[CollectionDescriptor]: driver_commander = self._get_driver_commander(keyspace=keyspace) gc_payload = {"findCollections": {"options": {"explain": True}}} logger.info("findCollections") gc_response = driver_commander.request( payload=gc_payload, timeout_context=timeout_context, ) if "collections" not in gc_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findCollections API command.", raw_response=gc_response, ) else: # we know this is a list of dicts, to marshal into "descriptors" logger.info("finished findCollections") return [ CollectionDescriptor._from_dict(col_dict) for col_dict in gc_response["status"]["collections"] ] def list_collection_names( self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all collections in a given keyspace of this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a list of the collection names as strings, in no particular order. 
Example: >>> my_db.list_collection_names() ['a_collection', 'another_col'] """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) gc_payload: dict[str, Any] = {"findCollections": {}} logger.info("findCollections") gc_response = driver_commander.request( payload=gc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if "collections" not in gc_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findCollections API command.", raw_response=gc_response, ) else: logger.info("finished findCollections") return gc_response["status"]["collections"] # type: ignore[no-any-return] @overload def get_table( self, name: str, *, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Table[DefaultRowType]: ... @overload def get_table( self, name: str, *, row_type: type[ROW], keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: ... def get_table( self, name: str, *, row_type: type[Any] = DefaultRowType, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: """ Spawn a `Table` object instance representing a table on this database. Creating a `Table` instance does not have any effect on the actual state of the database: in other words, for the created `Table` instance to be used meaningfully, the table must exist already (for instance, it should have been created previously by calling the `create_table` method). Args: name: the name of the table. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Table is implicitly a `Table[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace containing the table. If no keyspace is specified, the general setting for this database is used. embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: a `Table` instance, representing the desired table (but without any form of validation). 
Example: >>> # Get a Table object (and read a property of it as an example): >>> my_table = database.get_table("games") >>> my_table.full_name 'default_keyspace.games' >>> >>> # Get a Table object in a specific keyspace, >>> # and set an embedding API key to it: >>> my_other_table = database.get_table( ... "tournaments", ... keyspace="the_other_keyspace", ... embedding_api_key="secret-012abc...", ... ) >>> >>> from astrapy import Table >>> MyCustomDictType = dict[str, int] >>> >>> # Get a Table object typed with a specific type for its rows: >>> my_typed_table: Table[MyCustomDictType] = database.get_table( ... "games", ... row_type=MyCustomDictType, ... ) """ # lazy importing here against circular-import error from astrapy.table import Table resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( embedding_api_key=embedding_api_key, ), ) _keyspace = keyspace or self.keyspace if _keyspace is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return Table[ROW]( database=self, name=name, keyspace=_keyspace, api_options=resulting_api_options, ) @overload def create_table( self, name: str, *, definition: CreateTableDefinition | dict[str, Any], keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Table[DefaultRowType]: ... @overload def create_table( self, name: str, *, definition: CreateTableDefinition | dict[str, Any], row_type: type[ROW], keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: ... def create_table( self, name: str, *, definition: CreateTableDefinition | dict[str, Any], row_type: type[Any] = DefaultRowType, keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: """ Creates a table on the database and return the Table instance that represents it. This is a blocking operation: the method returns when the table is ready to be used. As opposed to the `get_table` method call, this method causes the table to be actually created on DB. Args: name: the name of the table. definition: a complete table definition for the table. This can be an instance of `CreateTableDefinition` or an equivalent (nested) dictionary, in which case it will be parsed into a `CreateTableDefinition`. See the `astrapy.info.CreateTableDefinition` class and the `Table` class for more details and ways to construct this object. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Table is implicitly a `Table[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace where the table is to be created. If not specified, the general setting for this database is used. 
if_not_exists: if set to True, the command will succeed even if a table with the specified name already exists (in which case no actual table creation takes place on the database). Defaults to False, i.e. an error is raised by the API in case of table-name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: a (synchronous) `Table` instance, representing the newly-created table. Example: >>> # Create a table using the fluent syntax for definition >>> from astrapy.constants import SortMode >>> from astrapy.info import ( ... CreateTableDefinition, ... ColumnType, ... ) >>> table_definition = ( ... CreateTableDefinition.builder() ... .add_column("match_id", ColumnType.TEXT) ... .add_column("round", ColumnType.INT) ... .add_vector_column("m_vector", dimension=3) ... .add_column("score", ColumnType.INT) ... .add_column("when", ColumnType.TIMESTAMP) ... .add_column("winner", ColumnType.TEXT) ... .add_set_column("fighters", ColumnType.UUID) ... .add_partition_by(["match_id"]) ... .add_partition_sort({"round": SortMode.ASCENDING}) ... .build() ... ) >>> my_table = database.create_table( ... "games", ... definition=table_definition, ... ) >>> >>> # Create a table with the definition as object >>> # (and do not raise an error if the table exists already) >>> from astrapy.info import ( ... CreateTableDefinition, ... TablePrimaryKeyDescriptor, ... TableScalarColumnTypeDescriptor, ... TableValuedColumnType, ... TableValuedColumnTypeDescriptor, ... TableVectorColumnTypeDescriptor, ... ) >>> table_definition_1 = CreateTableDefinition( ... columns={ ... "match_id": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "round": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "m_vector": TableVectorColumnTypeDescriptor( ... column_type="vector", dimension=3 ... ), ... "score": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "when": TableScalarColumnTypeDescriptor( ... ColumnType.TIMESTAMP, ... ), ... "winner": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "fighters": TableValuedColumnTypeDescriptor( ... column_type=TableValuedColumnType.SET, ... value_type=ColumnType.UUID, ... ), ... }, ... primary_key=TablePrimaryKeyDescriptor( ... partition_by=["match_id"], ... partition_sort={"round": SortMode.ASCENDING}, ... ), ... ) >>> my_table_1 = database.create_table( ... 
"games", ... definition=table_definition_1, ... if_not_exists=True, ... ) >>> >>> # Create a table with the definition as plain dictionary >>> # (and do not raise an error if the table exists already) >>> table_definition_2 = { ... "columns": { ... "match_id": {"type": "text"}, ... "round": {"type": "int"}, ... "m_vector": {"type": "vector", "dimension": 3}, ... "score": {"type": "int"}, ... "when": {"type": "timestamp"}, ... "winner": {"type": "text"}, ... "fighters": {"type": "set", "valueType": "uuid"}, ... }, ... "primaryKey": { ... "partitionBy": ["match_id"], ... "partitionSort": {"round": 1}, ... }, ... } >>> my_table_2 = database.create_table( ... "games", ... definition=table_definition_2, ... if_not_exists=True, ... ) """ ct_options: dict[str, bool] if if_not_exists is not None: ct_options = {"ifNotExists": if_not_exists} else: ct_options = {} ct_definition: dict[str, Any] = CreateTableDefinition.coerce( definition ).as_dict() _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) ct_payload = { "createTable": { k: v for k, v in { "name": name, "definition": ct_definition, "options": ct_options, }.items() if v is not None if v != {} } } logger.info(f"createTable('{name}')") ct_response = driver_commander.request( payload=ct_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if ct_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from createTable API command.", raw_response=ct_response, ) logger.info(f"finished createTable('{name}')") return self.get_table( name, row_type=row_type, keyspace=keyspace, embedding_api_key=embedding_api_key, spawn_api_options=spawn_api_options, ) def drop_table_index( self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drops (deletes) an index (of any kind) from the table it is associated to. This is a blocking operation: the method returns once the index is deleted. Note: Although associated to a table, index names are unique across a keyspace. For this reason, no table name is required in this call. Args: name: the name of the index. keyspace: the keyspace to which the index belongs. If not specified, the general setting for this database is used. if_exists: if passed as True, trying to drop a non-existing index will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. 
Example: >>> # Drop an index from the keyspace: >>> database.drop_table_index("score_index") >>> # Drop an index, unless it does not exist already: >>> database.drop_table_index("score_index", if_exists=True) """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) di_options: dict[str, bool] if if_exists is not None: di_options = {"ifExists": if_exists} else: di_options = {} di_payload = { "dropIndex": { k: v for k, v in { "name": name, "options": di_options, }.items() if v is not None if v != {} } } driver_commander = self._get_driver_commander(keyspace=keyspace) logger.info(f"dropIndex('{name}')") di_response = driver_commander.request( payload=di_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if di_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from dropIndex API command.", raw_response=di_response, ) logger.info(f"finished dropIndex('{name}')") def drop_table( self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop a table from the database, along with all rows therein and related indexes. Args: name: the name of the table to drop. keyspace: the keyspace where the table resides. If not specified, the database working keyspace is assumed. if_exists: if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. 
Example: >>> database.list_table_names() ['fighters', 'games'] >>> database.drop_table("fighters") >>> database.list_table_names() ['games'] >>> # not erroring thanks to if_exists: >>> database.drop_table("fighters", if_exists=True) """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace = keyspace or self.keyspace dt_options: dict[str, bool] if if_exists is not None: dt_options = {"ifExists": if_exists} else: dt_options = {} driver_commander = self._get_driver_commander(keyspace=_keyspace) dt_payload = { "dropTable": { k: v for k, v in { "name": name, "options": dt_options, }.items() if v is not None if v != {} } } logger.info(f"dropTable('{name}')") dt_response = driver_commander.request( payload=dt_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if dt_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from dropTable API command.", raw_response=dt_response, ) logger.info(f"finished dropTable('{name}')") return dt_response.get("status", {}) # type: ignore[no-any-return] def list_tables( self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[ListTableDescriptor]: """ List all tables in a given keyspace for this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of ListTableDescriptor instances, one for each table. Example: >>> tables = my_database.list_tables() >>> tables [ListTableDescriptor(name='fighters', definition=ListTableDefinition(... >>> tables[1].name 'games' >>> tables[1].definition.columns {'match_id': TableScalarColumnTypeDescriptor(ColumnType.TEXT),...
>>> tables[1].definition.columns['score'] TableScalarColumnTypeDescriptor(ColumnType.INT) >>> tables[1].definition.primary_key.partition_by ['match_id'] >>> tables[1].definition.primary_key.partition_sort {'round': 1} """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return self._list_tables_ctx( keyspace=keyspace, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) def _list_tables_ctx( self, *, keyspace: str | None, timeout_context: _TimeoutContext, ) -> list[ListTableDescriptor]: driver_commander = self._get_driver_commander(keyspace=keyspace) lt_payload = {"listTables": {"options": {"explain": True}}} logger.info("listTables") lt_response = driver_commander.request( payload=lt_payload, timeout_context=timeout_context, ) if "tables" not in lt_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listTables API command.", raw_response=lt_response, ) else: # we know this is a list of dicts, to marshal into "descriptors" logger.info("finished listTables") return [ ListTableDescriptor.coerce(tab_dict) for tab_dict in lt_response["status"]["tables"] ] def list_table_names( self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all tables in a given keyspace of this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of the table names as strings, in no particular order. Example: >>> database.list_table_names() ['fighters', 'games'] """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) lt_payload: dict[str, Any] = {"listTables": {}} logger.info("listTables") lt_response = driver_commander.request( payload=lt_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "tables" not in lt_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listTables API command.", raw_response=lt_response, ) else: logger.info("finished listTables") return lt_response["status"]["tables"] # type: ignore[no-any-return] def command( self, body: dict[str, Any], *, keyspace: str | None | UnsetType = _UNSET, collection_or_table_name: str | None = None, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this database with an arbitrary, caller-provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. keyspace: the keyspace to use, if any. 
If a keyspace is employed, it is used to construct the full request URL. To run a command targeting no specific keyspace (rather, the database as a whole), pass an explicit `None`: the request URL will lack the suffix "/<keyspace>" component. If unspecified, the working keyspace of this database is used. If another keyspace is passed, it will be used instead of the database's working one. collection_or_table_name: if provided, the name is appended at the end of the endpoint. In this way, this method allows collection- and table-level arbitrary POST requests as well. This parameter cannot be used if `keyspace=None` is explicitly provided. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> my_db.command({"findCollections": {}}) {'status': {'collections': ['my_coll']}} >>> my_db.command({"countDocuments": {}}, collection_or_table_name="my_coll") {'status': {'count': 123}} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace: str | None if keyspace is None: if collection_or_table_name is not None: raise ValueError( "Cannot pass collection_or_table_name to database " "`command` on a no-keyspace command" ) _keyspace = None else: if isinstance(keyspace, UnsetType): _keyspace = self.keyspace else: _keyspace = keyspace # build the ad-hoc-commander path with _keyspace and the coll.or.table base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.data_api_url_options.api_path, self.api_options.data_api_url_options.api_version, _keyspace, collection_or_table_name, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" command_commander = APICommander( api_endpoint=self.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, ) _cmd_desc = ",".join(sorted(body.keys())) logger.info(f"command={_cmd_desc} on {self.__class__.__name__}") req_response = command_commander.request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"command={_cmd_desc} on {self.__class__.__name__}") return req_response def get_database_admin( self, *, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> DatabaseAdmin: """ Return a DatabaseAdmin object corresponding to this database, for use in admin tasks such as managing keyspaces. This method, depending on the environment where the database resides, returns an appropriate subclass of DatabaseAdmin. Args: token: an access token with enough permission on the database to perform the desired tasks. If omitted (as it can generally be done), the token of this Database is used. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. 
spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: A DatabaseAdmin instance targeting this database. More precisely, for Astra DB an instance of `AstraDBDatabaseAdmin` is returned; for other environments, an instance of `DataAPIDatabaseAdmin` is returned. Example: >>> my_db_admin = my_db.get_database_admin() >>> if "new_keyspace" not in my_db_admin.list_keyspaces(): ... my_db_admin.create_keyspace("new_keyspace") >>> my_db_admin.list_keyspaces() ['default_keyspace', 'new_keyspace'] """ # lazy importing here to avoid circular dependency from astrapy.admin.admin import AstraDBDatabaseAdmin, DataAPIDatabaseAdmin arg_api_options = APIOptions( token=token, ) api_options = self.api_options.with_override(spawn_api_options).with_override( arg_api_options ) if api_options.environment in Environment.astra_db_values: return AstraDBDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=api_options, spawner_database=self, ) else: return DataAPIDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=api_options, spawner_database=self, )
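Taken together, the methods above cover the full lifecycle of a table. A minimal, hedged sketch (names are illustrative; table_definition is the object built in the create_table examples):

# create idempotently, inspect, then tear down
table = database.create_table(
    "games", definition=table_definition, if_not_exists=True
)
print(database.list_table_names())  # e.g. ['games', ...]
database.drop_table_index("score_index", if_exists=True)
database.drop_table("games", if_exists=True)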
Instance variables
var id : str
-
The ID of this database.
Example
>>> my_db.id '01234567-89ab-cdef-0123-456789abcdef'
Expand source code
@property def id(self) -> str: """ The ID of this database. Example: >>> my_db.id '01234567-89ab-cdef-0123-456789abcdef' """ parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is not None: return parsed_api_endpoint.database_id else: raise DevOpsAPIException( "Database is not in a supported environment for this operation." )
var keyspace : str | None
-
The keyspace this database uses as target for all commands when no method-call-specific keyspace is specified.
Returns
the working keyspace (a string), or None if not set.
Example
>>> my_db.keyspace 'the_keyspace'
Expand source code
@property def keyspace(self) -> str | None: """ The keyspace this database uses as target for all commands when no method-call-specific keyspace is specified. Returns: the working keyspace (a string), or None if not set. Example: >>> my_db.keyspace 'the_keyspace' """ return self._using_keyspace
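When no working keyspace is set, this property returns None and keyspace-dependent methods (such as get_table) raise a ValueError pointing at use_keyspace. A hedged sketch of guarding against that case:

# ensure a working keyspace before spawning keyspace-bound objects
if my_db.keyspace is None:
    my_db.use_keyspace("default_keyspace")
my_table = my_db.get_table("games")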
var region : str
-
The region where this database is located.
The region is still well defined in case of multi-region databases, since a Database instance connects to exactly one of the regions (as specified by the API Endpoint).
Example
>>> my_db.region 'us-west-2'
Expand source code
@property def region(self) -> str: """ The region where this database is located. The region is still well defined in case of multi-region databases, since a Database instance connects to exactly one of the regions (as specified by the API Endpoint). Example: >>> my_db.region 'us-west-2' """ parsed_api_endpoint = parse_api_endpoint(self.api_endpoint) if parsed_api_endpoint is not None: return parsed_api_endpoint.region else: raise DevOpsAPIException( "Database is not in a supported environment for this operation." )
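Both id and region are obtained by parsing the API endpoint. For reference, a hedged illustration of the standard Astra DB endpoint layout these properties rely on (the URL below is made up):

# layout: https://<database_id>-<region>.apps.astra.datastax.com
endpoint = (
    "https://01234567-89ab-cdef-0123-456789abcdef-us-west-2"
    ".apps.astra.datastax.com"
)
# a Database connected to this endpoint would report:
#   my_db.id     == '01234567-89ab-cdef-0123-456789abcdef'
#   my_db.region == 'us-west-2'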
Methods
def command(self, body: dict[str, Any], *, keyspace: str | None | UnsetType = (unset), collection_or_table_name: str | None = None, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Send a POST request to the Data API for this database with an arbitrary, caller-provided payload.
Args
body
- a JSON-serializable dictionary, the payload of the request.
keyspace
- the keyspace to use, if any. If a keyspace is employed,
it is used to construct the full request URL. To run a command
targeting no specific keyspace (rather, the database as a whole),
pass an explicit None: the request URL will lack the suffix "/<keyspace>" component. If unspecified, the working keyspace of this database is used. If another keyspace is passed, it will be used instead of the database's working one.
collection_or_table_name
- if provided, the name is appended at the end of the endpoint. In this way, this method allows collection- and table-level arbitrary POST requests as well. This parameter cannot be used if keyspace=None is explicitly provided.
raise_api_errors
- if True, responses with a nonempty 'errors' field result in an astrapy exception being raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a dictionary with the response of the HTTP request.
Example
>>> my_db.command({"findCollections": {}}) {'status': {'collections': ['my_coll']}} >>> my_db.command({"countDocuments": {}}, collection_or_table_name="my_coll") {'status': {'count': 123}}
Expand source code
def command( self, body: dict[str, Any], *, keyspace: str | None | UnsetType = _UNSET, collection_or_table_name: str | None = None, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this database with an arbitrary, caller-provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. keyspace: the keyspace to use, if any. If a keyspace is employed, it is used to construct the full request URL. To run a command targeting no specific keyspace (rather, the database as a whole), pass an explicit `None`: the request URL will lack the suffix "/<keyspace>" component. If unspecified, the working keyspace of this database is used. If another keyspace is passed, it will be used instead of the database's working one. collection_or_table_name: if provided, the name is appended at the end of the endpoint. In this way, this method allows collection- and table-level arbitrary POST requests as well. This parameter cannot be used if `keyspace=None` is explicitly provided. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> my_db.command({"findCollections": {}}) {'status': {'collections': ['my_coll']}} >>> my_db.command({"countDocuments": {}}, collection_or_table_name="my_coll") {'status': {'count': 123}} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace: str | None if keyspace is None: if collection_or_table_name is not None: raise ValueError( "Cannot pass collection_or_table_name to database " "`command` on a no-keyspace command" ) _keyspace = None else: if isinstance(keyspace, UnsetType): _keyspace = self.keyspace else: _keyspace = keyspace # build the ad-hoc-commander path with _keyspace and the coll.or.table base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self.api_options.data_api_url_options.api_path, self.api_options.data_api_url_options.api_version, _keyspace, collection_or_table_name, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" command_commander = APICommander( api_endpoint=self.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, ) _cmd_desc = ",".join(sorted(body.keys())) logger.info(f"command={_cmd_desc} on {self.__class__.__name__}") req_response = command_commander.request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"command={_cmd_desc} on {self.__class__.__name__}") return req_response
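The request path assembled by command can be previewed with a standalone sketch mirroring the base_path_components logic in the source above; the "/api/json" and "v1" values are assumptions for illustration only:

def build_base_path(api_path, api_version, keyspace, name):
    # same filtering as in `command`: drop None entries, strip slashes,
    # discard empty components, then re-join
    components = [
        comp
        for comp in (
            ncomp.strip("/")
            for ncomp in (api_path, api_version, keyspace, name)
            if ncomp is not None
        )
        if comp != ""
    ]
    return "/" + "/".join(components)

build_base_path("/api/json", "v1", "default_keyspace", "my_coll")
# -> '/api/json/v1/default_keyspace/my_coll'
build_base_path("/api/json", "v1", None, None)  # keyspace=None case
# -> '/api/json/v1'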
def create_collection(self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, document_type: type[Any] = dict[str, typing.Any], keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Collection[DOC]
-
Creates a collection on the database and returns the Collection instance that represents it.
This is a blocking operation: the method returns when the collection is ready to be used. As opposed to the get_collection method call, this method causes the collection to be actually created on the database.
Args
name
- the name of the collection.
definition
- a complete collection definition for the collection. This can be an instance of CollectionDefinition or an equivalent (nested) dictionary, in which case it will be parsed into a CollectionDefinition. See the CollectionDefinition class and the Collection class for more details and ways to construct this object.
document_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting Collection is implicitly a Collection[dict[str, Any]]. If provided, it must match the type hint specified in the assignment. See the examples below.
keyspace
- the keyspace where the collection is to be created. If not specified, the general setting for this database is used.
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply.
embedding_api_key
- optional API key(s) for interacting with the collection.
If an embedding service is configured, and this parameter is not None,
each Data API call will include the necessary embedding-related headers
as specified by this parameter. If a string is passed, it translates
into the one "embedding api key" header
(i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
a (synchronous) Collection instance, representing the newly-created collection.
Example
>>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = database.create_collection( ... "my_events", ... definition=collection_definition, ... )
>>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ... ) >>> my_collection_1 = database.create_collection( ... "my_events", ... definition=collection_definition_1, ... ) >>>
>>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = database.create_collection( ... "my_events", ... definition=collection_definition_2, ... )
Expand source code
def create_collection( self, name: str, *, definition: CollectionDefinition | dict[str, Any] | None = None, document_type: type[Any] = DefaultDocumentType, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: """ Creates a collection on the database and return the Collection instance that represents it. This is a blocking operation: the method returns when the collection is ready to be used. As opposed to the `get_collection` instance, this method triggers causes the collection to be actually created on DB. Args: name: the name of the collection. definition: a complete collection definition for the table. This can be an instance of `CollectionDefinition` or an equivalent (nested) dictionary, in which case it will be parsed into a `CollectionDefinition`. See the `astrapy.info.CollectionDefinition` class and the `Collection` class for more details and ways to construct this object. document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Collection is implicitly a `Collection[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace where the collection is to be created. If not specified, the general setting for this database is used. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: a (synchronous) `Collection` instance, representing the newly-created collection. Example: >>> # Create a collection using the fluent syntax for its definition >>> from astrapy.constants import VectorMetric >>> from astrapy.info import CollectionDefinition >>> >>> collection_definition = ( ... CollectionDefinition.builder() ... .set_vector_dimension(3) ... .set_vector_metric(VectorMetric.DOT_PRODUCT) ... .set_indexing("deny", ["annotations", "logs"]) ... .build() ... ) >>> my_collection = database.create_collection( ... "my_events", ... definition=collection_definition, ... ) >>> >>> # Create a collection with the definition as object >>> from astrapy.info import CollectionVectorOptions >>> >>> collection_definition_1 = CollectionDefinition( ... vector=CollectionVectorOptions( ... dimension=3, ... metric=VectorMetric.DOT_PRODUCT, ... ), ... indexing={"deny": ["annotations", "logs"]}, ... ) >>> my_collection_1 = database.create_collection( ... "my_events", ... definition=collection_definition_1, ... 
) >>> >>> # Create a collection with the definition as plain dictionary >>> collection_definition_2 = { ... "indexing": {"deny": ["annotations", "logs"]}, ... "vector": { ... "dimension": 3, ... "metric": VectorMetric.DOT_PRODUCT, ... }, ... } >>> my_collection_2 = database.create_collection( ... "my_events", ... definition=collection_definition_2, ... ) """ cc_definition: dict[str, Any] = CollectionDefinition.coerce( definition or {} ).as_dict() # this method has custom code to pick its timeout _collection_admin_timeout_ms: int _ca_label: str if collection_admin_timeout_ms is not None: _collection_admin_timeout_ms = collection_admin_timeout_ms _ca_label = "collection_admin_timeout_ms" else: _collection_admin_timeout_ms = ( self.api_options.timeout_options.collection_admin_timeout_ms ) _ca_label = "collection_admin_timeout_ms" driver_commander = self._get_driver_commander(keyspace=keyspace) cc_payload = { "createCollection": { k: v for k, v in { "name": name, "options": cc_definition, }.items() if v is not None if v != {} } } logger.info(f"createCollection('{name}')") cc_response = driver_commander.request( payload=cc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if cc_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from createCollection API command.", raw_response=cc_response, ) logger.info(f"finished createCollection('{name}')") return self.get_collection( name, document_type=document_type, keyspace=keyspace, embedding_api_key=embedding_api_key, spawn_api_options=spawn_api_options, )
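The document_type parameter mentioned above can be used to type the spawned collection, analogously to row_type for tables. A hedged sketch (MyEventDictType is a hypothetical alias; collection_definition is from the examples above):

from astrapy import Collection
MyEventDictType = dict[str, int]  # hypothetical document-type alias
my_typed_collection: Collection[MyEventDictType] = database.create_collection(
    "my_events",
    definition=collection_definition,
    document_type=MyEventDictType,
)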
def create_table(self, name: str, *, definition: CreateTableDefinition | dict[str, Any], row_type: type[Any] = dict[str, typing.Any], keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Table[ROW]
-
Creates a table on the database and returns the Table instance that represents it.
This is a blocking operation: the method returns when the table is ready to be used. As opposed to the get_table method call, this method causes the table to be actually created on the database.
Args
name
- the name of the table.
definition
- a complete table definition for the table. This can be an instance of CreateTableDefinition or an equivalent (nested) dictionary, in which case it will be parsed into a CreateTableDefinition. See the CreateTableDefinition class and the Table class for more details and ways to construct this object.
row_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting Table is implicitly a Table[dict[str, Any]]. If provided, it must match the type hint specified in the assignment. See the examples below.
keyspace
- the keyspace where the table is to be created. If not specified, the general setting for this database is used.
if_not_exists
- if set to True, the command will succeed even if a table with the specified name already exists (in which case no actual table creation takes place on the database). Defaults to False, i.e. an error is raised by the API in case of table-name collision.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
embedding_api_key
- optional API key(s) for interacting with the table.
If an embedding service is configured, and this parameter is not None,
each Data API call will include the necessary embedding-related headers
as specified by this parameter. If a string is passed, it translates
into the one "embedding api key" header
(i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
a (synchronous) Table instance, representing the newly-created table.
Example
>>> # Create a table using the fluent syntax for definition >>> from astrapy.constants import SortMode >>> from astrapy.info import ( ... CreateTableDefinition, ... ColumnType, ... ) >>> table_definition = ( ... CreateTableDefinition.builder() ... .add_column("match_id", ColumnType.TEXT) ... .add_column("round", ColumnType.INT) ... .add_vector_column("m_vector", dimension=3) ... .add_column("score", ColumnType.INT) ... .add_column("when", ColumnType.TIMESTAMP) ... .add_column("winner", ColumnType.TEXT) ... .add_set_column("fighters", ColumnType.UUID) ... .add_partition_by(["match_id"]) ... .add_partition_sort({"round": SortMode.ASCENDING}) ... .build() ... ) >>> my_table = database.create_table( ... "games", ... definition=table_definition, ... ) >>> >>> # Create a table with the definition as object >>> # (and do not raise an error if the table exists already) >>> from astrapy.info import ( ... CreateTableDefinition, ... TablePrimaryKeyDescriptor, ... TableScalarColumnTypeDescriptor, ... TableValuedColumnType, ... TableValuedColumnTypeDescriptor, ... TableVectorColumnTypeDescriptor, ... ) >>> table_definition_1 = CreateTableDefinition( ... columns={ ... "match_id": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "round": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "m_vector": TableVectorColumnTypeDescriptor( ... column_type="vector", dimension=3 ... ), ... "score": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "when": TableScalarColumnTypeDescriptor( ... ColumnType.TIMESTAMP, ... ), ... "winner": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "fighters": TableValuedColumnTypeDescriptor( ... column_type=TableValuedColumnType.SET, ... value_type=ColumnType.UUID, ... ), ... }, ... primary_key=TablePrimaryKeyDescriptor( ... partition_by=["match_id"], ... partition_sort={"round": SortMode.ASCENDING}, ... ), ... ) >>> my_table_1 = database.create_table( ... "games", ... definition=table_definition_1, ... if_not_exists=True, ... ) >>> >>> # Create a table with the definition as plain dictionary >>> # (and do not raise an error if the table exists already) >>> table_definition_2 = { ... "columns": { ... "match_id": {"type": "text"}, ... "round": {"type": "int"}, ... "m_vector": {"type": "vector", "dimension": 3}, ... "score": {"type": "int"}, ... "when": {"type": "timestamp"}, ... "winner": {"type": "text"}, ... "fighters": {"type": "set", "valueType": "uuid"}, ... }, ... "primaryKey": { ... "partitionBy": ["match_id"], ... "partitionSort": {"round": 1}, ... }, ... } >>> my_table_2 = database.create_table( ... "games", ... definition=table_definition_2, ... if_not_exists=True, ... )
Expand source code
def create_table( self, name: str, *, definition: CreateTableDefinition | dict[str, Any], row_type: type[Any] = DefaultRowType, keyspace: str | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: """ Creates a table on the database and return the Table instance that represents it. This is a blocking operation: the method returns when the table is ready to be used. As opposed to the `get_table` method call, this method causes the table to be actually created on DB. Args: name: the name of the table. definition: a complete table definition for the table. This can be an instance of `CreateTableDefinition` or an equivalent (nested) dictionary, in which case it will be parsed into a `CreateTableDefinition`. See the `astrapy.info.CreateTableDefinition` class and the `Table` class for more details and ways to construct this object. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Table is implicitly a `Table[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace where the table is to be created. If not specified, the general setting for this database is used. if_not_exists: if set to True, the command will succeed even if a table with the specified name already exists (in which case no actual table creation takes place on the database). Defaults to False, i.e. an error is raised by the API in case of table-name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: a (synchronous) `Table` instance, representing the newly-created table. Example: >>> # Create a table using the fluent syntax for definition >>> from astrapy.constants import SortMode >>> from astrapy.info import ( ... CreateTableDefinition, ... ColumnType, ... ) >>> table_definition = ( ... CreateTableDefinition.builder() ... .add_column("match_id", ColumnType.TEXT) ... .add_column("round", ColumnType.INT) ... .add_vector_column("m_vector", dimension=3) ... .add_column("score", ColumnType.INT) ... .add_column("when", ColumnType.TIMESTAMP) ... .add_column("winner", ColumnType.TEXT) ... 
.add_set_column("fighters", ColumnType.UUID) ... .add_partition_by(["match_id"]) ... .add_partition_sort({"round": SortMode.ASCENDING}) ... .build() ... ) >>> my_table = database.create_table( ... "games", ... definition=table_definition, ... ) >>> >>> # Create a table with the definition as object >>> # (and do not raise an error if the table exists already) >>> from astrapy.info import ( ... CreateTableDefinition, ... TablePrimaryKeyDescriptor, ... TableScalarColumnTypeDescriptor, ... TableValuedColumnType, ... TableValuedColumnTypeDescriptor, ... TableVectorColumnTypeDescriptor, ... ) >>> table_definition_1 = CreateTableDefinition( ... columns={ ... "match_id": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "round": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "m_vector": TableVectorColumnTypeDescriptor( ... column_type="vector", dimension=3 ... ), ... "score": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "when": TableScalarColumnTypeDescriptor( ... ColumnType.TIMESTAMP, ... ), ... "winner": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "fighters": TableValuedColumnTypeDescriptor( ... column_type=TableValuedColumnType.SET, ... value_type=ColumnType.UUID, ... ), ... }, ... primary_key=TablePrimaryKeyDescriptor( ... partition_by=["match_id"], ... partition_sort={"round": SortMode.ASCENDING}, ... ), ... ) >>> my_table_1 = database.create_table( ... "games", ... definition=table_definition_1, ... if_not_exists=True, ... ) >>> >>> # Create a table with the definition as plain dictionary >>> # (and do not raise an error if the table exists already) >>> table_definition_2 = { ... "columns": { ... "match_id": {"type": "text"}, ... "round": {"type": "int"}, ... "m_vector": {"type": "vector", "dimension": 3}, ... "score": {"type": "int"}, ... "when": {"type": "timestamp"}, ... "winner": {"type": "text"}, ... "fighters": {"type": "set", "valueType": "uuid"}, ... }, ... "primaryKey": { ... "partitionBy": ["match_id"], ... "partitionSort": {"round": 1}, ... }, ... } >>> my_table_2 = database.create_table( ... "games", ... definition=table_definition_2, ... if_not_exists=True, ... ) """ ct_options: dict[str, bool] if if_not_exists is not None: ct_options = {"ifNotExists": if_not_exists} else: ct_options = {} ct_definition: dict[str, Any] = CreateTableDefinition.coerce( definition ).as_dict() _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) ct_payload = { "createTable": { k: v for k, v in { "name": name, "definition": ct_definition, "options": ct_options, }.items() if v is not None if v != {} } } logger.info(f"createTable('{name}')") ct_response = driver_commander.request( payload=ct_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if ct_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from createTable API command.", raw_response=ct_response, ) logger.info(f"finished createTable('{name}')") return self.get_table( name, row_type=row_type, keyspace=keyspace, embedding_api_key=embedding_api_key, spawn_api_options=spawn_api_options, )
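Analogously, row_type types the returned table at creation time, mirroring the typed get_table example shown earlier. A hedged sketch (MyMatchDictType is a hypothetical alias; table_definition is from the examples above):

from astrapy import Table
MyMatchDictType = dict[str, int]  # hypothetical row-type alias
my_typed_table: Table[MyMatchDictType] = database.create_table(
    "games",
    definition=table_definition,
    row_type=MyMatchDictType,
    if_not_exists=True,
)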
def drop_collection(self, name: str, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop a collection from the database, along with all documents therein.
Args
name
- the name of the collection to drop.
keyspace
- the keyspace where the collection resides. If not specified, the database working keyspace is assumed.
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for collection_admin_timeout_ms.
timeout_ms
- an alias for collection_admin_timeout_ms.
Example
>>> my_db.list_collection_names() ['a_collection', 'my_v_col', 'another_col'] >>> my_db.drop_collection("my_v_col") >>> my_db.list_collection_names() ['a_collection', 'another_col']
Expand source code
def drop_collection( self, name: str, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop a collection from the database, along with all documents therein. Args: name: the name of the collection to drop. keyspace: the keyspace where the collection resides. If not specified, the database working keyspace is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Example: >>> my_db.list_collection_names() ['a_collection', 'my_v_col', 'another_col'] >>> my_db.drop_collection("my_v_col") >>> my_db.list_collection_names() ['a_collection', 'another_col'] """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace = keyspace or self.keyspace driver_commander = self._get_driver_commander(keyspace=_keyspace) dc_payload = {"deleteCollection": {"name": name}} logger.info(f"deleteCollection('{name}')") dc_response = driver_commander.request( payload=dc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if dc_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteCollection API command.", raw_response=dc_response, ) logger.info(f"finished deleteCollection('{name}')") return dc_response.get("status", {}) # type: ignore[no-any-return]
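Note that, unlike drop_table, drop_collection exposes no if_exists parameter. A hedged sketch of an explicit existence check before dropping:

# drop only if present, rather than relying on the API's behaviour
# for missing collections
if "my_v_col" in my_db.list_collection_names():
    my_db.drop_collection("my_v_col")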
def drop_table(self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop a table from the database, along with all rows therein and related indexes.
Args
name
- the name of the table to drop.
keyspace
- the keyspace where the table resides. If not specified, the database working keyspace is assumed.
if_exists
- if passed as True, trying to drop a non-existent table will not error; the call silently does nothing instead. If not provided, the API default behaviour holds.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Example
>>> database.list_table_names() ['fighters', 'games'] >>> database.drop_table("fighters") >>> database.list_table_names() ['games'] >>> # not erroring thanks to if_exists: >>> database.drop_table("fighters", if_exists=True)
Expand source code
def drop_table( self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop a table from the database, along with all rows therein and related indexes. Args: name: the name of the table to drop. keyspace: the keyspace where the table resides. If not specified, the database working keyspace is assumed. if_exists: if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> database.list_table_names() ['fighters', 'games'] >>> database.drop_table("fighters") >>> database.list_table_names() ['games'] >>> # not erroring thanks to if_exists: >>> database.drop_table("fighters", if_exists=True) """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _keyspace = keyspace or self.keyspace dt_options: dict[str, bool] if if_exists is not None: dt_options = {"ifExists": if_exists} else: dt_options = {} driver_commander = self._get_driver_commander(keyspace=_keyspace) dt_payload = { "dropTable": { k: v for k, v in { "name": name, "options": dt_options, }.items() if v is not None if v != {} } } logger.info(f"dropTable('{name}')") dt_response = driver_commander.request( payload=dt_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if dt_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from dropTable API command.", raw_response=dt_response, ) logger.info(f"finished dropTable('{name}')") return dt_response.get("status", {}) # type: ignore[no-any-return]
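If the API reply is malformed, the method raises UnexpectedDataAPIResponseException, as the source above shows. A hedged sketch of inspecting it, assuming the exception keeps the raw_response it is constructed with:

from astrapy.exceptions import UnexpectedDataAPIResponseException

try:
    database.drop_table("games")
except UnexpectedDataAPIResponseException as exc:
    print(exc.raw_response)  # full unparsed API response (assumed attribute)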
def drop_table_index(self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drops (deletes) an index (of any kind) from the table it is associated with.
This is a blocking operation: the method returns once the index is deleted.
Note
Although associated with a table, index names are unique across a keyspace. For this reason, no table name is required in this call.
Args
name
- the name of the index.
keyspace
- the keyspace to which the index belongs. If not specified, the general setting for this database is used.
if_exists
- if passed as True, trying to drop a non-existent index will not error; the call silently does nothing instead. If not provided, the API default behaviour holds.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Example
>>> # Drop an index from the keyspace: >>> database.drop_table_index("score_index") >>> # Drop an index, without erroring if it does not exist: >>> database.drop_table_index("score_index", if_exists=True)
Expand source code
def drop_table_index( self, name: str, *, keyspace: str | None = None, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drops (deletes) an index (of any kind) from the table it is associated to. This is a blocking operation: the method returns once the index is deleted. Note: Although associated to a table, index names are unique across a keyspace. For this reason, no table name is required in this call. Args: name: the name of the index. keyspace: the keyspace to which the index belongs. If not specified, the general setting for this database is used. if_exists: if passed as True, trying to drop a non-existing index will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> # Drop an index from the keyspace: >>> database.drop_table_index("score_index") >>> # Drop an index, unless it does not exist already: >>> database.drop_table_index("score_index", if_exists=True) """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) di_options: dict[str, bool] if if_exists is not None: di_options = {"ifExists": if_exists} else: di_options = {} di_payload = { "dropIndex": { k: v for k, v in { "name": name, "options": di_options, }.items() if v is not None if v != {} } } driver_commander = self._get_driver_commander(keyspace=keyspace) logger.info(f"dropIndex('{name}')") di_response = driver_commander.request( payload=di_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if di_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from dropIndex API command.", raw_response=di_response, ) logger.info(f"finished dropIndex('{name}')")
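Since index names are unique across a keyspace (see the Note above), the keyspace parameter selects where the drop applies. A hedged one-liner:

# drop a same-named index living in another keyspace
database.drop_table_index("score_index", keyspace="other_keyspace", if_exists=True)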
def get_collection(self, name: str, *, document_type: type[Any] = dict[str, typing.Any], keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Collection[DOC]
-
Spawn a Collection object instance representing a collection on this database.
Creating a Collection instance does not have any effect on the actual state of the database: in other words, for the created Collection instance to be used meaningfully, the collection must exist already (for instance, it should have been created previously by calling the create_collection method).
Args
name
- the name of the collection.
document_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting Collection is implicitly a Collection[dict[str, Any]]. If provided, it must match the type hint specified in the assignment. See the examples below.
keyspace
- the keyspace containing the collection. If no keyspace is specified, the general setting for this database is used.
embedding_api_key
- optional API key(s) for interacting with the collection.
If an embedding service is configured, and this parameter is not None,
each Data API call will include the necessary embedding-related headers
as specified by this parameter. If a string is passed, it translates
into the one "embedding api key" header
(i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
a Collection instance, representing the desired collection (but without any form of validation).
Example
>>> my_col = my_db.get_collection("my_collection") >>> my_col.count_documents({}, upper_bound=100) 41
Note
The attribute and indexing syntax forms achieve the same effect as this method. In other words, the following are equivalent: my_db.get_collection("coll_name") my_db.coll_name my_db["coll_name"]
Expand source code
def get_collection( self, name: str, *, document_type: type[Any] = DefaultDocumentType, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Collection[DOC]: """ Spawn a `Collection` object instance representing a collection on this database. Creating a `Collection` instance does not have any effect on the actual state of the database: in other words, for the created `Collection` instance to be used meaningfully, the collection must exist already (for instance, it should have been created previously by calling the `create_collection` method). Args: name: the name of the collection. document_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Collection is implicitly a `Collection[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace containing the collection. If no keyspace is specified, the general setting for this database is used. embedding_api_key: optional API key(s) for interacting with the collection. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the collection, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: a `Collection` instance, representing the desired collection (but without any form of validation). Example: >>> my_col = my_db.get_collection("my_collection") >>> my_col.count_documents({}, upper_bound=100) 41 Note: The attribute and indexing syntax forms achieve the same effect as this method. In other words, the following are equivalent: my_db.get_collection("coll_name") my_db.coll_name my_db["coll_name"] """ # lazy importing here against circular-import error from astrapy.collection import Collection resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( embedding_api_key=embedding_api_key, ), ) _keyspace = keyspace or self.keyspace if _keyspace is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return Collection( database=self, name=name, keyspace=_keyspace, api_options=resulting_api_options, )
def get_database_admin(self, *, token: str | TokenProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> DatabaseAdmin
-
Return a DatabaseAdmin object corresponding to this database, for use in admin tasks such as managing keyspaces.
This method, depending on the environment where the database resides, returns an appropriate subclass of DatabaseAdmin.
Args
token
- an access token with enough permission on the database to perform the desired tasks. If omitted (as it can generally be done), the token of this Database is used. This can be either a literal token string or a subclass of TokenProvider.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings.
Returns
A DatabaseAdmin instance targeting this database. More precisely, for Astra DB an instance of AstraDBDatabaseAdmin is returned; for other environments, an instance of DataAPIDatabaseAdmin is returned.
Example
>>> my_db_admin = my_db.get_database_admin()
>>> if "new_keyspace" not in my_db_admin.list_keyspaces():
...     my_db_admin.create_keyspace("new_keyspace")
>>> my_db_admin.list_keyspaces()
['default_keyspace', 'new_keyspace']
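Since admin operations may require a more powerful token than the one held by this Database, here is a hedged sketch of supplying one explicitly (the token value is illustrative; StaticTokenProvider is from astrapy.authentication):
>>> from astrapy.authentication import StaticTokenProvider
>>> my_db_admin = my_db.get_database_admin(
...     token=StaticTokenProvider("AstraCS:aMorePowerfulToken..."),
... )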
Expand source code
def get_database_admin( self, *, token: str | TokenProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> DatabaseAdmin: """ Return a DatabaseAdmin object corresponding to this database, for use in admin tasks such as managing keyspaces. This method, depending on the environment where the database resides, returns an appropriate subclass of DatabaseAdmin. Args: token: an access token with enough permission on the database to perform the desired tasks. If omitted (as it can generally be done), the token of this Database is used. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults. This allows for a deeper configuration of the database admin, e.g. concerning timeouts; if this is passed together with the equivalent named parameters, the latter will take precedence in their respective settings. Returns: A DatabaseAdmin instance targeting this database. More precisely, for Astra DB an instance of `AstraDBDatabaseAdmin` is returned; for other environments, an instance of `DataAPIDatabaseAdmin` is returned. Example: >>> my_db_admin = my_db.get_database_admin() >>> if "new_keyspace" not in my_db_admin.list_keyspaces(): ... my_db_admin.create_keyspace("new_keyspace") >>> my_db_admin.list_keyspaces() ['default_keyspace', 'new_keyspace'] """ # lazy importing here to avoid circular dependency from astrapy.admin.admin import AstraDBDatabaseAdmin, DataAPIDatabaseAdmin arg_api_options = APIOptions( token=token, ) api_options = self.api_options.with_override(spawn_api_options).with_override( arg_api_options ) if api_options.environment in Environment.astra_db_values: return AstraDBDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=api_options, spawner_database=self, ) else: return DataAPIDatabaseAdmin( api_endpoint=self.api_endpoint, api_options=api_options, spawner_database=self, )
def get_table(self, name: str, *, row_type: type[Any] = dict[str, typing.Any], keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), spawn_api_options: APIOptions | UnsetType = (unset)) ‑> Table[ROW]
-
Spawn a Table object instance representing a table on this database.
Creating a Table instance does not have any effect on the actual state of the database: in other words, for the created Table instance to be used meaningfully, the table must exist already (for instance, it should have been created previously by calling the create_table method).
Args
name
- the name of the table.
row_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting Table is implicitly a Table[dict[str, Any]]. If provided, it must match the type hint specified in the assignment. See the examples below.
keyspace
- the keyspace containing the table. If no keyspace is specified, the general setting for this database is used.
embedding_api_key
- optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
spawn_api_options
- a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings.
Returns
a Table instance, representing the desired table (but without any form of validation).
Example
>>> # Get a Table object (and read a property of it as an example):
>>> my_table = database.get_table("games")
>>> my_table.full_name
'default_keyspace.games'
>>>
>>> # Get a Table object in a specific keyspace,
>>> # and set an embedding API key to it:
>>> my_other_table = database.get_table(
...     "tournaments",
...     keyspace="the_other_keyspace",
...     embedding_api_key="secret-012abc...",
... )
>>>
>>> from astrapy import Table
>>> MyCustomDictType = dict[str, int]
>>>
>>> # Get a Table object typed with a specific type for its rows:
>>> my_typed_table: Table[MyCustomDictType] = database.get_table(
...     "games",
...     row_type=MyCustomDictType,
... )
Expand source code
def get_table( self, name: str, *, row_type: type[Any] = DefaultRowType, keyspace: str | None = None, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, spawn_api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: """ Spawn a `Table` object instance representing a table on this database. Creating a `Table` instance does not have any effect on the actual state of the database: in other words, for the created `Table` instance to be used meaningfully, the table must exist already (for instance, it should have been created previously by calling the `create_table` method). Args: name: the name of the table. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Table is implicitly a `Table[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. keyspace: the keyspace containing the table. If no keyspace is specified, the general setting for this database is used. embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. spawn_api_options: a specification - complete or partial - of the API Options to override the defaults inherited from the Database. This allows for a deeper configuration of the table, e.g. concerning timeouts; if this is passed together with the named timeout parameters, the latter will take precedence in their respective settings. Returns: a `Table` instance, representing the desired table (but without any form of validation). Example: >>> # Get a Table object (and read a property of it as an example): >>> my_table = database.get_table("games") >>> my_table.full_name 'default_keyspace.games' >>> >>> # Get a Table object in a specific keyspace, >>> # and set an embedding API key to it: >>> my_other_table = database.get_table( ... "tournaments", ... keyspace="the_other_keyspace", ... embedding_api_key="secret-012abc...", ... ) >>> >>> from astrapy import Table >>> MyCustomDictType = dict[str, int] >>> >>> # Get a Table object typed with a specific type for its rows: >>> my_typed_table: Table[MyCustomDictType] = database.get_table( ... "games", ... row_type=MyCustomDictType, ... ) """ # lazy importing here against circular-import error from astrapy.table import Table resulting_api_options = self.api_options.with_override( spawn_api_options, ).with_override( APIOptions( embedding_api_key=embedding_api_key, ), ) _keyspace = keyspace or self.keyspace if _keyspace is None: raise ValueError( "No keyspace specified. This operation requires a keyspace to " "be set, e.g. through the `use_keyspace` method." ) return Table[ROW]( database=self, name=name, keyspace=_keyspace, api_options=resulting_api_options, )
def info(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> AstraDBDatabaseInfo
-
Additional information on the database as an AstraDBDatabaseInfo instance.
Some of the returned properties are dynamic throughout the lifetime of the database (such as raw_info["keyspaces"]). For this reason, each invocation of this method triggers a new request to the DevOps API.
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Example
>>> my_db.info().region
'eu-west-1'
>>> my_db.info().raw_info['datacenters'][0]['dateCreated']
'2023-01-30T12:34:56Z'
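As a further hedged sketch (the timeout value is arbitrary and the output illustrative), a per-call budget can be imposed through the timeout parameters described above:
>>> my_db.info(database_admin_timeout_ms=5000).region
'eu-west-1'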
Note
see the AstraDBDatabaseInfo documentation for a caveat about the difference between the region and the raw["region"] attributes.
Expand source code
def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> AstraDBDatabaseInfo: """ Additional information on the database as an AstraDBDatabaseInfo instance. Some of the returned properties are dynamic throughout the lifetime of the database (such as raw_info["keyspaces"]). For this reason, each invocation of this method triggers a new request to the DevOps API. Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Example: >>> my_db.info().region 'eu-west-1' >>> my_db.info().raw_info['datacenters'][0]['dateCreated'] '2023-01-30T12:34:56Z' Note: see the AstraDBDatabaseInfo documentation for a caveat about the difference between the `region` and the `raw["region"]` attributes. """ if self.api_options.environment not in Environment.astra_db_values: raise InvalidEnvironmentException( "Environments outside of Astra DB are not supported." ) _database_admin_timeout_ms, _da_label = _select_singlereq_timeout_da( timeout_options=self.api_options.timeout_options, database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info("getting database info") database_info = fetch_database_info( self.api_endpoint, keyspace=self.keyspace, request_timeout_ms=_database_admin_timeout_ms, api_options=self.api_options, ) if database_info is not None: logger.info("finished getting database info") return database_info else: raise DevOpsAPIException("Failure while fetching database info.")
def list_collection_names(self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
List the names of all collections in a given keyspace of this database.
Args
keyspace
- the keyspace to be inspected. If not specified, the general setting for this database is assumed.
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for collection_admin_timeout_ms.
timeout_ms
- an alias for collection_admin_timeout_ms.
Returns
a list of the collection names as strings, in no particular order.
Example
>>> my_db.list_collection_names()
['a_collection', 'another_col']
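A hedged variant (keyspace name and output illustrative) targeting a keyspace other than the database's working one:
>>> my_db.list_collection_names(keyspace="that_other_keyspace")
['coll_in_other_keyspace']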
Expand source code
def list_collection_names( self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all collections in a given keyspace of this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a list of the collection names as strings, in no particular order. Example: >>> my_db.list_collection_names() ['a_collection', 'another_col'] """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) gc_payload: dict[str, Any] = {"findCollections": {}} logger.info("findCollections") gc_response = driver_commander.request( payload=gc_payload, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), ) if "collections" not in gc_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from findCollections API command.", raw_response=gc_response, ) else: logger.info("finished findCollections") return gc_response["status"]["collections"] # type: ignore[no-any-return]
def list_collections(self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[CollectionDescriptor]
-
List all collections in a given keyspace for this database.
Args
keyspace
- the keyspace to be inspected. If not specified, the general setting for this database is assumed.
collection_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for collection_admin_timeout_ms.
timeout_ms
- an alias for collection_admin_timeout_ms.
Returns
a list of CollectionDescriptor instances, one for each collection.
Example
>>> coll_list = my_db.list_collections()
>>> coll_list
[CollectionDescriptor(name='my_v_col', options=CollectionDefinition())]
>>> for coll_dict in my_db.list_collections():
...     print(coll_dict)
...
CollectionDescriptor(name='my_v_col', options=CollectionDefinition())
Expand source code
def list_collections( self, *, keyspace: str | None = None, collection_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[CollectionDescriptor]: """ List all collections in a given keyspace for this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. collection_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `collection_admin_timeout_ms`. timeout_ms: an alias for `collection_admin_timeout_ms`. Returns: a list of CollectionDescriptor instances one for each collection. Example: >>> coll_list = my_db.list_collections() >>> coll_list [CollectionDescriptor(name='my_v_col', options=CollectionDefinition())] >>> for coll_dict in my_db.list_collections(): ... print(coll_dict) ... CollectionDescriptor(name='my_v_col', options=CollectionDefinition()) """ _collection_admin_timeout_ms, _ca_label = _select_singlereq_timeout_ca( timeout_options=self.api_options.timeout_options, collection_admin_timeout_ms=collection_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return self._list_collections_ctx( keyspace=keyspace, timeout_context=_TimeoutContext( request_ms=_collection_admin_timeout_ms, label=_ca_label ), )
def list_table_names(self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
List the names of all tables in a given keyspace of this database.
Args
keyspace
- the keyspace to be inspected. If not specified, the general setting for this database is assumed.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Returns
a list of the table names as strings, in no particular order.
Example
>>> database.list_table_names()
['fighters', 'games']
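Analogously, a hedged sketch (keyspace name and output illustrative) of inspecting a keyspace other than the working one:
>>> database.list_table_names(keyspace="that_other_keyspace")
['tournaments']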
Expand source code
def list_table_names( self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all tables in a given keyspace of this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of the table names as strings, in no particular order. Example: >>> database.list_table_names() ['fighters', 'games'] """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) driver_commander = self._get_driver_commander(keyspace=keyspace) lt_payload: dict[str, Any] = {"listTables": {}} logger.info("listTables") lt_response = driver_commander.request( payload=lt_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "tables" not in lt_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listTables API command.", raw_response=lt_response, ) else: logger.info("finished listTables") return lt_response["status"]["tables"] # type: ignore[no-any-return]
def list_tables(self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[ListTableDescriptor]
-
List all tables in a given keyspace for this database.
Args
keyspace
- the keyspace to be inspected. If not specified, the general setting for this database is assumed.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Returns
a list of ListTableDescriptor instances, one for each table.
Example
>>> tables = my_database.list_tables()
>>> tables
[ListTableDescriptor(name='fighters', definition=ListTableDefinition(...
>>> tables[1].name
'games'
>>> tables[1].definition.columns
{'match_id': TableScalarColumnTypeDescriptor(ColumnType.TEXT),...
>>> tables[1].definition.columns['score']
TableScalarColumnTypeDescriptor(ColumnType.INT)
>>> tables[1].definition.primary_key.partition_by
['match_id']
>>> tables[1].definition.primary_key.partition_sort
{'round': 1}
Expand source code
def list_tables( self, *, keyspace: str | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[ListTableDescriptor]: """ List all tables in a given keyspace for this database. Args: keyspace: the keyspace to be inspected. If not specified, the general setting for this database is assumed. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of ListTableDescriptor instances, one for each table. Example: >>> tables = my_database.list_tables() >>> tables [ListTableDescriptor(name='fighters', definition=ListTableDefinition(... >>> tables[1].name 'games' >>> tables[1].definition.columns {'match_id': TableScalarColumnTypeDescriptor(ColumnType.TEXT),... >>> tables[1].definition.columns['score'] TableScalarColumnTypeDescriptor(ColumnType.INT) >>> tables[1].definition.primary_key.partition_by ['match_id'] >>> tables[1].definition.primary_key.partition_sort {'round': 1} """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) return self._list_tables_ctx( keyspace=keyspace, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), )
def name(self) ‑> str
-
The name of this database. Note that this bears no uniqueness guarantees.
Calling this method the first time involves a request to the DevOps API (the resulting database name is then cached). See the info() method for more details.
Example
>>> my_db.name()
'the_application_database'
Expand source code
def name(self) -> str: """ The name of this database. Note that this bears no unicity guarantees. Calling this method the first time involves a request to the DevOps API (the resulting database name is then cached). See the `info()` method for more details. Example: >>> my_db.name() 'the_application_database' """ if self._name is None: self._name = self.info().name return self._name
def to_async(self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> AsyncDatabase
-
Create an AsyncDatabase from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this database in the copy.
Args
keyspace
- this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set.
token
- an Access Token to the database. Example: "AstraCS:xyz...". This can be either a literal token string or a subclass of TokenProvider.
api_options
- any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
the new copy, an AsyncDatabase instance.
Example
>>> async_database = my_db.to_async()
>>> asyncio.run(async_database.list_collection_names())
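A slightly fuller, hedged sketch (collection names illustrative) of driving the resulting AsyncDatabase from within a coroutine:
>>> import asyncio
>>> async def list_names() -> list[str]:
...     async_database = my_db.to_async()
...     return await async_database.list_collection_names()
...
>>> asyncio.run(list_names())
['a_collection', 'another_col']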
Expand source code
def to_async( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncDatabase: """ Create an AsyncDatabase from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this database in the copy. Args: keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set. token: an Access Token to the database. Example: "AstraCS:xyz..." This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, an `AsyncDatabase` instance. Example: >>> async_database = my_db.to_async() >>> asyncio.run(async_database.list_collection_names()) """ arg_api_options = APIOptions( token=token, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AsyncDatabase( api_endpoint=self.api_endpoint, keyspace=keyspace or self.keyspace, api_options=final_api_options, )
def use_keyspace(self, keyspace: str) ‑> None
-
Switch to a new working keyspace for this database. This method changes (mutates) the Database instance.
Note that this method does not create the keyspace, which should exist already (created for instance with a DatabaseAdmin.create_keyspace call).
Args
keyspace
- the new keyspace to use as the database working keyspace.
Returns
None.
Example
>>> my_db.list_collection_names()
['coll_1', 'coll_2']
>>> my_db.use_keyspace("an_empty_keyspace")
>>> my_db.list_collection_names()
[]
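Since this method mutates the instance, the change is visible through the keyspace property, as in this hedged sketch (keyspace names illustrative):
>>> my_db.keyspace
'default_keyspace'
>>> my_db.use_keyspace("an_empty_keyspace")
>>> my_db.keyspace
'an_empty_keyspace'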
Expand source code
def use_keyspace(self, keyspace: str) -> None: """ Switch to a new working keyspace for this database. This method changes (mutates) the Database instance. Note that this method does not create the keyspace, which should exist already (created for instance with a `DatabaseAdmin.create_keyspace` call). Args: keyspace: the new keyspace to use as the database working keyspace. Returns: None. Example: >>> my_db.list_collection_names() ['coll_1', 'coll_2'] >>> my_db.use_keyspace("an_empty_keyspace") >>> my_db.list_collection_names() [] """ logger.info(f"switching to keyspace '{keyspace}'") self._using_keyspace = keyspace self._api_commander = self._get_api_commander(keyspace=self.keyspace)
def with_options(self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> Database
-
Create a clone of this database with some changed attributes.
Args
keyspace
- this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set.
token
- an Access Token to the database. Example: "AstraCS:xyz...". This can be either a literal token string or a subclass of TokenProvider.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new Database instance.
Example
>>> my_db_2 = my_db.with_options(
...     keyspace="the_other_keyspace",
...     token="AstraCS:xyz...",
... )
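A hedged sketch of the api_options route for the clone (the TimeoutOptions import and the specific settings shown are assumptions for the example; adjust to the actual options of interest):
>>> from astrapy.api_options import APIOptions, TimeoutOptions
>>> my_db_3 = my_db.with_options(
...     api_options=APIOptions(
...         timeout_options=TimeoutOptions(request_timeout_ms=15000),
...     ),
... )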
Expand source code
def with_options( self, *, keyspace: str | None = None, token: str | TokenProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Database: """ Create a clone of this database with some changed attributes. Args: keyspace: this is the keyspace all method calls will target, unless one is explicitly specified in the call. If no keyspace is supplied when creating a Database, the name "default_keyspace" is set. token: an Access Token to the database. Example: `"AstraCS:xyz..."`. This can be either a literal token string or a subclass of `astrapy.authentication.TokenProvider`. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new `Database` instance. Example: >>> my_db_2 = my_db.with_options( ... keyspace="the_other_keyspace", ... token="AstraCS:xyz...", ... ) """ return self._copy( keyspace=keyspace, token=token, api_options=api_options, )
class Table (*, database: Database, name: str, keyspace: str | None, api_options: FullAPIOptions)
-
A Data API table, the object to interact with the Data API for structured data, especially for DDL operations. This class has a synchronous interface.
This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as get_table of Database, wherefrom the Table inherits its API options such as authentication token and API endpoint. In order to create a table, instead, one should call the create_table method of Database, providing a table definition parameter that can be built in different ways (see the CreateTableDefinition object and examples below).
Args
database
- a Database object, instantiated earlier. This represents the database the table belongs to.
name
- the table name. This parameter should match an existing table on the database.
keyspace
- this is the keyspace to which the table belongs. If nothing is specified, the database's working keyspace is used.
api_options
- a complete specification of the API Options for this instance.
Examples
>>> from astrapy import DataAPIClient
>>> client = DataAPIClient()
>>> database = client.get_database(
...     "https://01234567-....apps.astra.datastax.com",
...     token="AstraCS:..."
... )
>>>
>>> # Create a table using the fluent syntax for definition
>>> from astrapy.constants import SortMode
>>> from astrapy.info import (
...     CreateTableDefinition,
...     ColumnType,
... )
>>> table_definition = (
...     CreateTableDefinition.builder()
...     .add_column("match_id", ColumnType.TEXT)
...     .add_column("round", ColumnType.INT)
...     .add_vector_column("m_vector", dimension=3)
...     .add_column("score", ColumnType.INT)
...     .add_column("when", ColumnType.TIMESTAMP)
...     .add_column("winner", ColumnType.TEXT)
...     .add_set_column("fighters", ColumnType.UUID)
...     .add_partition_by(["match_id"])
...     .add_partition_sort({"round": SortMode.ASCENDING})
...     .build()
... )
>>> my_table = database.create_table(
...     "games",
...     definition=table_definition,
... )
>>> # Create a table with the definition as object
>>> # (and do not raise an error if the table exists already)
>>> from astrapy.info import (
...     CreateTableDefinition,
...     TablePrimaryKeyDescriptor,
...     TableScalarColumnTypeDescriptor,
...     TableValuedColumnType,
...     TableValuedColumnTypeDescriptor,
...     TableVectorColumnTypeDescriptor,
... )
>>> table_definition_1 = CreateTableDefinition(
...     columns={
...         "match_id": TableScalarColumnTypeDescriptor(
...             ColumnType.TEXT,
...         ),
...         "round": TableScalarColumnTypeDescriptor(
...             ColumnType.INT,
...         ),
...         "m_vector": TableVectorColumnTypeDescriptor(
...             column_type="vector", dimension=3
...         ),
...         "score": TableScalarColumnTypeDescriptor(
...             ColumnType.INT,
...         ),
...         "when": TableScalarColumnTypeDescriptor(
...             ColumnType.TIMESTAMP,
...         ),
...         "winner": TableScalarColumnTypeDescriptor(
...             ColumnType.TEXT,
...         ),
...         "fighters": TableValuedColumnTypeDescriptor(
...             column_type=TableValuedColumnType.SET,
...             value_type=ColumnType.UUID,
...         ),
...     },
...     primary_key=TablePrimaryKeyDescriptor(
...         partition_by=["match_id"],
...         partition_sort={"round": SortMode.ASCENDING},
...     ),
... )
>>> my_table_1 = database.create_table(
...     "games",
...     definition=table_definition_1,
...     if_not_exists=True,
... )
>>> # Create a table with the definition as plain dictionary
>>> # (and do not raise an error if the table exists already)
>>> table_definition_2 = {
...     "columns": {
...         "match_id": {"type": "text"},
...         "round": {"type": "int"},
...         "m_vector": {"type": "vector", "dimension": 3},
...         "score": {"type": "int"},
...         "when": {"type": "timestamp"},
...         "winner": {"type": "text"},
...         "fighters": {"type": "set", "valueType": "uuid"},
...     },
...     "primaryKey": {
...         "partitionBy": ["match_id"],
...         "partitionSort": {"round": 1},
...     },
... }
>>> my_table_2 = database.create_table(
...     "games",
...     definition=table_definition_2,
...     if_not_exists=True,
... )
>>> # Get a reference to an existing table
>>> # (no checks are performed on DB)
>>> my_table_3 = database.get_table("games")
Note
creating an instance of Table does not trigger, in itself, actual creation of the table on the database. The latter should have been created beforehand, e.g. through the create_table method of a Database.
Expand source code
class Table(Generic[ROW]): """ A Data API table, the object to interact with the Data API for structured data, especially for DDL operations. This class has a synchronous interface. This class is not meant for direct instantiation by the user, rather it is obtained by invoking methods such as `get_table` of Database, wherefrom the Table inherits its API options such as authentication token and API endpoint. In order to create a table, instead, one should call the `create_table` method of Database, providing a table definition parameter that can be built in different ways (see the `CreateTableDefinition` object and examples below). Args: database: a Database object, instantiated earlier. This represents the database the table belongs to. name: the table name. This parameter should match an existing table on the database. keyspace: this is the keyspace to which the table belongs. If nothing is specified, the database's working keyspace is used. api_options: a complete specification of the API Options for this instance. Examples: >>> from astrapy import DataAPIClient >>> client = DataAPIClient() >>> database = client.get_database( ... "https://01234567-....apps.astra.datastax.com", ... token="AstraCS:..." ... ) >>> >>> # Create a table using the fluent syntax for definition >>> from astrapy.constants import SortMode >>> from astrapy.info import ( ... CreateTableDefinition, ... ColumnType, ... ) >>> table_definition = ( ... CreateTableDefinition.builder() ... .add_column("match_id", ColumnType.TEXT) ... .add_column("round", ColumnType.INT) ... .add_vector_column("m_vector", dimension=3) ... .add_column("score", ColumnType.INT) ... .add_column("when", ColumnType.TIMESTAMP) ... .add_column("winner", ColumnType.TEXT) ... .add_set_column("fighters", ColumnType.UUID) ... .add_partition_by(["match_id"]) ... .add_partition_sort({"round": SortMode.ASCENDING}) ... .build() ... ) >>> my_table = database.create_table( ... "games", ... definition=table_definition, ... ) >>> # Create a table with the definition as object >>> # (and do not raise an error if the table exists already) >>> from astrapy.info import ( ... CreateTableDefinition, ... TablePrimaryKeyDescriptor, ... TableScalarColumnTypeDescriptor, ... TableValuedColumnType, ... TableValuedColumnTypeDescriptor, ... TableVectorColumnTypeDescriptor, ... ) >>> table_definition_1 = CreateTableDefinition( ... columns={ ... "match_id": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "round": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "m_vector": TableVectorColumnTypeDescriptor( ... column_type="vector", dimension=3 ... ), ... "score": TableScalarColumnTypeDescriptor( ... ColumnType.INT, ... ), ... "when": TableScalarColumnTypeDescriptor( ... ColumnType.TIMESTAMP, ... ), ... "winner": TableScalarColumnTypeDescriptor( ... ColumnType.TEXT, ... ), ... "fighters": TableValuedColumnTypeDescriptor( ... column_type=TableValuedColumnType.SET, ... value_type=ColumnType.UUID, ... ), ... }, ... primary_key=TablePrimaryKeyDescriptor( ... partition_by=["match_id"], ... partition_sort={"round": SortMode.ASCENDING}, ... ), ... ) >>> my_table_1 = database.create_table( ... "games", ... definition=table_definition_1, ... if_not_exists=True, ... ) >>> # Create a table with the definition as plain dictionary >>> # (and do not raise an error if the table exists already) >>> table_definition_2 = { ... "columns": { ... "match_id": {"type": "text"}, ... "round": {"type": "int"}, ... "m_vector": {"type": "vector", "dimension": 3}, ... 
"score": {"type": "int"}, ... "when": {"type": "timestamp"}, ... "winner": {"type": "text"}, ... "fighters": {"type": "set", "valueType": "uuid"}, ... }, ... "primaryKey": { ... "partitionBy": ["match_id"], ... "partitionSort": {"round": 1}, ... }, ... } >>> my_table_2 = database.create_table( ... "games", ... definition=table_definition_2, ... if_not_exists=True, ... ) >>> # Get a reference to an existing table >>> # (no checks are performed on DB) >>> my_table_3 = database.get_table("games") Note: creating an instance of Table does not trigger, in itself, actual creation of the table on the database. The latter should have been created beforehand, e.g. through the `create_table` method of a Database. """ def __init__( self, *, database: Database, name: str, keyspace: str | None, api_options: FullAPIOptions, ) -> None: self.api_options = api_options self._name = name _keyspace = keyspace if keyspace is not None else database.keyspace if _keyspace is None: raise ValueError("Attempted to create Table with 'keyspace' unset.") self._database = database._copy( keyspace=_keyspace, api_options=self.api_options ) self._commander_headers = { **{DEFAULT_DATA_API_AUTH_HEADER: self.api_options.token.get_token()}, **self.api_options.embedding_api_key.get_headers(), **self.api_options.database_additional_headers, } self._api_commander = self._get_api_commander() self._converter_agent: _TableConverterAgent[ROW] = _TableConverterAgent( options=self.api_options.serdes_options, ) def __repr__(self) -> str: _db_desc = f'database.api_endpoint="{self.database.api_endpoint}"' return ( f'{self.__class__.__name__}(name="{self.name}", ' f'keyspace="{self.keyspace}", {_db_desc}, ' f"api_options={self.api_options})" ) def __eq__(self, other: Any) -> bool: if isinstance(other, Table): return all( [ self._name == other._name, self._database == other._database, self.api_options == other.api_options, ] ) else: return False def _get_api_commander(self) -> APICommander: """Instantiate a new APICommander based on the properties of this class.""" if self._database.keyspace is None: raise ValueError( "No keyspace specified. Table requires a keyspace to " "be set, e.g. through the `keyspace` constructor parameter." ) base_path_components = [ comp for comp in ( ncomp.strip("/") for ncomp in ( self._database.api_options.data_api_url_options.api_path, self._database.api_options.data_api_url_options.api_version, self._database.keyspace, self._name, ) if ncomp is not None ) if comp != "" ] base_path = f"/{'/'.join(base_path_components)}" api_commander = APICommander( api_endpoint=self._database.api_endpoint, path=base_path, headers=self._commander_headers, callers=self.api_options.callers, redacted_header_names=self.api_options.redacted_header_names, handle_decimals_writes=True, handle_decimals_reads=True, ) return api_commander def _copy( self: Table[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return Table( database=self.database, name=self.name, keyspace=self.keyspace, api_options=final_api_options, ) def with_options( self: Table[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> Table[ROW]: """ Create a clone of this table with some changed attributes. 
Args: embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: a new Table instance. Example: >>> table_with_api_key_configured = my_table.with_options( ... embedding_api_key="secret-key-0123abcd...", ... ) """ return self._copy( embedding_api_key=embedding_api_key, api_options=api_options, ) def to_async( self: Table[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET, api_options: APIOptions | UnsetType = _UNSET, ) -> AsyncTable[ROW]: """ Create an AsyncTable from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this table in the copy (the database is converted into an async object). Args: embedding_api_key: optional API key(s) for interacting with the table. If an embedding service is configured, and this parameter is not None, each Data API call will include the necessary embedding-related headers as specified by this parameter. If a string is passed, it translates into the one "embedding api key" header (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`). For some vectorize providers/models, if using header-based authentication, specialized subclasses of `astrapy.authentication.EmbeddingHeadersProvider` should be supplied. api_options: any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence. Returns: the new copy, an AsyncTable instance. Example: >>> asyncio.run(my_table.to_async().find_one( ... {"match_id": "fight4"}, ... projection={"winner": True}, ... )) {"pk": 1, "column": "value} """ arg_api_options = APIOptions( embedding_api_key=embedding_api_key, ) final_api_options = self.api_options.with_override(api_options).with_override( arg_api_options ) return AsyncTable( database=self.database.to_async(), name=self.name, keyspace=self.keyspace, api_options=final_api_options, ) def definition( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> ListTableDefinition: """ Query the Data API and return a structure defining the table schema. If there are no unsupported colums in the table, the return value has the same contents as could have been provided to a `create_table` method call. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: A `ListTableDefinition` object, available for inspection. 
Example: >>> my_table.definition() ListTableDefinition(columns=[match_id,round,fighters, ... # shortened """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"getting tables in search of '{self.name}'") self_descriptors = [ table_desc for table_desc in self.database._list_tables_ctx( keyspace=None, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label, ), ) if table_desc.name == self.name ] logger.info(f"finished getting tables in search of '{self.name}'") if self_descriptors: return self_descriptors[0].definition else: raise ValueError( f"Table {self.keyspace}.{self.name} not found.", ) def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInfo: """ Return information on the table. This should not be confused with the table definition (i.e. the schema). Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A TableInfo object for inspection. Example: >>> # Note: output reformatted for clarity. >>> my_table.info() TableInfo( database_info=AstraDBDatabaseInfo(id=..., name=..., ...), keyspace='default_keyspace', name='games', full_name='default_keyspace.games' ) """ return TableInfo( database_info=self.database.info( database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ), keyspace=self.keyspace, name=self.name, full_name=self.full_name, ) @property def database(self) -> Database: """ a Database object, the database this table belongs to. Example: >>> my_table.database.name 'the_db' """ return self._database @property def keyspace(self) -> str: """ The keyspace this table is in. Example: >>> my_table.keyspace 'default_keyspace' """ _keyspace = self.database.keyspace if _keyspace is None: raise ValueError("The table's DB is set with keyspace=None") return _keyspace @property def name(self) -> str: """ The name of this table. Example: >>> my_table.name 'games' """ return self._name @property def full_name(self) -> str: """ The fully-qualified table name within the database, in the form "keyspace.table_name". 
Example: >>> my_table.full_name 'default_keyspace.my_table' """ return f"{self.keyspace}.{self.name}" def _create_generic_index( self, i_name: str, ci_definition: dict[str, Any], ci_command: str, if_not_exists: bool | None, table_admin_timeout_ms: int | None, request_timeout_ms: int | None, timeout_ms: int | None, ) -> None: ci_options: dict[str, bool] if if_not_exists is not None: ci_options = {"ifNotExists": if_not_exists} else: ci_options = {} _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) ci_payload = { ci_command: { "name": i_name, "definition": ci_definition, "options": ci_options, } } logger.info(f"{ci_command}('{i_name}')") ci_response = self._api_commander.request( payload=ci_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if ci_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text=f"Faulty response from {ci_command} API command.", raw_response=ci_response, ) logger.info(f"finished {ci_command}('{i_name}')") def create_index( self, name: str, *, column: str, options: TableIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Create an index on a non-vector column of the table. This is a blocking operation: the method returns once the index is created and ready to use. For creation of a vector index, see method `create_vector_index` instead. Args: name: the name of the index. Index names must be unique across the keyspace. column: the table column on which the index is to be created. options: if passed, it must be an instance of `TableIndexOptions`, or an equivalent dictionary, which specifies index settings such as -- for a text column -- case-sensitivity and so on. See the `astrapy.info.TableIndexOptions` class for more details. if_not_exists: if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Examples: >>> from astrapy.info import TableIndexOptions >>> >>> # create an index on a column >>> my_table.create_index( ... "score_index", ... column="score", ... ) >>> >>> # create an index on a textual column, specifying indexing options >>> my_table.create_index( ... "winner_index", ... column="winner", ... options=TableIndexOptions( ... ascii=False, ... normalize=True, ... case_sensitive=False, ... ), ... 
) """ ci_definition: dict[str, Any] = TableIndexDefinition( column=column, options=TableIndexOptions.coerce(options or {}), ).as_dict() ci_command = "createIndex" return self._create_generic_index( i_name=name, ci_definition=ci_definition, ci_command=ci_command, if_not_exists=if_not_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) def create_vector_index( self, name: str, *, column: str, options: TableVectorIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Create a vector index on a vector column of the table, enabling vector similarity search operations on it. This is a blocking operation: the method returns once the index is created and ready to use. For creation of a non-vector index, see method `create_index` instead. Args: name: the name of the index. Index names must be unique across the keyspace. column: the table column, of type "vector" on which to create the index. options: an instance of `TableVectorIndexOptions`, or an equivalent dictionary, which specifies settings for the vector index, such as the metric to use or, if desired, a "source model" setting. If omitted, the Data API defaults will apply for the index. See the `astrapy.info.TableVectorIndexOptions` class for more details. if_not_exists: if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> from astrapy.constants import VectorMetric >>> from astrapy.info import TableVectorIndexOptions >>> >>> # create a vector index with dot-product similarity >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... options=TableVectorIndexOptions( ... metric=VectorMetric.DOT_PRODUCT, ... ), ... ) >>> # specify a source_model (since the previous statement >>> # succeeded, this will do nothing because of `if_not_exists`): >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... options=TableVectorIndexOptions( ... metric=VectorMetric.DOT_PRODUCT, ... source_model="nv-qa-4", ... ), ... if_not_exists=True, ... ) >>> # leave the settings to the Data API defaults of cosine >>> # similarity metric (since the previous statement >>> # succeeded, this will do nothing because of `if_not_exists`): >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... if_not_exists=True, ... 
) """ ci_definition: dict[str, Any] = TableVectorIndexDefinition( column=column, options=TableVectorIndexOptions.coerce(options), ).as_dict() ci_command = "createVectorIndex" return self._create_generic_index( i_name=name, ci_definition=ci_definition, ci_command=ci_command, if_not_exists=if_not_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) def list_index_names( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[str]: """ List the names of all indexes existing on this table. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of the index names as strings, in no particular order. Example: >>> my_table.list_index_names() ['m_vector_index', 'winner_index', 'score_index'] """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) li_payload: dict[str, Any] = {"listIndexes": {"options": {}}} logger.info("listIndexes") li_response = self._api_commander.request( payload=li_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "indexes" not in li_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listIndexes API command.", raw_response=li_response, ) else: logger.info("finished listIndexes") return li_response["status"]["indexes"] # type: ignore[no-any-return] def list_indexes( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[TableIndexDescriptor]: """ List the full definitions of all indexes existing on this table. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: a list of `astrapy.info.TableIndexDescriptor` objects in no particular order, each providing the details of an index present on the table. Example: >>> indexes = my_table.list_indexes() >>> indexes [TableIndexDescriptor(name='m_vector_index', definition=...)...] 
# Note: shortened >>> indexes[1].definition.column 'winner' >>> indexes[1].definition.options.case_sensitive False """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) li_payload: dict[str, Any] = {"listIndexes": {"options": {"explain": True}}} logger.info("listIndexes") li_response = self._api_commander.request( payload=li_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if "indexes" not in li_response.get("status", {}): raise UnexpectedDataAPIResponseException( text="Faulty response from listIndexes API command.", raw_response=li_response, ) else: logger.info("finished listIndexes") return [ TableIndexDescriptor.coerce(index_object) for index_object in li_response["status"]["indexes"] ] @overload def alter( self, operation: AlterTableOperation | dict[str, Any], *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> Table[DefaultRowType]: ... @overload def alter( self, operation: AlterTableOperation | dict[str, Any], *, row_type: type[NEW_ROW], table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> Table[NEW_ROW]: ... def alter( self, operation: AlterTableOperation | dict[str, Any], *, row_type: type[Any] = DefaultRowType, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> Table[NEW_ROW]: """ Executes one of the available alter-table operations on this table, such as adding/dropping columns. This is a blocking operation: the method returns once the table alteration has completed on the database. Args: operation: an instance of one of the `astrapy.info.AlterTable*` classes, representing which alter operation to perform and the details thereof. A regular dictionary can also be provided, but then it must have the alter operation name at its top level: {"add": {"columns": ...}}. row_type: this parameter acts as a formal specifier for the type checker. If omitted, the resulting Table is implicitly a `Table[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Examples: >>> from astrapy.info import ( ... AlterTableAddColumns, ... AlterTableAddVectorize, ... AlterTableDropColumns, ... AlterTableDropVectorize, ... ColumnType, ... TableScalarColumnTypeDescriptor, ... VectorServiceOptions, ... ) >>> >>> # Add a column >>> new_table_1 = my_table.alter( ... AlterTableAddColumns( ... columns={ ... "tie_break": TableScalarColumnTypeDescriptor( ... column_type=ColumnType.BOOLEAN, ... ), ... } ... ) ... ) >>> >>> # Drop a column >>> new_table_2 = new_table_1.alter(AlterTableDropColumns( ... columns=["tie_break"] ... )) >>> >>> # Add vectorize to a (vector) column >>> new_table_3 = new_table_2.alter( ... AlterTableAddVectorize( ... columns={ ... "m_vector": VectorServiceOptions( ... provider="openai", ... model_name="text-embedding-3-small", ... authentication={ ... "providerKey": "ASTRA_KMS_API_KEY_NAME", ...
}, ... ), ... } ... ) ... ) >>> >>> # Drop vectorize from a (vector) column >>> # (Also demonstrates type hint usage) >>> from typing import TypedDict >>> from astrapy import Table >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> class MyMatch(TypedDict): ... match_id: str ... round: int ... m_vector: DataAPIVector ... score: int ... when: DataAPITimestamp ... winner: str ... fighters: DataAPISet[UUID] ... >>> new_table_4: Table[MyMatch] = new_table_3.alter( ... AlterTableDropVectorize(columns=["m_vector"]), ... row_type=MyMatch, ... ) """ n_operation: AlterTableOperation if isinstance(operation, AlterTableOperation): n_operation = operation else: n_operation = AlterTableOperation.from_full_dict(operation) _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) at_operation_name = n_operation._name at_payload = { "alterTable": { "operation": { at_operation_name: n_operation.as_dict(), }, }, } logger.info(f"alterTable({at_operation_name})") at_response = self._api_commander.request( payload=at_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if at_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from alterTable API command.", raw_response=at_response, ) logger.info(f"finished alterTable({at_operation_name})") return Table( database=self.database, name=self.name, keyspace=self.keyspace, api_options=self.api_options, ) def insert_one( self, row: ROW, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInsertOneResult: """ Insert a single row in the table, with implied overwrite in case of primary key collision. Inserting a row whose primary key corresponds to an entry already stored in the table has the effect of an in-place update: the row is overwritten. However, if the row being inserted is partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. `None`, `{}` or analogous. Args: row: a dictionary expressing the row to insert. The primary key must be specified in full, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in the row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a TableInsertOneResult object, whose attributes are the primary key of the inserted row both in the form of a dictionary and of a tuple. Examples: >>> # a full-row insert using astrapy's datatypes >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = my_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ...
"m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... "score": 18, ... "when": DataAPITimestamp.from_string("2024-11-28T11:30:00Z"), ... "winner": "Victor", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... ) >>> insert_result.inserted_id {'match_id': 'mtch_0', 'round': 1} >>> insert_result.inserted_id_tuple ('mtch_0', 1) >>> >>> # a partial-row (which in this case overwrites some of the values) >>> my_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "winner": "Victor Vector", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID("0193539a-2880-8875-9f07-222222222222"), ... ]), ... }, ... ) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 1} ... >>> >>> # another insertion demonstrating standard-library datatypes in values >>> import datetime >>> >>> my_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 2, ... "winner": "Angela", ... "score": 25, ... "when": datetime.datetime( ... 2024, 7, 13, 12, 55, 30, 889, ... tzinfo=datetime.timezone.utc, ... ), ... "fighters": { ... UUID("019353cb-8e01-8276-a190-333333333333"), ... }, ... "m_vector": [0.4, -0.6, 0.2], ... }, ... ) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 2}, ... """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) io_payload = self._converter_agent.preprocess_payload( {"insertOne": {"document": row}} ) logger.info(f"insertOne on '{self.name}'") io_response = self._api_commander.request( payload=io_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished insertOne on '{self.name}'") if "insertedIds" in io_response.get("status", {}): if not io_response["status"]["insertedIds"]: raise UnexpectedDataAPIResponseException( text="Response from insertOne API command has empty 'insertedIds'.", raw_response=io_response, ) if not io_response["status"]["primaryKeySchema"]: raise UnexpectedDataAPIResponseException( text=( "Response from insertOne API command has " "empty 'primaryKeySchema'." 
), raw_response=io_response, ) inserted_id_list = io_response["status"]["insertedIds"][0] inserted_id_tuple, inserted_id = self._converter_agent.postprocess_key( inserted_id_list, primary_key_schema_dict=io_response["status"]["primaryKeySchema"], ) return TableInsertOneResult( raw_results=[io_response], inserted_id=inserted_id, inserted_id_tuple=inserted_id_tuple, ) else: raise UnexpectedDataAPIResponseException( text="Response from insertOne API command missing 'insertedIds'.", raw_response=io_response, ) def _prepare_keys_from_status( self, status: dict[str, Any] | None, raise_on_missing: bool = False ) -> tuple[list[dict[str, Any]], list[tuple[Any, ...]]]: ids: list[dict[str, Any]] id_tuples: list[tuple[Any, ...]] if status is None: if raise_on_missing: raise UnexpectedDataAPIResponseException( text="'status' not found in API response", raw_response=None, ) else: ids = [] id_tuples = [] else: if "primaryKeySchema" not in status: raise UnexpectedDataAPIResponseException( text=( "received a 'status' without 'primaryKeySchema' " f"in API response (received: {status})" ), raw_response=None, ) if "insertedIds" not in status: raise UnexpectedDataAPIResponseException( text=( "received a 'status' without 'insertedIds' " f"in API response (received: {status})" ), raw_response=None, ) primary_key_schema = status["primaryKeySchema"] id_tuples_and_ids = self._converter_agent.postprocess_keys( status["insertedIds"], primary_key_schema_dict=primary_key_schema, ) id_tuples = [tpl for tpl, _ in id_tuples_and_ids] ids = [id for _, id in id_tuples_and_ids] return ids, id_tuples def insert_many( self, rows: Iterable[ROW], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInsertManyResult: """ Insert a number of rows into the table, with implied overwrite in case of primary key collision. Inserting rows whose primary keys correspond to entries already stored in the table has the effect of an in-place update: the rows are overwritten. However, if the rows being inserted are partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. `None`, `{}` or analogous. Args: rows: an iterable of dictionaries, each expressing a row to insert. Each row must at least fully specify the primary key column values, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in each row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null. ordered: if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster. chunk_size: how many rows to include in each single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default. concurrency: maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions. general_method_timeout_ms: a timeout, in milliseconds, to impose on the whole operation, which may consist of several API requests. If not provided, this object's defaults apply.
request_timeout_ms: a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a TableInsertManyResult object, whose attributes are the primary key of the inserted rows both in the form of dictionaries and of tuples. Examples: >>> # Insert complete and partial rows at once (concurrently) >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = my_table.insert_many( ... [ ... { ... "match_id": "fight4", ... "round": 1, ... "winner": "Victor", ... "score": 18, ... "when": DataAPITimestamp.from_string( ... "2024-11-28T11:30:00Z", ... ), ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID('019353e3-00b4-83f9-a127-222222222222'), ... ]), ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... }, ... {"match_id": "fight5", "round": 1, "winner": "Adam"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio"}, ... { ... "match_id": "challenge6", ... "round": 1, ... "winner": "Donna", ... "m_vector": [0.9, -0.1, -0.3], ... }, ... {"match_id": "challenge6", "round": 2, "winner": "Erick"}, ... {"match_id": "challenge6", "round": 3, "winner": "Fiona"}, ... {"match_id": "tournamentA", "round": 1, "winner": "Gael"}, ... {"match_id": "tournamentA", "round": 2, "winner": "Hanna"}, ... { ... "match_id": "tournamentA", ... "round": 3, ... "winner": "Ian", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... {"match_id": "fight7", "round": 1, "winner": "Joy"}, ... {"match_id": "fight7", "round": 2, "winner": "Kevin"}, ... {"match_id": "fight7", "round": 3, "winner": "Lauretta"}, ... ], ... concurrency=10, ... chunk_size=3, ... ) >>> insert_result.inserted_ids [{'match_id': 'fight4', 'round': 1}, {'match_id': 'fight5', ... >>> insert_result.inserted_id_tuples [('fight4', 1), ('fight5', 1), ('fight5', 2), ('fight5', 3), ... >>> >>> # Ordered insertion >>> # (would stop on first failure; predictable end result on DB) >>> my_table.insert_many( ... [ ... {"match_id": "fight5", "round": 1, "winner": "Adam0"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta0"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio0"}, ... {"match_id": "fight5", "round": 1, "winner": "Adam Zuul"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta Vigo"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"}, ... ], ... ordered=True, ... ) TableInsertManyResult(inserted_ids=[{'match_id': 'fight5', 'round': 1}, ... Note: Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the row sequence is important. Note: If some of the rows are unsuitable for insertion, for instance have the wrong data type for a column or lack the primary key, the Data API validation check will fail for those specific requests that contain the faulty rows. Depending on concurrency and the value of the `ordered` parameter, a number of rows in general could have been successfully inserted. It is possible to capture such a scenario, and inspect which rows actually got inserted, by catching an error of type `astrapy.exceptions.TableInsertManyException`: its `partial_result` attribute is precisely a `TableInsertManyResult`, encoding details on the successful writes. 
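A minimal sketch of such error handling follows (here `rows_to_insert` is an illustrative name for a list of rows prepared elsewhere; `partial_result` and its `inserted_ids` are as described above): >>> from astrapy.exceptions import TableInsertManyException >>> try: ... my_table.insert_many(rows_to_insert) ... except TableInsertManyException as err: ... # rows written before the failure are reported here: ... print(err.partial_result.inserted_ids) ...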
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) if concurrency is None: if ordered: _concurrency = 1 else: _concurrency = DEFAULT_INSERT_MANY_CONCURRENCY else: _concurrency = concurrency if _concurrency > 1 and ordered: raise ValueError("Cannot run ordered insert_many concurrently.") if chunk_size is None: _chunk_size = DEFAULT_INSERT_MANY_CHUNK_SIZE else: _chunk_size = chunk_size _rows = list(rows) logger.info(f"inserting {len(_rows)} rows in '{self.name}'") raw_results: list[dict[str, Any]] = [] timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) if ordered: options = {"ordered": True} inserted_ids: list[Any] = [] inserted_id_tuples: list[Any] = [] for i in range(0, len(_rows), _chunk_size): im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": _rows[i : i + _chunk_size], "options": options, }, }, ) logger.info(f"insertMany on '{self.name}'") chunk_response = self._api_commander.request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany on '{self.name}'") # accumulate the results in this call chunk_inserted_ids, chunk_inserted_ids_tuples = ( self._prepare_keys_from_status(chunk_response.get("status")) ) inserted_ids += chunk_inserted_ids inserted_id_tuples += chunk_inserted_ids_tuples raw_results += [chunk_response] # if errors, quit early if chunk_response.get("errors", []): partial_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) raise TableInsertManyException.from_response( command=None, raw_response=chunk_response, partial_result=partial_result, ) # return full_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) logger.info(f"finished inserting {len(_rows)} rows in '{self.name}'") return full_result else: # unordered: concurrent or not, do all of them and parse the results options = {"ordered": False} if _concurrency > 1: with ThreadPoolExecutor(max_workers=_concurrency) as executor: def _chunk_insertor( row_chunk: list[dict[str, Any]], ) -> dict[str, Any]: im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": row_chunk, "options": options, }, }, ) logger.info(f"insertMany(chunk) on '{self.name}'") im_response = self._api_commander.request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") return im_response raw_results = list( executor.map( _chunk_insertor, ( _rows[i : i + _chunk_size] for i in range(0, len(_rows), _chunk_size) ), ) ) else: for i in range(0, len(_rows), _chunk_size): im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": _rows[i : i + _chunk_size], "options": options, }, }, ) logger.info(f"insertMany(chunk) on '{self.name}'") im_response = self._api_commander.request( 
payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") raw_results.append(im_response) # recast raw_results. Each response has its schema: unfold appropriately ids_and_tuples_per_chunk = [ self._prepare_keys_from_status(chunk_response.get("status")) for chunk_response in raw_results ] inserted_ids = [ inserted_id for chunk_ids, _ in ids_and_tuples_per_chunk for inserted_id in chunk_ids ] inserted_id_tuples = [ inserted_id_tuple for _, chunk_id_tuples in ids_and_tuples_per_chunk for inserted_id_tuple in chunk_id_tuples ] # check-raise if any( [chunk_response.get("errors", []) for chunk_response in raw_results] ): partial_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) raise TableInsertManyException.from_responses( commands=[None for _ in raw_results], raw_responses=raw_results, partial_result=partial_result, ) # return full_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) logger.info(f"finished inserting {len(_rows)} rows in '{self.name}'") return full_result @overload def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableFindCursor[ROW, ROW]: ... @overload def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: type[ROW2], skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableFindCursor[ROW, ROW2]: ... def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: type[ROW2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableFindCursor[ROW, ROW2]: """ Find rows on the table matching the provided filters and according to sorting criteria including vector similarity. The returned TableFindCursor object, representing the stream of results, can be iterated over, or consumed and manipulated in several other ways (see the examples below and the `TableFindCursor` documentation for details). Since the amount of returned items can be large, TableFindCursor is a lazy object, that fetches new data while it is being read using the Data API pagination mechanism. Invoking `.to_list()` on a TableFindCursor will cause it to consume all rows and materialize the entire result set as a list. This is not recommended if the amount of results is very large. Args: filter: a dictionary expressing which condition the returned rows must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. 
Simple examples are `{}` (zero filter, not recommended for large tables), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`), or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. projection: a prescription on which columns to return for the matching rows. The projection can take the form `{"column1": True, "column2": True}` or `{"*": True}` (i.e. return the whole row), or the complementary form that excludes columns: `{"column1": False, "column2": False}`. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. row_type: this parameter acts as a formal specifier for the type checker. If omitted, the resulting cursor is implicitly a `TableFindCursor[ROW, ROW]`, i.e. maintains the same type for the items it returns as that for the rows in the table. Strictly typed code may want to specify this parameter especially when a projection is given. skip: if provided, this many rows, which would otherwise be returned first in the response, are instead skipped. limit: a maximum number of rows to get from the table. The returned cursor will stop yielding rows when either this number is reached or there really are no more matches in the table. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned row. It can be used meaningfully only in a vector search (see `sort`). include_sort_vector: a boolean to request the search query vector. If set to True (and if the search is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search. sort: this dictionary parameter controls the order in which the rows are returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form `{"vector_column": qv}`, with the query vector `qv` of the appropriate type (list of floats or DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form `{"vectorize_enabled_column": "query text"}` is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are `{"score": SortMode.ASCENDING}` (equivalently `{"score": +1}`), `{"score": +1, "when": -1}`. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications and limitations on the number of items returned. Consult the Data API documentation for more details on this topic. request_timeout_ms: a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply. timeout_ms: an alias for `request_timeout_ms`. Returns: a TableFindCursor object, which can be iterated over (and manipulated in several ways), and which, if needed, handles pagination under the hood as the rows are consumed.
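For instance, a sketch of memory-friendly consumption, preferable to `.to_list()` for large result sets (`handle_row` stands for any hypothetical per-row processing): >>> for row in my_table.find({"match_id": "challenge6"}, projection={"winner": True}): ... handle_row(row) ...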
Note: As the rows are retrieved in chunks progressively, while the cursor is being iterated over, it is possible that the actual results obtained will reflect changes occurring to the table contents in real time. Examples: >>> # Iterate over results: >>> for row in my_table.find({"match_id": "challenge6"}): ... print(f"(R:{row['round']}): winner {row['winner']}") ... (R:1): winner Donna (R:2): winner Erick (R:3): winner Fiona >>> # Optimize bandwidth using a projection: >>> proj = {"round": True, "winner": True} >>> for row in my_table.find({"match_id": "challenge6"}, projection=proj): ... print(f"(R:{row['round']}): winner {row['winner']}") ... (R:1): winner Donna (R:2): winner Erick (R:3): winner Fiona >>> # Filter on the partitioning: >>> my_table.find({"match_id": "challenge6"}).to_list() [{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on primary key: >>> my_table.find({"match_id": "challenge6", "round": 1}).to_list() [{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> my_table.find({"winner": "Caio Gozer"}).to_list() [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> my_table.find({"score": {"$gte": 15}}).to_list() [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... ).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter (not recommended performance-wise): >>> my_table.find({}).to_list() The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... ).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> my_table.find({"round": 3, "winner": "Caio Gozer"}).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> my_table.find( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... limit=3, ... ).to_list() [{'winner': 'Donna'}, {'winner': 'Victor'}] >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> my_table.find( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... limit=3, ... ).to_list() [{'winner': 'Victor'}] >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> my_table.find( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... limit=3, ... include_similarity=True, ... ).to_list() [{'winner': 'Donna', '$similarity': 0.515}, {'winner': 'Victor', ... 
>>> >>> # Non-vector sorting on a 'partitionSort' column: >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... ).to_list() [{'winner': 'Caio Gozer'}, {'winner': 'Betta Vigo'}, ... >>> >>> # Using `skip` and `limit`: >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... skip=1, ... limit=2, ... ).to_list() The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... [{'winner': 'Betta Vigo'}, {'winner': 'Adam Zuul'}] >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... ).to_list() The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... [{'winner': 'Adam Zuul'}, {'winner': 'Betta Vigo'}, ... >>> >>> # Using `.map()` on a cursor: >>> winner_cursor = my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... limit=5, ... ) >>> print("/".join(winner_cursor.map(lambda row: row["winner"].upper()))) CAIO GOZER/BETTA VIGO/ADAM ZUUL >>> >>> # Some other examples of cursor manipulation >>> matches_cursor = my_table.find( ... sort={"m_vector": DataAPIVector([-0.1, 0.15, 0.3])} ... ) >>> matches_cursor.has_next() True >>> next(matches_cursor) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> matches_cursor.consumed 1 >>> matches_cursor.rewind() >>> matches_cursor.consumed 0 >>> matches_cursor.has_next() True >>> matches_cursor.close() >>> try: ... next(matches_cursor) ... except StopIteration: ... print("StopIteration triggered.") ... StopIteration triggered. """ # lazy-import here to avoid circular import issues from astrapy.cursors import TableFindCursor _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) return ( TableFindCursor( table=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=None, request_timeout_label=_rt_label, ) .filter(filter) .project(projection) .skip(skip) .limit(limit) .sort(sort) .include_similarity(include_similarity) .include_sort_vector(include_sort_vector) ) def find_one( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> ROW | None: """ Run a search according to the given filtering and sorting criteria and return the top row matching it, or nothing if there are none. The parameters are analogous to some of the parameters to the `find` method (which has a few more that do not make sense in this case, such as `limit`). Args: filter: a dictionary expressing which condition the returned row must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are `{}` (zero filter), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`, or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. 
projection: a prescription on which columns to return for the matching row. The projection can take the form `{"column1": True, "column2": True}` or `{"*": True}` (i.e. return the whole row), or the complementary form that excludes columns: `{"column1": False, "column2": False}`. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned row. It can be used meaningfully only in a vector search (see `sort`). sort: this dictionary parameter controls the sorting order, hence determines which row is being returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form `{"vector_column": qv}`, with the query vector `qv` of the appropriate type (list of floats or DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form `{"vectorize_enabled_column": "query text"}` is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are `{"score": SortMode.ASCENDING}` (equivalently `{"score": +1}`), `{"score": +1, "when": -1}`. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications. Consult the Data API documentation for more details on this topic. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary expressing the result if a row is found, otherwise None. Examples: >>> from astrapy.constants import SortMode >>> from astrapy.data_types import DataAPITimestamp, DataAPIVector >>> >>> # Filter on the partitioning: >>> my_table.find_one({"match_id": "challenge6"}) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # A find with no matches: >>> str(my_table.find_one({"match_id": "not_real"})) 'None' >>> >>> # Optimize bandwidth using a projection: >>> my_table.find_one( ... {"match_id": "challenge6"}, ... projection={"round": True, "winner": True}, ... ) {'round': 1, 'winner': 'Donna'} >>> >>> # Filter on primary key: >>> my_table.find_one({"match_id": "challenge6", "round": 1}) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> my_table.find_one({"winner": "Caio Gozer"}) {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> my_table.find_one({"score": {"$gte": 15}}) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ...
) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter: >>> my_table.find_one({}) The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... ) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> my_table.find_one({"round": 3, "winner": "Caio Gozer"}) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> my_table.find_one( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... ) {'winner': 'Donna'} >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> my_table.find_one( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... ) {'winner': 'Victor'} >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> my_table.find_one( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... include_similarity=True, ... ) {'winner': 'Donna', '$similarity': 0.515} >>> >>> # Non-vector sorting on a 'partitionSort' column: >>> my_table.find_one( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... ) {'winner': 'Caio Gozer'} >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... ) The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... 
{'winner': 'Adam Zuul'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) fo_options = ( None if include_similarity is None else {"includeSimilarity": include_similarity} ) fo_payload = self._converter_agent.preprocess_payload( { "findOne": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "options": fo_options, "sort": sort, }.items() if v is not None } } ) fo_response = self._api_commander.request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) if "document" not in (fo_response.get("data") or {}): raise UnexpectedDataAPIResponseException( text="Response from findOne API command missing 'document'.", raw_response=fo_response, ) if "projectionSchema" not in (fo_response.get("status") or {}): raise UnexpectedDataAPIResponseException( text="Response from findOne API command missing 'projectionSchema'.", raw_response=fo_response, ) doc_response = fo_response["data"]["document"] if doc_response is None: return None return self._converter_agent.postprocess_row( fo_response["data"]["document"], columns_dict=fo_response["status"]["projectionSchema"], similarity_pseudocolumn="$similarity" if include_similarity else None, ) def distinct( self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[Any]: """ Return a list of the unique values of `key` across the rows in the table that match the provided filter. Args: key: the name of the field whose value is inspected across rows. Keys are typically just column names, although they can use the dot notation to select particular entries in map columns. For set and list columns, individual entries are "unrolled" automatically; in particular, for lists, numeric indices can be used in the key dot-notation syntax. Example of acceptable `key` values: "a_column" "map_column.map_key" "list_column.2" filter: a dictionary expressing which condition the inspected rows must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are `{}` (zero filter), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`, or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method, being based on `find` (see) may entail successive HTTP API requests, depending on the amount of involved rows. If not provided, this object's defaults apply. request_timeout_ms: a timeout, in milliseconds, for each API request. If not provided, this object's defaults apply. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a list of all different values for `key` found across the rows that match the filter. The result list has no repeated items. 
Examples: >>> my_table.distinct("winner", filter={"match_id": "challenge6"}) ['Donna', 'Erick', 'Fiona'] >>> >>> # distinct values across the whole table: >>> # (not recommended performance-wise) >>> my_table.distinct("winner") The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... ['Victor', 'Adam Zuul', 'Betta Vigo', 'Caio Gozer', 'Donna', 'Erick', ... >>> >>> # Over a column containing null values >>> # (also with composite filter): >>> my_table.distinct( ... "score", ... filter={"match_id": {"$in": ["fight4", "tournamentA"]}}, ... ) [18, None] >>> >>> # distinct over a set column (automatically "unrolled"): >>> my_table.distinct( ... "fighters", ... filter={"match_id": {"$in": ["fight4", "tournamentA"]}}, ... ) [UUID('0193539a-2770-8c09-a32a-111111111111'), UUID('019353e3-00b4-... Note: It must be kept in mind that `distinct` is a client-side operation, which effectively browses all required rows using the logic of the `find` method and collects the unique values found for `key`. As such, there may be performance, latency and ultimately billing implications if the amount of matching rows is large. Note: For details on the behaviour of "distinct" in conjunction with real-time changes in the table contents, see the Note of the `find` command. """ # lazy-import here to avoid circular import issues from astrapy.cursors import TableFindCursor _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) # preparing cursor: _extractor = _create_document_key_extractor(key) _key = _reduce_distinct_key_to_shallow_safe(key) if _key == "": raise ValueError( "The 'key' parameter for distinct cannot be empty " "or start with a list index." ) # relaxing the type hint (limited to within this method body) f_cursor: TableFindCursor[dict[str, Any], dict[str, Any]] = ( TableFindCursor( table=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=_general_method_timeout_ms, request_timeout_label=_rt_label, overall_timeout_label=_gmt_label, ) # type: ignore[assignment] .filter(filter) .project({_key: True}) ) # consuming it: _item_hashes = set() distinct_items: list[Any] = [] logger.info(f"running distinct() on '{self.name}'") for document in f_cursor: for item in _extractor(document): _item_hash = _hash_document( item, options=self.api_options.serdes_options ) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) logger.info(f"finished running distinct() on '{self.name}'") return distinct_items def count_documents( self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Count the rows in the table matching the specified filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"name": "John", "age": 59} {"$and": [{"name": {"$eq": "John"}}, {"age": {"$gt": 58}}]} See the Data API documentation for the full set of operators. upper_bound: a required ceiling on the result of the count operation. If the actual number of rows exceeds this value, an exception will be raised.
Furthermore, if the actual number of rows exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: the exact count of matching rows. Examples: >>> my_table.insert_many([{"seq": i} for i in range(20)]) TableInsertManyResult(...) >>> my_table.count_documents({}, upper_bound=100) 20 >>> my_table.count_documents({"seq":{"$gt": 15}}, upper_bound=100) 4 >>> my_table.count_documents({}, upper_bound=10) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyRowsToCountException Note: Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller expectations. Moreover, indiscriminate usage of count operations for sizeable amounts of rows (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the amount of rows it will count, and that an exception will be thrown by this method if this limit is encountered. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) cd_payload = {"countDocuments": {"filter": filter}} logger.info(f"countDocuments on '{self.name}'") cd_response = self._api_commander.request( payload=cd_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished countDocuments on '{self.name}'") if "count" in cd_response.get("status", {}): count: int = cd_response["status"]["count"] if cd_response["status"].get("moreData", False): raise TooManyRowsToCountException( text=f"Document count exceeds {count}, the maximum allowed by the server", server_max_count_exceeded=True, ) else: if count > upper_bound: raise TooManyRowsToCountException( text="Document count exceeds required upper bound", server_max_count_exceeded=False, ) else: return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from countDocuments API command.", raw_response=cd_response, ) def estimated_document_count( self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Query the API server for an estimate of the document count in the table. Contrary to `count_documents`, this method has no filtering parameters. Args: general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a server-provided estimate count of the documents in the table. 
Example: >>> my_table.estimated_document_count() 5820 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) ed_payload: dict[str, Any] = {"estimatedDocumentCount": {}} logger.info(f"estimatedDocumentCount on '{self.name}'") ed_response = self._api_commander.request( payload=ed_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished estimatedDocumentCount on '{self.name}'") if "count" in ed_response.get("status", {}): count: int = ed_response["status"]["count"] return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from estimatedDocumentCount API command.", raw_response=ed_response, ) def update_one( self, filter: FilterType, update: dict[str, Any], *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Update a single row on the table, changing some or all of the columns, with the implicit behaviour of inserting a new row if no match is found. Args: filter: a predicate expressing the table primary key in full, i.e. a dictionary defining values for all columns that form the primary key. An example may be `{"match_id": "fight4", "round": 1}`. update: the update prescription to apply to the row, expressed as a dictionary conforming to the Data API syntax. The update operators for tables are `$set` and `$unset` (in particular, setting a column to None has the same effect as the $unset operator). Examples are `{"$set": {"round": 12}}` and `{"$unset": {"winner": "", "score": ""}}`. Note that the update operation cannot alter the primary key columns. See the Data API documentation for more details. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Examples: >>> from astrapy.data_types import DataAPISet >>> >>> # Set a new value for a column >>> my_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"winner": "Winona"}}, ... ) >>> >>> # Set a new value for a column while unsetting another column >>> my_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"winner": None, "score": 24}}, ... ) >>> >>> # Set a 'set' column to empty >>> my_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": DataAPISet()}}, ... ) >>> >>> # Set a 'set' column to empty using None >>> my_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": None}}, ... ) >>> >>> # Set a 'set' column to empty using a regular (empty) set >>> my_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$set": {"fighters": set()}}, ... ) >>> >>> # Set a 'set' column to empty using $unset >>> my_table.update_one( ... {"match_id": "fight4", "round": 1}, ... update={"$unset": {"fighters": None}}, ... ) >>> >>> # A non-existing primary key creates a new row >>> my_table.update_one( ... {"match_id": "bar_fight", "round": 4}, ... update={"$set": {"score": 8, "winner": "Jack"}}, ...
) >>> >>> # Delete column values for a row (they'll read as None now) >>> my_table.update_one( ... {"match_id": "challenge6", "round": 2}, ... update={"$unset": {"winner": None, "score": None}}, ... ) Note: a row created entirely with update operations (as opposed to insertions) may, correspondingly, be deleted by means of an $unset update on all columns. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) uo_payload = { "updateOne": { k: v for k, v in { "filter": filter, "update": self._converter_agent.preprocess_payload(update), }.items() if v is not None } } logger.info(f"updateOne on '{self.name}'") uo_response = self._api_commander.request( payload=uo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished updateOne on '{self.name}'") if "status" in uo_response: # the contents are disregarded and the method just returns: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from updateOne API command.", raw_response=uo_response, ) def delete_one( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Delete a row, matching the provided value of the primary key. If no row is found with that primary key, the method does nothing. Args: filter: a predicate expressing the table primary key in full, i.e. a dictionary defining values for all columns that form the primary key. A row (at most one) is deleted if it matches that primary key. An example filter may be `{"match_id": "fight4", "round": 1}`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. 
Examples: >>> # Count the rows matching a certain filter >>> len(my_table.find({"match_id": "fight7"}).to_list()) 3 >>> >>> # Delete a row belonging to the group >>> my_table.delete_one({"match_id": "fight7", "round": 2}) >>> >>> # Count again >>> len(my_table.find({"match_id": "fight7"}).to_list()) 2 >>> >>> # Attempt the delete again (nothing to delete) >>> my_table.delete_one({"match_id": "fight7", "round": 2}) >>> >>> # The count is unchanged >>> len(my_table.find({"match_id": "fight7"}).to_list()) 2 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) do_payload = self._converter_agent.preprocess_payload( { "deleteOne": { k: v for k, v in { "filter": filter, }.items() if v is not None } } ) logger.info(f"deleteOne on '{self.name}'") do_response = self._api_commander.request( payload=do_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteOne on '{self.name}'") if do_response.get("status", {}).get("deletedCount") == -1: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteOne API command.", raw_response=do_response, ) def delete_many( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Delete all rows matching a provided filter condition. This operation can target from a single row to the entirety of the table. Args: filter: a filter dictionary to specify which row(s) must be deleted. 1. If the filter is in the form `{"pk1": val1, "pk2": val2 ...}` and specifies the primary key in full, at most one row is deleted, the one with that primary key. 2. If the table has "partitionSort" columns, some or all of them may be left out (the least significant of them can also employ an inequality, or range, predicate): a range of rows, but always within a single partition, will be deleted. 3. If an empty filter, `{}`, is passed, this operation empties the table completely. *USE WITH CARE*. 4. Other kinds of filtering clauses are forbidden. In the following examples, the table is partitioned by columns ["pa1", "pa2"] and has partitionSort "ps1" and "ps2" in that order. Valid filter examples: - `{"pa1": x, "pa2": y, "ps1": z, "ps2": t}`: deletes one row - `{"pa1": x, "pa2": y, "ps1": z}`: deletes multiple rows - `{"pa1": x, "pa2": y, "ps1": z, "ps2": {"$lt": q}}`: deletes multiple rows - `{"pa1": x, "pa2": y}`: deletes all rows in the partition - `{}`: empties the table (*CAUTION*) Invalid filter examples: - `{"pa1": x}`: incomplete partition key - `{"pa1": x, "ps1": z}`: incomplete partition key (whatever is added) - `{"pa1": x, "pa2": y, "ps1": {"$lt": r}, "ps2": t}`: inequality provided on a non-least-significant partitionSort column. - `{"pa1": x, "pa2": y, "ps2": t}`: cannot skip "ps1" general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`.
Examples: >>> # Delete a single row (full primary key specified): >>> my_table.delete_many({"match_id": "fight4", "round": 1}) >>> >>> # Delete part of a partition (inequality on the >>> # last-mentioned 'partitionSort' column): >>> my_table.delete_many({"match_id": "fight5", "round": {"$gte": 5}}) >>> >>> # Delete a whole partition (leave 'partitionSort' unspecified): >>> my_table.delete_many({"match_id": "fight7"}) >>> >>> # empty the table entirely with empty filter (*CAUTION*): >>> my_table.delete_many({}) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) dm_payload = self._converter_agent.preprocess_payload( { "deleteMany": { k: v for k, v in { "filter": filter, }.items() if v is not None } } ) logger.info(f"deleteMany on '{self.name}'") dm_response = self._api_commander.request( payload=dm_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteMany on '{self.name}'") if dm_response.get("status", {}).get("deletedCount") == -1: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteMany API command.", raw_response=dm_response, ) def drop( self, *, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop the table, i.e. delete it from the database along with all the rows stored therein. Args: if_exists: if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> # List tables: >>> my_table.database.list_table_names() ['games'] >>> >>> # Drop this table: >>> my_table.drop() >>> >>> # List tables again: >>> my_table.database.list_table_names() [] >>> >>> # Try working on the table now: >>> from astrapy.exceptions import DataAPIResponseException >>> try: ... my_table.find_one({}) ... except DataAPIResponseException as err: ... print(str(err)) ... Collection does not exist [...] (COLLECTION_NOT_EXIST) Note: Use with caution. Note: Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense as the underlying actual table is no more. It is responsibility of the developer to design a correct flow which avoids using a deceased collection any further. """ logger.info(f"dropping table '{self.name}' (self)") self.database.drop_table( self.name, if_exists=if_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished dropping table '{self.name}' (self)") def command( self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this table with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload. 
Args: body: a JSON-serializable dictionary, the payload of the request. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> my_table.command({ ... "findOne": { ... "filter": {"match_id": "fight4"}, ... "projection": {"winner": True}, ... } ... }) {'data': {'document': {'winner': 'Victor'}}, 'status': ... # shortened """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _cmd_desc: str if body: _cmd_desc = ",".join(sorted(body.keys())) else: _cmd_desc = "(none)" logger.info(f"command={_cmd_desc} on '{self.name}'") command_result = self._api_commander.request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on '{self.name}'") return command_result
Ancestors
- typing.Generic
Instance variables
var database : Database
-
a Database object, the database this table belongs to.
Example
>>> my_table.database.name 'the_db'
Expand source code
@property def database(self) -> Database: """ a Database object, the database this table belongs to. Example: >>> my_table.database.name 'the_db' """ return self._database
var full_name : str
-
The fully-qualified table name within the database, in the form "keyspace.table_name".
Example
>>> my_table.full_name 'default_keyspace.my_table'
Expand source code
@property def full_name(self) -> str: """ The fully-qualified table name within the database, in the form "keyspace.table_name". Example: >>> my_table.full_name 'default_keyspace.my_table' """ return f"{self.keyspace}.{self.name}"
var keyspace : str
-
The keyspace this table is in.
Example
>>> my_table.keyspace 'default_keyspace'
Expand source code
@property def keyspace(self) -> str: """ The keyspace this table is in. Example: >>> my_table.keyspace 'default_keyspace' """ _keyspace = self.database.keyspace if _keyspace is None: raise ValueError("The table's DB is set with keyspace=None") return _keyspace
var name : str
-
The name of this table.
Example
>>> my_table.name 'games'
Expand source code
@property def name(self) -> str: """ The name of this table. Example: >>> my_table.name 'games' """ return self._name
Methods
def alter(self, operation: AlterTableOperation | dict[str, Any], *, row_type: type[Any] = dict[str, typing.Any], table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> Table[~NEW_ROW]
-
Executes one of the available alter-table operations on this table, such as adding/dropping columns.
This is a blocking operation: the method returns once the table has been altered and is ready to use.
Args
operation
- an instance of one of the
astrapy.info.AlterTable*
classes, representing which alter operation to perform and the details thereof. A regular dictionary can also be provided, but then it must have the alter operation name at its top level: {"add": {"columns": …}}. row_type
- this parameter acts as a formal specifier for the type checker.
If omitted, the resulting Table is implicitly a
Table[dict[str, Any]]
. If provided, it must match the type hint specified in the assignment. See the examples below. table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Examples
>>> from astrapy.info import ( ... AlterTableAddColumns, ... AlterTableAddVectorize, ... AlterTableDropColumns, ... AlterTableDropVectorize, ... ColumnType, ... TableScalarColumnTypeDescriptor, ... VectorServiceOptions, ... ) >>> >>> # Add a column >>> new_table_1 = my_table.alter( ... AlterTableAddColumns( ... columns={ ... "tie_break": TableScalarColumnTypeDescriptor( ... column_type=ColumnType.BOOLEAN, ... ), ... } ... ) ... ) >>> >>> # Drop a column >>> new_table_2 = new_table_1.alter(AlterTableDropColumns( ... columns=["tie_break"] ... )) >>> >>> # Add vectorize to a (vector) column >>> new_table_3 = new_table_2.alter( ... AlterTableAddVectorize( ... columns={ ... "m_vector": VectorServiceOptions( ... provider="openai", ... model_name="text-embedding-3-small", ... authentication={ ... "providerKey": "ASTRA_KMS_API_KEY_NAME", ... }, ... ), ... } ... ) ... ) >>> >>> # Drop vectorize from a (vector) column >>> # (Also demonstrates type hint usage) >>> from typing import TypedDict >>> from astrapy import Table >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> class MyMatch(TypedDict): ... match_id: str ... round: int ... m_vector: DataAPIVector ... score: int ... when: DataAPITimestamp ... winner: str ... fighters: DataAPISet[UUID] ... >>> new_table_4: Table[MyMatch] = new_table_3.alter( ... AlterTableDropVectorize(columns=["m_vector"]), ... row_type=MyMatch, ... )
Expand source code
def alter( self, operation: AlterTableOperation | dict[str, Any], *, row_type: type[Any] = DefaultRowType, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> Table[NEW_ROW]: """ Executes one of the available alter-table operations on this table, such as adding/dropping columns. This is a blocking operation: the method returns once the index is created and ready to use. Args: operation: an instance of one of the `astrapy.info.AlterTable*` classes, representing which alter operation to perform and the details thereof. A regular dictionary can also be provided, but then it must have the alter operation name at its top level: {"add": {"columns": ...}}. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting Table is implicitly a `Table[dict[str, Any]]`. If provided, it must match the type hint specified in the assignment. See the examples below. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Examples: >>> from astrapy.info import ( ... AlterTableAddColumns, ... AlterTableAddVectorize, ... AlterTableDropColumns, ... AlterTableDropVectorize, ... ColumnType, ... TableScalarColumnTypeDescriptor, ... VectorServiceOptions, ... ) >>> >>> # Add a column >>> new_table_1 = my_table.alter( ... AlterTableAddColumns( ... columns={ ... "tie_break": TableScalarColumnTypeDescriptor( ... column_type=ColumnType.BOOLEAN, ... ), ... } ... ) ... ) >>> >>> # Drop a column >>> new_table_2 = new_table_1.alter(AlterTableDropColumns( ... columns=["tie_break"] ... )) >>> >>> # Add vectorize to a (vector) column >>> new_table_3 = new_table_2.alter( ... AlterTableAddVectorize( ... columns={ ... "m_vector": VectorServiceOptions( ... provider="openai", ... model_name="text-embedding-3-small", ... authentication={ ... "providerKey": "ASTRA_KMS_API_KEY_NAME", ... }, ... ), ... } ... ) ... ) >>> >>> # Drop vectorize from a (vector) column >>> # (Also demonstrates type hint usage) >>> from typing import TypedDict >>> from astrapy import Table >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> class MyMatch(TypedDict): ... match_id: str ... round: int ... m_vector: DataAPIVector ... score: int ... when: DataAPITimestamp ... winner: str ... fighters: DataAPISet[UUID] ... >>> new_table_4: Table[MyMatch] = new_table_3.alter( ... AlterTableDropVectorize(columns=["m_vector"]), ... row_type=MyMatch, ... 
) """ n_operation: AlterTableOperation if isinstance(operation, AlterTableOperation): n_operation = operation else: n_operation = AlterTableOperation.from_full_dict(operation) _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) at_operation_name = n_operation._name at_payload = { "alterTable": { "operation": { at_operation_name: n_operation.as_dict(), }, }, } logger.info(f"alterTable({at_operation_name})") at_response = self._api_commander.request( payload=at_payload, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label ), ) if at_response.get("status") != {"ok": 1}: raise UnexpectedDataAPIResponseException( text="Faulty response from alterTable API command.", raw_response=at_response, ) logger.info(f"finished alterTable({at_operation_name})") return Table( database=self.database, name=self.name, keyspace=self.keyspace, api_options=self.api_options, )
def command(self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> dict[str, typing.Any]
-
Send a POST request to the Data API for this table with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload.
Args
body
- a JSON-serializable dictionary, the payload of the request.
raise_api_errors
- if True, responses with a nonempty 'errors' field result in an astrapy exception being raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
a dictionary with the response of the HTTP request.
Example
>>> my_table.command({ ... "findOne": { ... "filter": {"match_id": "fight4"}, ... "projection": {"winner": True}, ... } ... }) {'data': {'document': {'winner': 'Victor'}}, 'status': ... # shortened
Expand source code
def command( self, body: dict[str, Any] | None, *, raise_api_errors: bool = True, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> dict[str, Any]: """ Send a POST request to the Data API for this table with an arbitrary, caller-provided payload. No transformations or type conversions are made on the provided payload. Args: body: a JSON-serializable dictionary, the payload of the request. raise_api_errors: if True, responses with a nonempty 'errors' field result in an astrapy exception being raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary with the response of the HTTP request. Example: >>> my_table.command({ ... "findOne": { ... "filter": {"match_id": "fight4"}, ... "projection": {"winner": True}, ... } ... }) {'data': {'document': {'winner': 'Victor'}}, 'status': ... # shortened """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) _cmd_desc: str if body: _cmd_desc = ",".join(sorted(body.keys())) else: _cmd_desc = "(none)" logger.info(f"command={_cmd_desc} on '{self.name}'") command_result = self._api_commander.request( payload=body, raise_api_errors=raise_api_errors, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished command={_cmd_desc} on '{self.name}'") return command_result
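Since the method returns the raw response dictionary, API errors can also be inspected by hand by turning off raise_api_errors. A small sketch along these lines (the payload reuses the sample table of these examples):

# Issue a raw findOne, keeping API errors in the response instead of
# letting astrapy raise an exception for them:
raw_response = my_table.command(
    {"findOne": {"filter": {"match_id": "fight4"}}},
    raise_api_errors=False,
)
if raw_response.get("errors"):
    # a nonempty 'errors' field signals an API-side problem
    print("Data API errors:", raw_response["errors"])
else:
    print("row:", raw_response.get("data", {}).get("document"))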
def count_documents(self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> int
-
Count the rows in the table matching the specified filter.
Args
filter
- a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"name": "John", "age": 59} {"$and": [{"name": {"$eq": "John"}}, {"age": {"$gt": 58}}]} See the Data API documentation for the full set of operators.
upper_bound
- a required ceiling on the result of the count operation. If the actual number of rows exceeds this value, an exception will be raised. Furthermore, if the actual number of rows exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
the exact count of matching rows.
Examples
>>> my_table.insert_many([{"seq": i} for i in range(20)]) TableInsertManyResult(...) >>> my_table.count_documents({}, upper_bound=100) 20 >>> my_table.count_documents({"seq":{"$gt": 15}}, upper_bound=100) 4 >>> my_table.count_documents({}, upper_bound=10) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyRowsToCountException
Note
Count operations are expensive: for this reason, the best practice is to provide a reasonable
upper_bound
according to the caller's expectations. Moreover, indiscriminate usage of count operations for sizeable amounts of rows (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the amount of rows it will count, and that an exception will be thrown by this method if this limit is encountered.
Expand source code
def count_documents( self, filter: FilterType, *, upper_bound: int, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Count the row in the table matching the specified filter. Args: filter: a predicate expressed as a dictionary according to the Data API filter syntax. Examples are: {} {"name": "John"} {"name": "John", "age": 59} {"$and": [{"name": {"$eq": "John"}}, {"age": {"$gt": 58}}]} See the Data API documentation for the full set of operators. upper_bound: a required ceiling on the result of the count operation. If the actual number of rows exceeds this value, an exception will be raised. Furthermore, if the actual number of rows exceeds the maximum count that the Data API can reach (regardless of upper_bound), an exception will be raised. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: the exact count of matching rows. Examples: >>> my_table.insert_many([{"seq": i} for i in range(20)]) TableInsertManyResult(...) >>> my_table.count_documents({}, upper_bound=100) 20 >>> my_table.count_documents({"seq":{"$gt": 15}}, upper_bound=100) 4 >>> my_table.count_documents({}, upper_bound=10) Traceback (most recent call last): ... ... astrapy.exceptions.TooManyRowsToCountException Note: Count operations are expensive: for this reason, the best practice is to provide a reasonable `upper_bound` according to the caller expectations. Moreover, indiscriminate usage of count operations for sizeable amounts of rows (i.e. in the thousands and more) is discouraged in favor of alternative application-specific solutions. Keep in mind that the Data API has a hard upper limit on the amount of rows it will count, and that an exception will be thrown by this method if this limit is encountered. """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) cd_payload = {"countDocuments": {"filter": filter}} logger.info(f"countDocuments on '{self.name}'") cd_response = self._api_commander.request( payload=cd_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished countDocuments on '{self.name}'") if "count" in cd_response.get("status", {}): count: int = cd_response["status"]["count"] if cd_response["status"].get("moreData", False): raise TooManyRowsToCountException( text=f"Document count exceeds {count}, the maximum allowed by the server", server_max_count_exceeded=True, ) else: if count > upper_bound: raise TooManyRowsToCountException( text="Document count exceeds required upper bound", server_max_count_exceeded=False, ) else: return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from countDocuments API command.", raw_response=cd_response, )
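Both overflow cases described above surface as a TooManyRowsToCountException; a guarded count may thus look like the following sketch (the bound of 1000 is arbitrary, and the server_max_count_exceeded flag is the one the exception is constructed with in the source above):

from astrapy.exceptions import TooManyRowsToCountException

try:
    n_matches = my_table.count_documents({"match_id": "fight7"}, upper_bound=1000)
    print(f"found {n_matches} rows")
except TooManyRowsToCountException as exc:
    if exc.server_max_count_exceeded:
        # the Data API's own hard counting limit was hit
        print("too many rows for the server to count")
    else:
        # only the caller-provided upper_bound was exceeded
        print("more than 1000 matching rows")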
def create_index(self, name: str, *, column: str, options: TableIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Create an index on a non-vector column of the table.
This is a blocking operation: the method returns once the index is created and ready to use.
For creation of a vector index, see method
create_vector_index
instead.
Args
name
- the name of the index. Index names must be unique across the keyspace.
column
- the table column on which the index is to be created.
options
- if passed, it must be an instance of
TableIndexOptions
, or an equivalent dictionary, which specifies index settings such as – for a text column – case-sensitivity and so on. See the `TableIndexOptions`
class for more details. if_not_exists
- if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Examples
>>> from astrapy.info import TableIndexOptions >>> >>> # create an index on a column >>> my_table.create_index( ... "score_index", ... column="score", ... ) >>> >>> # create an index on a textual column, specifying indexing options >>> my_table.create_index( ... "winner_index", ... column="winner", ... options=TableIndexOptions( ... ascii=False, ... normalize=True, ... case_sensitive=False, ... ), ... )
Expand source code
def create_index( self, name: str, *, column: str, options: TableIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Create an index on a non-vector column of the table. This is a blocking operation: the method returns once the index is created and ready to use. For creation of a vector index, see method `create_vector_index` instead. Args: name: the name of the index. Index names must be unique across the keyspace. column: the table column on which the index is to be created. options: if passed, it must be an instance of `TableIndexOptions`, or an equivalent dictionary, which specifies index settings such as -- for a text column -- case-sensitivity and so on. See the `astrapy.info.TableIndexOptions` class for more details. if_not_exists: if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Examples: >>> from astrapy.info import TableIndexOptions >>> >>> # create an index on a column >>> my_table.create_index( ... "score_index", ... column="score", ... ) >>> >>> # create an index on a textual column, specifying indexing options >>> my_table.create_index( ... "winner_index", ... column="winner", ... options=TableIndexOptions( ... ascii=False, ... normalize=True, ... case_sensitive=False, ... ), ... ) """ ci_definition: dict[str, Any] = TableIndexDefinition( column=column, options=TableIndexOptions.coerce(options or {}), ).as_dict() ci_command = "createIndex" return self._create_generic_index( i_name=name, ci_definition=ci_definition, ci_command=ci_command, if_not_exists=if_not_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, )
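Index creation is typically part of application setup; passing if_not_exists=True makes the call safe to repeat across restarts. A one-call sketch (index and column names as in the examples above):

# Safe to run at every application start: an existing index
# with this name makes the call a no-op instead of an error.
my_table.create_index(
    "score_index",
    column="score",
    if_not_exists=True,
)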
def create_vector_index(self, name: str, *, column: str, options: TableVectorIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Create a vector index on a vector column of the table, enabling vector similarity search operations on it.
This is a blocking operation: the method returns once the index is created and ready to use.
For creation of a non-vector index, see method
create_index
instead.
Args
name
- the name of the index. Index names must be unique across the keyspace.
column
- the table column, of type "vector", on which to create the index.
options
- an instance of
TableVectorIndexOptions
, or an equivalent dictionary, which specifies settings for the vector index, such as the metric to use or, if desired, a "source model" setting. If omitted, the Data API defaults will apply for the index. See the `TableVectorIndexOptions`
class for more details. if_not_exists
- if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Example
>>> from astrapy.constants import VectorMetric >>> from astrapy.info import TableVectorIndexOptions >>> >>> # create a vector index with dot-product similarity >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... options=TableVectorIndexOptions( ... metric=VectorMetric.DOT_PRODUCT, ... ), ... ) >>> # specify a source_model (since the previous statement >>> # succeeded, this will do nothing because of `if_not_exists`): >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... options=TableVectorIndexOptions( ... metric=VectorMetric.DOT_PRODUCT, ... source_model="nv-qa-4", ... ), ... if_not_exists=True, ... ) >>> # leave the settings to the Data API defaults of cosine >>> # similarity metric (since the previous statement >>> # succeeded, this will do nothing because of `if_not_exists`): >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... if_not_exists=True, ... )
Expand source code
def create_vector_index( self, name: str, *, column: str, options: TableVectorIndexOptions | dict[str, Any] | None = None, if_not_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Create a vector index on a vector column of the table, enabling vector similarity search operations on it. This is a blocking operation: the method returns once the index is created and ready to use. For creation of a non-vector index, see method `create_index` instead. Args: name: the name of the index. Index names must be unique across the keyspace. column: the table column, of type "vector" on which to create the index. options: an instance of `TableVectorIndexOptions`, or an equivalent dictionary, which specifies settings for the vector index, such as the metric to use or, if desired, a "source model" setting. If omitted, the Data API defaults will apply for the index. See the `astrapy.info.TableVectorIndexOptions` class for more details. if_not_exists: if set to True, the command will succeed even if an index with the specified name already exists (in which case no actual index creation takes place on the database). The API default of False means that an error is raised by the API in case of name collision. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> from astrapy.constants import VectorMetric >>> from astrapy.info import TableVectorIndexOptions >>> >>> # create a vector index with dot-product similarity >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... options=TableVectorIndexOptions( ... metric=VectorMetric.DOT_PRODUCT, ... ), ... ) >>> # specify a source_model (since the previous statement >>> # succeeded, this will do nothing because of `if_not_exists`): >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... options=TableVectorIndexOptions( ... metric=VectorMetric.DOT_PRODUCT, ... source_model="nv-qa-4", ... ), ... if_not_exists=True, ... ) >>> # leave the settings to the Data API defaults of cosine >>> # similarity metric (since the previous statement >>> # succeeded, this will do nothing because of `if_not_exists`): >>> my_table.create_vector_index( ... "m_vector_index", ... column="m_vector", ... if_not_exists=True, ... ) """ ci_definition: dict[str, Any] = TableVectorIndexDefinition( column=column, options=TableVectorIndexOptions.coerce(options), ).as_dict() ci_command = "createVectorIndex" return self._create_generic_index( i_name=name, ci_definition=ci_definition, ci_command=ci_command, if_not_exists=if_not_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, )
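The same idempotent-setup pattern applies to vector indexes; this sketch also pins the similarity metric explicitly instead of relying on the server default (names as in the examples above):

from astrapy.constants import VectorMetric
from astrapy.info import TableVectorIndexOptions

# Re-runnable vector-index creation with an explicit metric choice:
my_table.create_vector_index(
    "m_vector_index",
    column="m_vector",
    options=TableVectorIndexOptions(metric=VectorMetric.COSINE),
    if_not_exists=True,
)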
def definition(self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> ListTableDefinition
-
Query the Data API and return a structure defining the table schema. If there are no unsupported columns in the table, the return value has the same contents as could have been provided to a
create_table
method call.
Args
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Returns
A
ListTableDefinition
object, available for inspection.
Example
>>> my_table.definition() ListTableDefinition(columns=[match_id,round,fighters, ... # shortened
Expand source code
def definition( self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> ListTableDefinition: """ Query the Data API and return a structure defining the table schema. If there are no unsupported colums in the table, the return value has the same contents as could have been provided to a `create_table` method call. Args: table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Returns: A `ListTableDefinition` object, available for inspection. Example: >>> my_table.definition() ListTableDefinition(columns=[match_id,round,fighters, ... # shortened """ _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta( timeout_options=self.api_options.timeout_options, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"getting tables in search of '{self.name}'") self_descriptors = [ table_desc for table_desc in self.database._list_tables_ctx( keyspace=None, timeout_context=_TimeoutContext( request_ms=_table_admin_timeout_ms, label=_ta_label, ), ) if table_desc.name == self.name ] logger.info(f"finished getting tables in search of '{self.name}'") if self_descriptors: return self_descriptors[0].definition else: raise ValueError( f"Table {self.keyspace}.{self.name} not found.", )
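The returned object can also be serialized for logging or schema diffing; a small sketch, assuming the as_dict method that astrapy's descriptor classes generally expose:

import json

# Fetch the current schema and dump it in a JSON-like shape
# (default=str guards against any non-JSON-native values):
table_definition = my_table.definition()
print(json.dumps(table_definition.as_dict(), indent=2, default=str))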
def delete_many(self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Delete all rows matching a provided filter condition. This operation can target from a single row to the entirety of the table.
Args
filter
- a filter dictionary to specify which row(s) must be deleted.
1. If the filter is in the form
{"pk1": val1, "pk2": val2 ...}
and specifies the primary key in full, at most one row is deleted, the one with that primary key. 2. If the table has "partitionSort" columns, some or all of them may be left out (the least significant of them can also employ an inequality, or range, predicate): a range of rows, but always within a single partition, will be deleted. 3. If an empty filter, `{}`
, is passed, this operation empties the table completely. USE WITH CARE. 4. Other kinds of filtering clauses are forbidden. In the following examples, the table is partitioned by columns ["pa1", "pa2"] and has partitionSort "ps1" and "ps2" in that order. Valid filter examples: -{"pa1": x, "pa2": y, "ps1": z, "ps2": t}
: deletes one row -{"pa1": x, "pa2": y, "ps1": z}
: deletes multiple rows -{"pa1": x, "pa2": y, "ps1": z, "ps2": {"$lt": q}}
: del. multiple rows -{"pa1": x, "pa2": y}
: deletes all rows in the partition -{}
: empties the table (CAUTION) Invalid filter examples: -{"pa1": x}
: incomplete partition key -`{"pa1": x, "ps1": z}`
: incomplete partition key (whatever is added) -{"pa1": x, "pa2": y, "ps1": {"$lt": r}, "ps2": t}
: inequality on a non-least-significant partitionSort column provided. -{"pa1": x, "pa2": y, "ps2": t}
: cannot skip "ps1" general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Examples
>>> # Delete a single row (full primary key specified): >>> my_table.delete_many({"match_id": "fight4", "round": 1}) >>> >>> # Delete part of a partition (inequality on the >>> # last-mentioned 'partitionSort' column): >>> my_table.delete_many({"match_id": "fight5", "round": {"$gte": 5}}) >>> >>> # Delete a whole partition (leave 'partitionSort' unspecified): >>> my_table.delete_many({"match_id": "fight7"}) >>> >>> # empty the table entirely with empty filter (*CAUTION*): >>> my_table.delete_many({})
Expand source code
def delete_many( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Delete all rows matching a provided filter condition. This operation can target from a single row to the entirety of the table. Args: filter: a filter dictionary to specify which row(s) must be deleted. 1. If the filter is in the form `{"pk1": val1, "pk2": val2 ...}` and specified the primary key in full, at most one row is deleted, the one with that primary key. 2. If the table has "partitionSort" columns, some or all of them may be left out (the least significant of them can also employ an inequality, or range, predicate): a range of rows, but always within a single partition, will be deleted. 3. If an empty filter, `{}`, is passed, this operation empties the table completely. *USE WITH CARE*. 4. Other kinds of filtering clauses are forbidden. In the following examples, the table is partitioned by columns ["pa1", "pa2"] and has partitionSort "ps1" and "ps2" in that order. Valid filter examples: - `{"pa1": x, "pa2": y, "ps1": z, "ps2": t}`: deletes one row - `{"pa1": x, "pa2": y, "ps1": z}`: deletes multiple rows - `{"pa1": x, "pa2": y, "ps1": z, "ps2": {"$lt": q}}`: del. multiple rows - `{"pa1": x, "pa2": y}`: deletes all rows in the partition - `{}`: empties the table (*CAUTION*) Invalid filter examples: - `{"pa1": x}`: incomplete partition key - `{"pa1": x, "ps1" z}`: incomplete partition key (whatever is added) - `{"pa1": x, "pa2": y, "ps1": {"$lt": r}, "ps2": t}`: inequality on a non-least-significant partitionSort column provided. - `{"pa1": x, "pa2": y, "ps2": t}`: cannot skip "ps1" general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Examples: >>> # Delete a single row (full primary key specified): >>> my_table.delete_many({"match_id": "fight4", "round": 1}) >>> >>> # Delete part of a partition (inequality on the >>> # last-mentioned 'partitionSort' column): >>> my_table.delete_many({"match_id": "fight5", "round": {"$gte": 5}}) >>> >>> # Delete a whole partition (leave 'partitionSort' unspecified): >>> my_table.delete_many({"match_id": "fight7"}) >>> >>> # empty the table entirely with empty filter (*CAUTION*): >>> my_table.delete_many({}) """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) dm_payload = self._converter_agent.preprocess_payload( { "deleteMany": { k: v for k, v in { "filter": filter, }.items() if v is not None } } ) logger.info(f"deleteMany on '{self.name}'") dm_response = self._api_commander.request( payload=dm_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteMany on '{self.name}'") if dm_response.get("status", {}).get("deletedCount") == -1: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteMany API command.", raw_response=dm_response, )
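Given that the empty filter `{}` empties the whole table, callers sometimes add an explicit guard around delete_many. A purely illustrative sketch (the guard helper is not part of astrapy):

def guarded_delete_many(table, filter):
    # refuse the table-emptying empty filter unless explicitly intended
    if not filter:
        raise ValueError("empty filter would empty the whole table")
    table.delete_many(filter)

guarded_delete_many(my_table, {"match_id": "fight7"})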
def delete_one(self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Delete a row, matching the provided value of the primary key. If no row is found with that primary key, the method does nothing.
Args
filter
- a predicate expressing the table primary key in full,
i.e. a dictionary defining values for all columns that form the
primary key. A row (at most one) is deleted if it matches that primary
key. An example filter may be
{"match_id": "fight4", "round": 1}
. general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Examples
>>> # Count the rows matching a certain filter >>> len(my_table.find({"match_id": "fight7"}).to_list()) 3 >>> >>> # Delete a row belonging to the group >>> my_table.delete_one({"match_id": "fight7", "round": 2}) >>> >>> # Count again >>> len(my_table.find({"match_id": "fight7"}).to_list()) 2 >>> >>> # Attempt the delete again (nothing to delete) >>> my_table.delete_one({"match_id": "fight7", "round": 2}) >>> >>> # The count is unchanged >>> len(my_table.find({"match_id": "fight7"}).to_list()) 2
Expand source code
def delete_one( self, filter: FilterType, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Delete a row, matching the provided value of the primary key. If no row is found with that primary key, the method does nothing. Args: filter: a predicate expressing the table primary key in full, i.e. a dictionary defining values for all columns that form the primary key. A row (at most one) is deleted if it matches that primary key. An example filter may be `{"match_id": "fight4", "round": 1}`. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Examples: >>> # Count the rows matching a certain filter >>> len(my_table.find({"match_id": "fight7"}).to_list()) 3 >>> >>> # Delete a row belonging to the group >>> my_table.delete_one({"match_id": "fight7", "round": 2}) >>> >>> # Count again >>> len(my_table.find({"match_id": "fight7"}).to_list()) 2 >>> >>> # Attempt the delete again (nothing to delete) >>> my_table.delete_one({"match_id": "fight7", "round": 2}) >>> >>> # The count is unchanged >>> len(my_table.find({"match_id": "fight7"}).to_list()) 2 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) do_payload = self._converter_agent.preprocess_payload( { "deleteOne": { k: v for k, v in { "filter": filter, }.items() if v is not None } } ) logger.info(f"deleteOne on '{self.name}'") do_response = self._api_commander.request( payload=do_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished deleteOne on '{self.name}'") if do_response.get("status", {}).get("deletedCount") == -1: return else: raise UnexpectedDataAPIResponseException( text="Faulty response from deleteOne API command.", raw_response=do_response, )
def distinct(self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[typing.Any]
-
Return a list of the unique values of
key
across the rows in the table that match the provided filter.Args
key
- the name of the field whose value is inspected across rows.
Keys are typically just column names, although they can use
the dot notation to select particular entries in map columns.
For set and list columns, individual entries are "unrolled"
automatically; in particular, for lists, numeric indices
can be used in the key dot-notation syntax.
Example of acceptable
key
values: "a_column" "map_column.map_key" "list_column.2" filter
- a dictionary expressing which condition the inspected rows
must satisfy. The filter can use operators, such as "$eq" for equality,
and require columns to compare with literal values. Simple examples
are
{}
(zero filter), `{"match_no": 123}`
(a shorthand for `{"match_no": {"$eq": 123}}`
), or `{"match_no": 123, "round": "C"}`
(multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. general_method_timeout_ms
- a timeout, in milliseconds, for the whole
requested operation (which may involve multiple API requests).
This method, being based on
find
(see), may entail successive HTTP API requests, depending on the amount of involved rows. If not provided, this object's defaults apply. request_timeout_ms
- a timeout, in milliseconds, for each API request. If not provided, this object's defaults apply.
timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
a list of all different values for
key
found across the rows that match the filter. The result list has no repeated items.
Examples
>>> my_table.distinct("winner", filter={"match_id": "challenge6"}) ['Donna', 'Erick', 'Fiona'] >>> >>> # distinct values across the whole table: >>> # (not recommended performance-wise) >>> my_table.distinct("winner") The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... ['Victor', 'Adam Zuul', 'Betta Vigo', 'Caio Gozer', 'Donna', 'Erick', ... >>> >>> # Over a column containing null values >>> # (also with composite filter): >>> my_table.distinct( ... "score", ... filter={"match_id": {"$in": ["fight4", "tournamentA"]}}, ... ) [18, None] >>> >>> # distinct over a set column (automatically "unrolled"): >>> my_table.distinct( ... "fighters", ... filter={"match_id": {"$in": ["fight4", "tournamentA"]}}, ... ) [UUID('0193539a-2770-8c09-a32a-111111111111'), UUID('019353e3-00b4-...
Note
It must be kept in mind that
distinct
is a client-side operation, which effectively browses all required rows using the logic of the `find`
method and collects the unique values found for `key`
. As such, there may be performance, latency and ultimately billing implications if the amount of matching rows is large.
Note
For details on the behaviour of "distinct" in conjunction with real-time changes in the table contents, see the Note of the
find
command.
Expand source code
def distinct( self, key: str, *, filter: FilterType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> list[Any]: """ Return a list of the unique values of `key` across the rows in the table that match the provided filter. Args: key: the name of the field whose value is inspected across rows. Keys are typically just column names, although they can use the dot notation to select particular entries in map columns. For set and list columns, individual entries are "unrolled" automatically; in particular, for lists, numeric indices can be used in the key dot-notation syntax. Example of acceptable `key` values: "a_column" "map_column.map_key" "list_column.2" filter: a dictionary expressing which condition the inspected rows must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are `{}` (zero filter), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`, or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. general_method_timeout_ms: a timeout, in milliseconds, for the whole requested operation (which may involve multiple API requests). This method, being based on `find` (see) may entail successive HTTP API requests, depending on the amount of involved rows. If not provided, this object's defaults apply. request_timeout_ms: a timeout, in milliseconds, for each API request. If not provided, this object's defaults apply. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a list of all different values for `key` found across the rows that match the filter. The result list has no repeated items. Examples: >>> my_table.distinct("winner", filter={"match_id": "challenge6"}) ['Donna', 'Erick', 'Fiona'] >>> >>> # distinct values across the whole table: >>> # (not recommended performance-wise) >>> my_table.distinct("winner") The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... ['Victor', 'Adam Zuul', 'Betta Vigo', 'Caio Gozer', 'Donna', 'Erick', ... >>> >>> # Over a column containing null values >>> # (also with composite filter): >>> my_table.distinct( ... "score", ... filter={"match_id": {"$in": ["fight4", "tournamentA"]}}, ... ) [18, None] >>> >>> # distinct over a set column (automatically "unrolled"): >>> my_table.distinct( ... "fighters", ... filter={"match_id": {"$in": ["fight4", "tournamentA"]}}, ... ) [UUID('0193539a-2770-8c09-a32a-111111111111'), UUID('019353e3-00b4-... Note: It must be kept in mind that `distinct` is a client-side operation, which effectively browses all required rows using the logic of the `find` method and collects the unique values found for `key`. As such, there may be performance, latency and ultimately billing implications if the amount of matching rows is large. Note: For details on the behaviour of "distinct" in conjunction with real-time changes in the table contents, see the Note of the `find` command. 
""" # lazy-import here to avoid circular import issues from astrapy.cursors import TableFindCursor _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) # preparing cursor: _extractor = _create_document_key_extractor(key) _key = _reduce_distinct_key_to_shallow_safe(key) if _key == "": raise ValueError( "The 'key' parameter for distinct cannot be empty " "or start with a list index." ) # relaxing the type hint (limited to within this method body) f_cursor: TableFindCursor[dict[str, Any], dict[str, Any]] = ( TableFindCursor( table=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=_general_method_timeout_ms, request_timeout_label=_rt_label, overall_timeout_label=_gmt_label, ) # type: ignore[assignment] .filter(filter) .project({_key: True}) ) # consuming it: _item_hashes = set() distinct_items: list[Any] = [] logger.info(f"running distinct() on '{self.name}'") for document in f_cursor: for item in _extractor(document): _item_hash = _hash_document( item, options=self.api_options.serdes_options ) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) logger.info(f"finished running distinct() on '{self.name}'") return distinct_items
def drop(self, *, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Drop the table, i.e. delete it from the database along with all the rows stored therein.
Args
if_exists
- if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold.
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
table_admin_timeout_ms
. timeout_ms
- an alias for
table_admin_timeout_ms
.
Example
>>> # List tables: >>> my_table.database.list_table_names() ['games'] >>> >>> # Drop this table: >>> my_table.drop() >>> >>> # List tables again: >>> my_table.database.list_table_names() [] >>> >>> # Try working on the table now: >>> from astrapy.exceptions import DataAPIResponseException >>> try: ... my_table.find_one({}) ... except DataAPIResponseException as err: ... print(str(err)) ... Collection does not exist [...] (COLLECTION_NOT_EXIST)
Note
Use with caution.
Note
Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense as the underlying actual table is no more. It is the responsibility of the developer to design a correct flow which avoids using a deceased table any further.
Expand source code
def drop( self, *, if_exists: bool | None = None, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> None: """ Drop the table, i.e. delete it from the database along with all the rows stored therein. Args: if_exists: if passed as True, trying to drop a non-existing table will not error, just silently do nothing instead. If not provided, the API default behaviour will hold. table_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `table_admin_timeout_ms`. timeout_ms: an alias for `table_admin_timeout_ms`. Example: >>> # List tables: >>> my_table.database.list_table_names() ['games'] >>> >>> # Drop this table: >>> my_table.drop() >>> >>> # List tables again: >>> my_table.database.list_table_names() [] >>> >>> # Try working on the table now: >>> from astrapy.exceptions import DataAPIResponseException >>> try: ... my_table.find_one({}) ... except DataAPIResponseException as err: ... print(str(err)) ... Collection does not exist [...] (COLLECTION_NOT_EXIST) Note: Use with caution. Note: Once the method succeeds, methods on this object can still be invoked: however, this hardly makes sense as the underlying actual table is no more. It is responsibility of the developer to design a correct flow which avoids using a deceased collection any further. """ logger.info(f"dropping table '{self.name}' (self)") self.database.drop_table( self.name, if_exists=if_exists, table_admin_timeout_ms=table_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) logger.info(f"finished dropping table '{self.name}' (self)")
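In tear-down scripts, the if_exists flag removes the need for a try/except around the drop:

# No error even if the table was already dropped by a previous run:
my_table.drop(if_exists=True)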
def estimated_document_count(self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> int
-
Query the API server for an estimate of the document count in the table.
Contrary to
count_documents
, this method has no filtering parameters.Args
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for
general_method_timeout_ms
. timeout_ms
- an alias for
general_method_timeout_ms
.
Returns
a server-provided estimate of the document count in the table.
Example
>>> my_table.estimated_document_count() 5820
Expand source code
def estimated_document_count( self, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> int: """ Query the API server for an estimate of the document count in the table. Contrary to `count_documents`, this method has no filtering parameters. Args: general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a server-provided estimate count of the documents in the table. Example: >>> my_table.estimated_document_count() 5820 """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) ed_payload: dict[str, Any] = {"estimatedDocumentCount": {}} logger.info(f"estimatedDocumentCount on '{self.name}'") ed_response = self._api_commander.request( payload=ed_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) logger.info(f"finished estimatedDocumentCount on '{self.name}'") if "count" in ed_response.get("status", {}): count: int = ed_response["status"]["count"] return count else: raise UnexpectedDataAPIResponseException( text="Faulty response from estimatedDocumentCount API command.", raw_response=ed_response, )
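A cheap estimate can decide whether an exact, bounded count is worth attempting; a sketch combining the two count methods (the 10000 threshold is arbitrary):

# Use the inexpensive estimate first; only run the exact,
# bounded count when the table is plausibly small enough.
estimate = my_table.estimated_document_count()
if estimate < 10000:
    exact = my_table.count_documents({}, upper_bound=10000)
    print(f"exact row count: {exact}")
else:
    print(f"roughly {estimate} rows (estimate only)")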
def find(self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: type[ROW2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> TableFindCursor[ROW, ROW2]
-
Find rows on the table matching the provided filters and according to sorting criteria including vector similarity.
The returned TableFindCursor object, representing the stream of results, can be iterated over, or consumed and manipulated in several other ways (see the examples below and the TableFindCursor documentation for details). Since the number of returned items can be large, TableFindCursor is a lazy object that fetches new data, while it is being read, using the Data API pagination mechanism.
Invoking .to_list() on a TableFindCursor causes it to consume all rows and materialize the entire result set as a list. This is not recommended when the result set is very large.
Args
filter
- a dictionary expressing the condition that the returned rows must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are {} (zero filter, not recommended for large tables), {"match_no": 123} (a shorthand for {"match_no": {"$eq": 123}}), or {"match_no": 123, "round": "C"} (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage.
projection
- a prescription on which columns to return for the matching rows. The projection can take the form {"column1": True, "column2": True}, the form {"*": True} (i.e. return the whole row), or the complementary form that excludes columns: {"column1": False, "column2": False}. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings.
row_type
- this parameter acts as a formal specifier for the type checker. If omitted, the resulting cursor is implicitly a TableFindCursor[ROW, ROW], i.e. it maintains the same type for the items it returns as that of the rows in the table. Strictly typed code may want to specify this parameter, especially when a projection is given.
skip
- if provided, a number of rows that would otherwise be returned first in the response and are instead skipped.
limit
- the maximum number of rows to get from the table. The returned cursor stops yielding rows when either this number is reached or there are no more matches in the table.
include_similarity
- a boolean to request that the numeric value of the similarity be returned as an added "$similarity" key in each returned row. It is meaningful only in a vector search (see sort).
include_sort_vector
- a boolean to request the search query vector. If set to True (and if the search is a vector search), calling the get_sort_vector method on the returned cursor will yield the vector used for the ANN search.
sort
- this dictionary parameter controls the order in which the rows are returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form {"vector_column": qv}, with the query vector qv of the appropriate type (a list of floats or a DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form {"vectorize_enabled_column": "query text"} is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are {"score": SortMode.ASCENDING} (equivalently {"score": +1}) and {"score": +1, "when": -1}. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications and limitations on the number of items returned. Consult the Data API documentation for more details on this topic.
request_timeout_ms
- a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply.
timeout_ms
- an alias for request_timeout_ms.
Returns
a TableFindCursor object that can be iterated over (and manipulated in several ways) and that, if needed, handles pagination under the hood as the rows are consumed.
Note
Because the rows are retrieved progressively in chunks while the cursor is being iterated over, the results obtained may reflect changes occurring to the table contents in real time.
Examples
>>> # Iterate over results: >>> for row in my_table.find({"match_id": "challenge6"}): ... print(f"(R:{row['round']}): winner {row['winner']}") ... (R:1): winner Donna (R:2): winner Erick (R:3): winner Fiona >>> # Optimize bandwidth using a projection: >>> proj = {"round": True, "winner": True} >>> for row in my_table.find({"match_id": "challenge6"}, projection=proj): ... print(f"(R:{row['round']}): winner {row['winner']}") ... (R:1): winner Donna (R:2): winner Erick (R:3): winner Fiona >>> # Filter on the partitioning: >>> my_table.find({"match_id": "challenge6"}).to_list() [{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on primary key: >>> my_table.find({"match_id": "challenge6", "round": 1}).to_list() [{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> my_table.find({"winner": "Caio Gozer"}).to_list() [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> my_table.find({"score": {"$gte": 15}}).to_list() [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... ).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter (not recommended performance-wise): >>> my_table.find({}).to_list() The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... ).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> my_table.find({"round": 3, "winner": "Caio Gozer"}).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> my_table.find( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... limit=3, ... ).to_list() [{'winner': 'Donna'}, {'winner': 'Victor'}] >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> my_table.find( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... limit=3, ... ).to_list() [{'winner': 'Victor'}] >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> my_table.find( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... limit=3, ... include_similarity=True, ... ).to_list() [{'winner': 'Donna', '$similarity': 0.515}, {'winner': 'Victor', ... >>> >>> # Non-vector sorting on a 'partitionSort' column: >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... 
).to_list() [{'winner': 'Caio Gozer'}, {'winner': 'Betta Vigo'}, ... >>> >>> # Using `skip` and `limit`: >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... skip=1, ... limit=2, ... ).to_list() The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... [{'winner': 'Betta Vigo'}, {'winner': 'Adam Zuul'}] >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... ).to_list() The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... [{'winner': 'Adam Zuul'}, {'winner': 'Betta Vigo'}, ... >>> >>> # Using `.map()` on a cursor: >>> winner_cursor = my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... limit=5, ... ) >>> print("/".join(winner_cursor.map(lambda row: row["winner"].upper()))) CAIO GOZER/BETTA VIGO/ADAM ZUUL >>> >>> # Some other examples of cursor manipulation >>> matches_cursor = my_table.find( ... sort={"m_vector": DataAPIVector([-0.1, 0.15, 0.3])} ... ) >>> matches_cursor.has_next() True >>> next(matches_cursor) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> matches_cursor.consumed 1 >>> matches_cursor.rewind() >>> matches_cursor.consumed 0 >>> matches_cursor.has_next() True >>> matches_cursor.close() >>> try: ... next(matches_cursor) ... except StopIteration: ... print("StopIteration triggered.") ... StopIteration triggered.
Expand source code
def find( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, row_type: type[ROW2] | None = None, skip: int | None = None, limit: int | None = None, include_similarity: bool | None = None, include_sort_vector: bool | None = None, sort: SortType | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableFindCursor[ROW, ROW2]: """ Find rows on the table matching the provided filters and according to sorting criteria including vector similarity. The returned TableFindCursor object, representing the stream of results, can be iterated over, or consumed and manipulated in several other ways (see the examples below and the `TableFindCursor` documentation for details). Since the amount of returned items can be large, TableFindCursor is a lazy object, that fetches new data while it is being read using the Data API pagination mechanism. Invoking `.to_list()` on a TableFindCursor will cause it to consume all rows and materialize the entire result set as a list. This is not recommended if the amount of results is very large. Args: filter: a dictionary expressing which condition the returned rows must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are `{}` (zero filter, not recommended for large tables), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`, or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. projection: a prescription on which columns to return for the matching rows. The projection can take the form `{"column1": True, "column2": True}`. `{"*": True}` (i.e. return the whole row), or the complementary form that excludes columns: `{"column1": False, "column2": False}`. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. row_type: this parameter acts a formal specifier for the type checker. If omitted, the resulting cursor is implicitly a `TableFindCursor[ROW, ROW]`, i.e. maintains the same type for the items it returns as that for the rows in the table. Strictly typed code may want to specify this parameter especially when a projection is given. skip: if provided, it is a number of rows that would be obtained first in the response and are instead skipped. limit: a maximum amount of rows to get from the table. The returned cursor will stop yielding rows when either this number is reached or there really are no more matches in the table. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in each returned row. It can be used meaningfully only in a vector search (see `sort`). include_sort_vector: a boolean to request the search query vector. If set to True (and if the search is a vector search), calling the `get_sort_vector` method on the returned cursor will yield the vector used for the ANN search. sort: this dictionary parameter controls the order in which the rows are returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form `{"vector_column": qv}`, with the query vector `qv` of the appropriate type (list of floats or DataAPIVector). 
If the table has automatic embedding generation ("vectorize") enabled on that column, the form `{"vectorize_enabled_column": "query text"}` is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are `{"score": SortMode.ASCENDING}` (equivalently `{"score": +1}`), `{"score": +1, "when": -1}`. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications and limitations on the amount of items returned. Consult the Data API documentation for more details on this topic. request_timeout_ms: a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply. timeout_ms: an alias for `request_timeout_ms`. Returns: a TableFindCursor object, that can be iterated over (and manipulated in several ways), that if needed handles pagination under the hood as the rows are consumed. Note: As the rows are retrieved in chunks progressively, while the cursor is being iterated over, it is possible that the actual results obtained will reflect changes occurring to the table contents in real time. Examples: >>> # Iterate over results: >>> for row in my_table.find({"match_id": "challenge6"}): ... print(f"(R:{row['round']}): winner {row['winner']}") ... (R:1): winner Donna (R:2): winner Erick (R:3): winner Fiona >>> # Optimize bandwidth using a projection: >>> proj = {"round": True, "winner": True} >>> for row in my_table.find({"match_id": "challenge6"}, projection=proj): ... print(f"(R:{row['round']}): winner {row['winner']}") ... (R:1): winner Donna (R:2): winner Erick (R:3): winner Fiona >>> # Filter on the partitioning: >>> my_table.find({"match_id": "challenge6"}).to_list() [{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on primary key: >>> my_table.find({"match_id": "challenge6", "round": 1}).to_list() [{'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> my_table.find({"winner": "Caio Gozer"}).to_list() [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> my_table.find({"score": {"$gte": 15}}).to_list() [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... ).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter (not recommended performance-wise): >>> my_table.find({}).to_list() The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... [{'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... ).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... 
>>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> my_table.find({"round": 3, "winner": "Caio Gozer"}).to_list() The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... [{'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> my_table.find( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... limit=3, ... ).to_list() [{'winner': 'Donna'}, {'winner': 'Victor'}] >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> my_table.find( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... limit=3, ... ).to_list() [{'winner': 'Victor'}] >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> my_table.find( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... limit=3, ... include_similarity=True, ... ).to_list() [{'winner': 'Donna', '$similarity': 0.515}, {'winner': 'Victor', ... >>> >>> # Non-vector sorting on a 'partitionSort' column: >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... ).to_list() [{'winner': 'Caio Gozer'}, {'winner': 'Betta Vigo'}, ... >>> >>> # Using `skip` and `limit`: >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... skip=1, ... limit=2, ... ).to_list() The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... [{'winner': 'Betta Vigo'}, {'winner': 'Adam Zuul'}] >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> my_table.find( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... ).to_list() The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... [{'winner': 'Adam Zuul'}, {'winner': 'Betta Vigo'}, ... >>> >>> # Using `.map()` on a cursor: >>> winner_cursor = my_table.find( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... limit=5, ... ) >>> print("/".join(winner_cursor.map(lambda row: row["winner"].upper()))) CAIO GOZER/BETTA VIGO/ADAM ZUUL >>> >>> # Some other examples of cursor manipulation >>> matches_cursor = my_table.find( ... sort={"m_vector": DataAPIVector([-0.1, 0.15, 0.3])} ... ) >>> matches_cursor.has_next() True >>> next(matches_cursor) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> matches_cursor.consumed 1 >>> matches_cursor.rewind() >>> matches_cursor.consumed 0 >>> matches_cursor.has_next() True >>> matches_cursor.close() >>> try: ... next(matches_cursor) ... except StopIteration: ... print("StopIteration triggered.") ... StopIteration triggered. 
""" # lazy-import here to avoid circular import issues from astrapy.cursors import TableFindCursor _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (timeout_ms, "timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) return ( TableFindCursor( table=self, request_timeout_ms=_request_timeout_ms, overall_timeout_ms=None, request_timeout_label=_rt_label, ) .filter(filter) .project(projection) .skip(skip) .limit(limit) .sort(sort) .include_similarity(include_similarity) .include_sort_vector(include_sort_vector) )
def find_one(self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> Optional[~ROW]
-
Run a search according to the given filtering and sorting criteria and return the top row matching it, or nothing if there are none.
The parameters are analogous to some of the parameters of the find method (which has a few more that do not make sense in this case, such as limit).
Args
filter
- a dictionary expressing the condition that the returned row must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are {} (zero filter), {"match_no": 123} (a shorthand for {"match_no": {"$eq": 123}}), or {"match_no": 123, "round": "C"} (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage.
projection
- a prescription on which columns to return for the matching row. The projection can take the form {"column1": True, "column2": True}, the form {"*": True} (i.e. return the whole row), or the complementary form that excludes columns: {"column1": False, "column2": False}. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings.
include_similarity
- a boolean to request that the numeric value of the similarity be returned as an added "$similarity" key in the returned row. It is meaningful only in a vector search (see sort).
sort
- this dictionary parameter controls the sorting order, hence determines which row is being returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form {"vector_column": qv}, with the query vector qv of the appropriate type (a list of floats or a DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form {"vectorize_enabled_column": "query text"} is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are {"score": SortMode.ASCENDING} (equivalently {"score": +1}) and {"score": +1, "when": -1}. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications. Consult the Data API documentation for more details on this topic.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a dictionary expressing the result if a row is found, otherwise None.
Examples
>>> from astrapy.constants import SortMode >>> from astrapy.data_types import DataAPITimestamp, DataAPIVector >>> >>> # Filter on the partitioning: >>> my_table.find_one({"match_id": "challenge6"}) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # A find with no matches: >>> str(my_table.find_one({"match_id": "not_real"})) 'None' >>> >>> # Optimize bandwidth using a projection: >>> my_table.find_one( ... {"match_id": "challenge6"}, ... projection={"round": True, "winner": True}, ... ) {'round': 1, 'winner': 'Donna'} >>> >>> # Filter on primary key: >>> my_table.find_one({"match_id": "challenge6", "round": 1}) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> my_table.find_one({"winner": "Caio Gozer"}) {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> my_table.find_one({"score": {"$gte": 15}}) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... ) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter: >>> my_table.find_one({}) The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... ) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> my_table.find_one({"round": 3, "winner": "Caio Gozer"}) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> my_table.find_one( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... ) {'winner': 'Donna'} >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> my_table.find_one( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... ) {'winner': 'Victor'} >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> my_table.find_one( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... include_similarity=True, ... ) {'winner': 'Donna', '$similarity': 0.515} >>> >>> # Non-vector sorting on a 'partitionSort' column: >>> my_table.find_one( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... ) {'winner': 'Caio Gozer'} >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... ) The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... 
{'winner': 'Adam Zuul'}
Expand source code
def find_one( self, filter: FilterType | None = None, *, projection: ProjectionType | None = None, include_similarity: bool | None = None, sort: SortType | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> ROW | None: """ Run a search according to the given filtering and sorting criteria and return the top row matching it, or nothing if there are none. The parameters are analogous to some of the parameters to the `find` method (which has a few more that do not make sense in this case, such as `limit`). Args: filter: a dictionary expressing which condition the returned row must satisfy. The filter can use operators, such as "$eq" for equality, and require columns to compare with literal values. Simple examples are `{}` (zero filter), `{"match_no": 123}` (a shorthand for `{"match_no": {"$eq": 123}}`, or `{"match_no": 123, "round": "C"}` (multiple conditions are implicitly combined with "$and"). Please consult the Data API documentation for a more detailed explanation of table search filters and tips on their usage. projection: a prescription on which columns to return for the matching row. The projection can take the form `{"column1": True, "column2": True}`. `{"*": True}` (i.e. return the whole row), or the complementary form that excludes columns: `{"column1": False, "column2": False}`. To optimize bandwidth usage, it is recommended to use a projection, especially to avoid unnecessary columns of type vector with high-dimensional embeddings. include_similarity: a boolean to request the numeric value of the similarity to be returned as an added "$similarity" key in the returned row. It can be used meaningfully only in a vector search (see `sort`). sort: this dictionary parameter controls the sorting order, hence determines which row is being returned. The sort parameter can express either a vector search or a regular (ascending/descending, even hierarchical) sorting. * For a vector search the parameter takes the form `{"vector_column": qv}`, with the query vector `qv` of the appropriate type (list of floats or DataAPIVector). If the table has automatic embedding generation ("vectorize") enabled on that column, the form `{"vectorize_enabled_column": "query text"}` is also valid. * In the case of non-vector sorting, the parameter specifies the column(s) and the ascending/descending ordering required. If multiple columns are provided, the sorting applies them hierarchically to the rows. Examples are `{"score": SortMode.ASCENDING}` (equivalently `{"score": +1}`), `{"score": +1, "when": -1}`. Note that, depending on the column(s) chosen for sorting, the table partitioning structure, and the presence of indexes, the sorting may be done in-memory by the API. In that case, there may be performance implications. Consult the Data API documentation for more details on this topic. general_method_timeout_ms: a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `general_method_timeout_ms`. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a dictionary expressing the result if a row is found, otherwise None. 
Examples: >>> from astrapy.constants import SortMode >>> from astrapy.data_types import DataAPITimestamp, DataAPIVector >>> >>> # Filter on the partitioning: >>> my_table.find_one({"match_id": "challenge6"}) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # A find with no matches: >>> str(my_table.find_one({"match_id": "not_real"})) 'None' >>> >>> # Optimize bandwidth using a projection: >>> my_table.find_one( ... {"match_id": "challenge6"}, ... projection={"round": True, "winner": True}, ... ) {'round': 1, 'winner': 'Donna'} >>> >>> # Filter on primary key: >>> my_table.find_one({"match_id": "challenge6", "round": 1}) {'match_id': 'challenge6', 'round': 1, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular indexed column: >>> my_table.find_one({"winner": "Caio Gozer"}) {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Non-equality filter on a regular indexed column: >>> my_table.find_one({"score": {"$gte": 15}}) {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"when": { ... "$gte": DataAPITimestamp.from_string("1999-12-31T01:23:44Z") ... }} ... ) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Empty filter: >>> my_table.find_one({}) The Data API returned a warning: {'errorCode': 'ZERO_FILTER_OPERATIONS', ... {'match_id': 'fight4', 'round': 1, 'fighters': DataAPISet([UUID('0193... >>> >>> # Filter on the primary key and a regular non-indexed column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"} ... ) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Filter on a regular non-indexed column (and incomplete primary key) >>> # (not recommended performance-wise) >>> my_table.find_one({"round": 3, "winner": "Caio Gozer"}) The Data API returned a warning: {'errorCode': 'MISSING_INDEX', ... {'match_id': 'fight5', 'round': 3, 'fighters': DataAPISet([]), ... >>> >>> # Vector search with "sort" (on an appropriately-indexed vector column): >>> my_table.find_one( ... {}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... ) {'winner': 'Donna'} >>> >>> # Hybrid search with vector sort and non-vector filtering: >>> my_table.find_one( ... {"match_id": "fight4"}, ... sort={"m_vector": DataAPIVector([0.2, 0.3, 0.4])}, ... projection={"winner": True}, ... ) {'winner': 'Victor'} >>> >>> # Return the numeric value of the vector similarity >>> # (also demonstrating that one can pass a plain list for a vector): >>> my_table.find_one( ... {}, ... sort={"m_vector": [0.2, 0.3, 0.4]}, ... projection={"winner": True}, ... include_similarity=True, ... ) {'winner': 'Donna', '$similarity': 0.515} >>> >>> # Non-vector sorting on a 'partitionSort' column: >>> my_table.find_one( ... {"match_id": "fight5"}, ... sort={"round": SortMode.DESCENDING}, ... projection={"winner": True}, ... ) {'winner': 'Caio Gozer'} >>> >>> # Non-vector sorting on a regular column: >>> # (not recommended performance-wise) >>> my_table.find_one( ... {"match_id": "fight5"}, ... sort={"winner": SortMode.ASCENDING}, ... projection={"winner": True}, ... ) The Data API returned a warning: {'errorCode': 'IN_MEMORY_SORTING... 
{'winner': 'Adam Zuul'} """ _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm( timeout_options=self.api_options.timeout_options, general_method_timeout_ms=general_method_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ) fo_options = ( None if include_similarity is None else {"includeSimilarity": include_similarity} ) fo_payload = self._converter_agent.preprocess_payload( { "findOne": { k: v for k, v in { "filter": filter, "projection": normalize_optional_projection(projection), "options": fo_options, "sort": sort, }.items() if v is not None } } ) fo_response = self._api_commander.request( payload=fo_payload, timeout_context=_TimeoutContext( request_ms=_request_timeout_ms, label=_rt_label ), ) if "document" not in (fo_response.get("data") or {}): raise UnexpectedDataAPIResponseException( text="Response from findOne API command missing 'document'.", raw_response=fo_response, ) if "projectionSchema" not in (fo_response.get("status") or {}): raise UnexpectedDataAPIResponseException( text="Response from findOne API command missing 'projectionSchema'.", raw_response=fo_response, ) doc_response = fo_response["data"]["document"] if doc_response is None: return None return self._converter_agent.postprocess_row( fo_response["data"]["document"], columns_dict=fo_response["status"]["projectionSchema"], similarity_pseudocolumn="$similarity" if include_similarity else None, )
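Since a missing match yields None rather than an error, calling code can branch on the return value directly. A minimal sketch, assuming the same my_table and schema as the examples:

# Treat the None return as a "not found" signal.
row = my_table.find_one(
    {"match_id": "fight5", "round": 3},
    projection={"winner": True},
)
if row is None:
    print("No such match.")
else:
    print(f"Winner: {row['winner']}")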
def info(self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> TableInfo
-
Return information on the table. This should not be confused with the table definition (i.e. the schema).
Args
database_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for database_admin_timeout_ms.
timeout_ms
- an alias for database_admin_timeout_ms.
Returns
A TableInfo object for inspection.
Example
>>> # Note: output reformatted for clarity. >>> my_table.info() TableInfo( database_info=AstraDBDatabaseInfo(id=..., name=..., ...), keyspace='default_keyspace', name='games', full_name='default_keyspace.games' )
Expand source code
def info( self, *, database_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInfo: """ Return information on the table. This should not be confused with the table definition (i.e. the schema). Args: database_admin_timeout_ms: a timeout, in milliseconds, to impose on the underlying DevOps API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.) request_timeout_ms: an alias for `database_admin_timeout_ms`. timeout_ms: an alias for `database_admin_timeout_ms`. Returns: A TableInfo object for inspection. Example: >>> # Note: output reformatted for clarity. >>> my_table.info() TableInfo( database_info=AstraDBDatabaseInfo(id=..., name=..., ...), keyspace='default_keyspace', name='games', full_name='default_keyspace.games' ) """ return TableInfo( database_info=self.database.info( database_admin_timeout_ms=database_admin_timeout_ms, request_timeout_ms=request_timeout_ms, timeout_ms=timeout_ms, ), keyspace=self.keyspace, name=self.name, full_name=self.full_name, )
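The returned TableInfo bundles the table's naming metadata with information on the containing database; a brief sketch, assuming my_table as above:

# Inspect where this table lives (one DevOps API request under the hood).
table_info = my_table.info()
print(table_info.full_name)           # e.g. 'default_keyspace.games'
print(table_info.database_info.name)  # the containing database's name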
def insert_many(self, rows: Iterable[ROW], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> TableInsertManyResult
-
Insert a number of rows into the table, with implied overwrite in case of primary key collision.
Inserting rows whose primary keys correspond to entries already stored in the table has the effect of an in-place update: the rows are overwritten. However, if the rows being inserted are only partially provided, i.e. some columns are not specified, those columns are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. None, {} or analogous.
Args
rows
- an iterable of dictionaries, each expressing a row to insert. Each row must at least fully specify the primary key column values, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in each row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null.
ordered
- if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions are to be preferred as they complete much faster.
chunk_size
- how many rows to include in each single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default.
concurrency
- maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the whole operation, which may consist of several API requests. If not provided, this object's defaults apply.
request_timeout_ms
- a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a TableInsertManyResult object, whose attributes are the primary keys of the inserted rows, both in the form of dictionaries and of tuples.
Examples
>>> # Insert complete and partial rows at once (concurrently) >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = my_table.insert_many( ... [ ... { ... "match_id": "fight4", ... "round": 1, ... "winner": "Victor", ... "score": 18, ... "when": DataAPITimestamp.from_string( ... "2024-11-28T11:30:00Z", ... ), ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID('019353e3-00b4-83f9-a127-222222222222'), ... ]), ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... }, ... {"match_id": "fight5", "round": 1, "winner": "Adam"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio"}, ... { ... "match_id": "challenge6", ... "round": 1, ... "winner": "Donna", ... "m_vector": [0.9, -0.1, -0.3], ... }, ... {"match_id": "challenge6", "round": 2, "winner": "Erick"}, ... {"match_id": "challenge6", "round": 3, "winner": "Fiona"}, ... {"match_id": "tournamentA", "round": 1, "winner": "Gael"}, ... {"match_id": "tournamentA", "round": 2, "winner": "Hanna"}, ... { ... "match_id": "tournamentA", ... "round": 3, ... "winner": "Ian", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... {"match_id": "fight7", "round": 1, "winner": "Joy"}, ... {"match_id": "fight7", "round": 2, "winner": "Kevin"}, ... {"match_id": "fight7", "round": 3, "winner": "Lauretta"}, ... ], ... concurrency=10, ... chunk_size=3, ... ) >>> insert_result.inserted_ids [{'match_id': 'fight4', 'round': 1}, {'match_id': 'fight5', ... >>> insert_result.inserted_id_tuples [('fight4', 1), ('fight5', 1), ('fight5', 2), ('fight5', 3), ... >>> >>> # Ordered insertion >>> # (would stop on first failure; predictable end result on DB) >>> my_table.insert_many( ... [ ... {"match_id": "fight5", "round": 1, "winner": "Adam0"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta0"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio0"}, ... {"match_id": "fight5", "round": 1, "winner": "Adam Zuul"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta Vigo"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"}, ... ], ... ordered=True, ... ) TableInsertManyResult(inserted_ids=[{'match_id': 'fight5', 'round': 1}, ...
Note
Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the row sequence is important.
Note
If some of the rows are unsuitable for insertion, for instance because they have the wrong data type for a column or lack the primary key, the Data API validation check will fail for the specific requests containing the faulty rows. Depending on concurrency and the value of the ordered parameter, a number of rows may nevertheless have been successfully inserted. It is possible to capture such a scenario, and inspect which rows actually got inserted, by catching an error of type TableInsertManyException: its partial_result attribute is precisely a TableInsertManyResult, encoding details on the successful writes.
Expand source code
def insert_many( self, rows: Iterable[ROW], *, ordered: bool = False, chunk_size: int | None = None, concurrency: int | None = None, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None, ) -> TableInsertManyResult: """ Insert a number of rows into the table, with implied overwrite in case of primary key collision. Inserting rows whose primary key correspond to entries alredy stored in the table has the effect of an in-place update: the rows are overwritten. However, if the rows being inserted are partially provided, i.e. some columns are not specified, these are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. `None`, `{}` or analogous. Args: rows: an iterable of dictionaries, each expressing a row to insert. Each row must at least fully specify the primary key column values, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in each row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null. ordered: if False (default), the insertions can occur in arbitrary order and possibly concurrently. If True, they are processed sequentially. If there are no specific reasons against it, unordered insertions re to be preferred as they complete much faster. chunk_size: how many rows to include in each single API request. Exceeding the server maximum allowed value results in an error. Leave it unspecified (recommended) to use the system default. concurrency: maximum number of concurrent requests to the API at a given time. It cannot be more than one for ordered insertions. general_method_timeout_ms: a timeout, in milliseconds, to impose on the whole operation, which may consist of several API requests. If not provided, this object's defaults apply. request_timeout_ms: a timeout, in milliseconds, to impose on each individual HTTP request to the Data API to accomplish the operation. If not provided, this object's defaults apply. timeout_ms: an alias for `general_method_timeout_ms`. Returns: a TableInsertManyResult object, whose attributes are the primary key of the inserted rows both in the form of dictionaries and of tuples. Examples: >>> # Insert complete and partial rows at once (concurrently) >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = my_table.insert_many( ... [ ... { ... "match_id": "fight4", ... "round": 1, ... "winner": "Victor", ... "score": 18, ... "when": DataAPITimestamp.from_string( ... "2024-11-28T11:30:00Z", ... ), ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID('019353e3-00b4-83f9-a127-222222222222'), ... ]), ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... }, ... {"match_id": "fight5", "round": 1, "winner": "Adam"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio"}, ... { ... "match_id": "challenge6", ... "round": 1, ... "winner": "Donna", ... "m_vector": [0.9, -0.1, -0.3], ... }, ... {"match_id": "challenge6", "round": 2, "winner": "Erick"}, ... {"match_id": "challenge6", "round": 3, "winner": "Fiona"}, ... {"match_id": "tournamentA", "round": 1, "winner": "Gael"}, ... {"match_id": "tournamentA", "round": 2, "winner": "Hanna"}, ... { ... "match_id": "tournamentA", ... "round": 3, ... 
"winner": "Ian", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... {"match_id": "fight7", "round": 1, "winner": "Joy"}, ... {"match_id": "fight7", "round": 2, "winner": "Kevin"}, ... {"match_id": "fight7", "round": 3, "winner": "Lauretta"}, ... ], ... concurrency=10, ... chunk_size=3, ... ) >>> insert_result.inserted_ids [{'match_id': 'fight4', 'round': 1}, {'match_id': 'fight5', ... >>> insert_result.inserted_id_tuples [('fight4', 1), ('fight5', 1), ('fight5', 2), ('fight5', 3), ... >>> >>> # Ordered insertion >>> # (would stop on first failure; predictable end result on DB) >>> my_table.insert_many( ... [ ... {"match_id": "fight5", "round": 1, "winner": "Adam0"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta0"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio0"}, ... {"match_id": "fight5", "round": 1, "winner": "Adam Zuul"}, ... {"match_id": "fight5", "round": 2, "winner": "Betta Vigo"}, ... {"match_id": "fight5", "round": 3, "winner": "Caio Gozer"}, ... ], ... ordered=True, ... ) TableInsertManyResult(inserted_ids=[{'match_id': 'fight5', 'round': 1}, ... Note: Unordered insertions are executed with some degree of concurrency, so it is usually better to prefer this mode unless the order in the row sequence is important. Note: If some of the rows are unsuitable for insertion, for instance have the wrong data type for a column or lack the primary key, the Data API validation check will fail for those specific requests that contain the faulty rows. Depending on concurrency and the value of the `ordered` parameter, a number of rows in general could have been successfully inserted. It is possible to capture such a scenario, and inspect which rows actually got inserted, by catching an error of type `astrapy.exceptions.TableInsertManyException`: its `partial_result` attribute is precisely a `TableInsertManyResult`, encoding details on the successful writes. 
""" _general_method_timeout_ms, _gmt_label = _first_valid_timeout( (general_method_timeout_ms, "general_method_timeout_ms"), (timeout_ms, "timeout_ms"), ( self.api_options.timeout_options.general_method_timeout_ms, "general_method_timeout_ms", ), ) _request_timeout_ms, _rt_label = _first_valid_timeout( (request_timeout_ms, "request_timeout_ms"), (self.api_options.timeout_options.request_timeout_ms, "request_timeout_ms"), ) if concurrency is None: if ordered: _concurrency = 1 else: _concurrency = DEFAULT_INSERT_MANY_CONCURRENCY else: _concurrency = concurrency if _concurrency > 1 and ordered: raise ValueError("Cannot run ordered insert_many concurrently.") if chunk_size is None: _chunk_size = DEFAULT_INSERT_MANY_CHUNK_SIZE else: _chunk_size = chunk_size _rows = list(rows) logger.info(f"inserting {len(_rows)} rows in '{self.name}'") raw_results: list[dict[str, Any]] = [] timeout_manager = MultiCallTimeoutManager( overall_timeout_ms=_general_method_timeout_ms, timeout_label=_gmt_label, ) if ordered: options = {"ordered": True} inserted_ids: list[Any] = [] inserted_id_tuples: list[Any] = [] for i in range(0, len(_rows), _chunk_size): im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": _rows[i : i + _chunk_size], "options": options, }, }, ) logger.info(f"insertMany on '{self.name}'") chunk_response = self._api_commander.request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany on '{self.name}'") # accumulate the results in this call chunk_inserted_ids, chunk_inserted_ids_tuples = ( self._prepare_keys_from_status(chunk_response.get("status")) ) inserted_ids += chunk_inserted_ids inserted_id_tuples += chunk_inserted_ids_tuples raw_results += [chunk_response] # if errors, quit early if chunk_response.get("errors", []): partial_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) raise TableInsertManyException.from_response( command=None, raw_response=chunk_response, partial_result=partial_result, ) # return full_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) logger.info(f"finished inserting {len(_rows)} rows in '{self.name}'") return full_result else: # unordered: concurrent or not, do all of them and parse the results options = {"ordered": False} if _concurrency > 1: with ThreadPoolExecutor(max_workers=_concurrency) as executor: def _chunk_insertor( row_chunk: list[dict[str, Any]], ) -> dict[str, Any]: im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": row_chunk, "options": options, }, }, ) logger.info(f"insertMany(chunk) on '{self.name}'") im_response = self._api_commander.request( payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") return im_response raw_results = list( executor.map( _chunk_insertor, ( _rows[i : i + _chunk_size] for i in range(0, len(_rows), _chunk_size) ), ) ) else: for i in range(0, len(_rows), _chunk_size): im_payload = self._converter_agent.preprocess_payload( { "insertMany": { "documents": _rows[i : i + _chunk_size], "options": options, }, }, ) logger.info(f"insertMany(chunk) on '{self.name}'") im_response = self._api_commander.request( 
payload=im_payload, raise_api_errors=False, timeout_context=timeout_manager.remaining_timeout( cap_time_ms=_request_timeout_ms, cap_timeout_label=_rt_label, ), ) logger.info(f"finished insertMany(chunk) on '{self.name}'") raw_results.append(im_response) # recast raw_results. Each response has its schema: unfold appropriately ids_and_tuples_per_chunk = [ self._prepare_keys_from_status(chunk_response.get("status")) for chunk_response in raw_results ] inserted_ids = [ inserted_id for chunk_ids, _ in ids_and_tuples_per_chunk for inserted_id in chunk_ids ] inserted_id_tuples = [ inserted_id_tuple for _, chunk_id_tuples in ids_and_tuples_per_chunk for inserted_id_tuple in chunk_id_tuples ] # check-raise if any( [chunk_response.get("errors", []) for chunk_response in raw_results] ): partial_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) raise TableInsertManyException.from_responses( commands=[None for _ in raw_results], raw_responses=raw_results, partial_result=partial_result, ) # return full_result = TableInsertManyResult( raw_results=raw_results, inserted_ids=inserted_ids, inserted_id_tuples=inserted_id_tuples, ) logger.info(f"finished inserting {len(_rows)} rows in '{self.name}'") return full_result
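As the note above explains, a failed insert_many may still have written a subset of the rows; the partial result on the raised exception records them. A sketch under the same schema assumptions as the examples (the faulty row is deliberate):

from astrapy.exceptions import TableInsertManyException

rows = [
    {"match_id": "fight8", "round": 1, "winner": "Mia"},
    {"match_id": "fight8", "round": "not-an-int", "winner": "Nils"},  # wrong type
]
try:
    my_table.insert_many(rows, ordered=True)
except TableInsertManyException as exc:
    # Primary keys of any rows that were successfully written:
    print(exc.partial_result.inserted_id_tuples)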
def insert_one(self, row: ROW, *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> TableInsertOneResult
-
Insert a single row in the table, with implied overwrite in case of primary key collision.
Inserting a row whose primary key corresponds to an entry already stored in the table has the effect of an in-place update: the row is overwritten. However, if the row being inserted is only partially provided, i.e. some columns are not specified, those columns are left unchanged on the database. To explicitly reset them, specify their value as appropriate to their data type, i.e. None, {} or analogous.
Args
row
- a dictionary expressing the row to insert. The primary key must be specified in full, while any other column may be omitted if desired (in which case it is left as is on DB). The values for the various columns supplied in the row must be of the right data type for the insertion to succeed. Non-primary-key columns can also be explicitly set to null.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Returns
a TableInsertOneResult object, whose attributes are the primary key of the inserted row both in the form of a dictionary and of a tuple.
Examples
>>> # a full-row insert using astrapy's datatypes >>> from astrapy.data_types import ( ... DataAPISet, ... DataAPITimestamp, ... DataAPIVector, ... ) >>> from astrapy.ids import UUID >>> >>> insert_result = my_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "m_vector": DataAPIVector([0.4, -0.6, 0.2]), ... "score": 18, ... "when": DataAPITimestamp.from_string("2024-11-28T11:30:00Z"), ... "winner": "Victor", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... ]), ... }, ... ) >>> insert_result.inserted_id {'match_id': 'mtch_0', 'round': 1} >>> insert_result.inserted_id_tuple ('mtch_0', 1) >>> >>> # a partial-row (which in this case overwrites some of the values) >>> my_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 1, ... "winner": "Victor Vector", ... "fighters": DataAPISet([ ... UUID("0193539a-2770-8c09-a32a-111111111111"), ... UUID("0193539a-2880-8875-9f07-222222222222"), ... ]), ... }, ... ) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 1} ... >>> >>> # another insertion demonstrating standard-library datatypes in values >>> import datetime >>> >>> my_table.insert_one( ... { ... "match_id": "mtch_0", ... "round": 2, ... "winner": "Angela", ... "score": 25, ... "when": datetime.datetime( ... 2024, 7, 13, 12, 55, 30, 889, ... tzinfo=datetime.timezone.utc, ... ), ... "fighters": { ... UUID("019353cb-8e01-8276-a190-333333333333"), ... }, ... "m_vector": [0.4, -0.6, 0.2], ... }, ... ) TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 2}, ...
Expand source code
def insert_one(
    self,
    row: ROW,
    *,
    general_method_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> TableInsertOneResult:
    """
    Insert a single row in the table, with implied overwrite
    in case of primary key collision.

    Inserting a row whose primary key corresponds to an entry already stored
    in the table has the effect of an in-place update: the row is overwritten.
    However, if the row being inserted is partially provided, i.e. some columns
    are not specified, these are left unchanged on the database. To explicitly
    reset them, specify their value as appropriate to their data type,
    i.e. `None`, `{}` or analogous.

    Args:
        row: a dictionary expressing the row to insert. The primary key
            must be specified in full, while any other column may be omitted
            if desired (in which case it is left as is on DB).
            The values for the various columns supplied in the row must
            be of the right data type for the insertion to succeed.
            Non-primary-key columns can also be explicitly set to null.
        general_method_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request, hence
            all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `general_method_timeout_ms`.
        timeout_ms: an alias for `general_method_timeout_ms`.

    Returns:
        a TableInsertOneResult object, whose attributes are the primary key
        of the inserted row both in the form of a dictionary and of a tuple.

    Examples:
        >>> # a full-row insert using astrapy's datatypes
        >>> from astrapy.data_types import (
        ...     DataAPISet,
        ...     DataAPITimestamp,
        ...     DataAPIVector,
        ... )
        >>> from astrapy.ids import UUID
        >>>
        >>> insert_result = my_table.insert_one(
        ...     {
        ...         "match_id": "mtch_0",
        ...         "round": 1,
        ...         "m_vector": DataAPIVector([0.4, -0.6, 0.2]),
        ...         "score": 18,
        ...         "when": DataAPITimestamp.from_string("2024-11-28T11:30:00Z"),
        ...         "winner": "Victor",
        ...         "fighters": DataAPISet([
        ...             UUID("0193539a-2770-8c09-a32a-111111111111"),
        ...         ]),
        ...     },
        ... )
        >>> insert_result.inserted_id
        {'match_id': 'mtch_0', 'round': 1}
        >>> insert_result.inserted_id_tuple
        ('mtch_0', 1)
        >>>
        >>> # a partial-row insert (which in this case overwrites some of the values)
        >>> my_table.insert_one(
        ...     {
        ...         "match_id": "mtch_0",
        ...         "round": 1,
        ...         "winner": "Victor Vector",
        ...         "fighters": DataAPISet([
        ...             UUID("0193539a-2770-8c09-a32a-111111111111"),
        ...             UUID("0193539a-2880-8875-9f07-222222222222"),
        ...         ]),
        ...     },
        ... )
        TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 1} ...
        >>>
        >>> # another insertion demonstrating standard-library datatypes in values
        >>> import datetime
        >>>
        >>> my_table.insert_one(
        ...     {
        ...         "match_id": "mtch_0",
        ...         "round": 2,
        ...         "winner": "Angela",
        ...         "score": 25,
        ...         "when": datetime.datetime(
        ...             2024, 7, 13, 12, 55, 30, 889,
        ...             tzinfo=datetime.timezone.utc,
        ...         ),
        ...         "fighters": {
        ...             UUID("019353cb-8e01-8276-a190-333333333333"),
        ...         },
        ...         "m_vector": [0.4, -0.6, 0.2],
        ...     },
        ... )
        TableInsertOneResult(inserted_id={'match_id': 'mtch_0', 'round': 2}, ...
    """
    _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm(
        timeout_options=self.api_options.timeout_options,
        general_method_timeout_ms=general_method_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    io_payload = self._converter_agent.preprocess_payload(
        {"insertOne": {"document": row}}
    )
    logger.info(f"insertOne on '{self.name}'")
    io_response = self._api_commander.request(
        payload=io_payload,
        timeout_context=_TimeoutContext(
            request_ms=_request_timeout_ms, label=_rt_label
        ),
    )
    logger.info(f"finished insertOne on '{self.name}'")
    if "insertedIds" in io_response.get("status", {}):
        if not io_response["status"]["insertedIds"]:
            raise UnexpectedDataAPIResponseException(
                text="Response from insertOne API command has empty 'insertedIds'.",
                raw_response=io_response,
            )
        if not io_response["status"]["primaryKeySchema"]:
            raise UnexpectedDataAPIResponseException(
                text=(
                    "Response from insertOne API command has "
                    "empty 'primaryKeySchema'."
                ),
                raw_response=io_response,
            )
        inserted_id_list = io_response["status"]["insertedIds"][0]
        inserted_id_tuple, inserted_id = self._converter_agent.postprocess_key(
            inserted_id_list,
            primary_key_schema_dict=io_response["status"]["primaryKeySchema"],
        )
        return TableInsertOneResult(
            raw_results=[io_response],
            inserted_id=inserted_id,
            inserted_id_tuple=inserted_id_tuple,
        )
    else:
        raise UnexpectedDataAPIResponseException(
            text="Response from insertOne API command missing 'insertedIds'.",
            raw_response=io_response,
        )
def list_index_names(self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[str]
-
List the names of all indexes existing on this table.
Args
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Returns
a list of the index names as strings, in no particular order.
Example
>>> my_table.list_index_names()
['m_vector_index', 'winner_index', 'score_index']
Expand source code
def list_index_names(
    self,
    *,
    table_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> list[str]:
    """
    List the names of all indexes existing on this table.

    Args:
        table_admin_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request, hence
            all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `table_admin_timeout_ms`.
        timeout_ms: an alias for `table_admin_timeout_ms`.

    Returns:
        a list of the index names as strings, in no particular order.

    Example:
        >>> my_table.list_index_names()
        ['m_vector_index', 'winner_index', 'score_index']
    """
    _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta(
        timeout_options=self.api_options.timeout_options,
        table_admin_timeout_ms=table_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    li_payload: dict[str, Any] = {"listIndexes": {"options": {}}}
    logger.info("listIndexes")
    li_response = self._api_commander.request(
        payload=li_payload,
        timeout_context=_TimeoutContext(
            request_ms=_table_admin_timeout_ms, label=_ta_label
        ),
    )
    if "indexes" not in li_response.get("status", {}):
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from listIndexes API command.",
            raw_response=li_response,
        )
    else:
        logger.info("finished listIndexes")
        return li_response["status"]["indexes"]  # type: ignore[no-any-return]
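As an illustrative sketch (not from the library documentation, and assuming the index names of the example above), the returned list can drive simple guard logic before issuing queries that depend on an index:

>>> # check for a specific index by name before relying on it
>>> if "score_index" not in my_table.list_index_names():
...     print("no index on 'score': create one before filtering on that column")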
def list_indexes(self, *, table_admin_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> list[TableIndexDescriptor]
-
List the full definitions of all indexes existing on this table.
Args
table_admin_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for table_admin_timeout_ms.
timeout_ms
- an alias for table_admin_timeout_ms.
Returns
a list of TableIndexDescriptor objects in no particular order, each providing the details of an index present on the table.
Example
>>> indexes = my_table.list_indexes()
>>> indexes
[TableIndexDescriptor(name='m_vector_index', definition=...)...]  # Note: shortened
>>> indexes[1].definition.column
'winner'
>>> indexes[1].definition.options.case_sensitive
False
Expand source code
def list_indexes(
    self,
    *,
    table_admin_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> list[TableIndexDescriptor]:
    """
    List the full definitions of all indexes existing on this table.

    Args:
        table_admin_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request, hence
            all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `table_admin_timeout_ms`.
        timeout_ms: an alias for `table_admin_timeout_ms`.

    Returns:
        a list of `astrapy.info.TableIndexDescriptor` objects in no particular
        order, each providing the details of an index present on the table.

    Example:
        >>> indexes = my_table.list_indexes()
        >>> indexes
        [TableIndexDescriptor(name='m_vector_index', definition=...)...]  # Note: shortened
        >>> indexes[1].definition.column
        'winner'
        >>> indexes[1].definition.options.case_sensitive
        False
    """
    _table_admin_timeout_ms, _ta_label = _select_singlereq_timeout_ta(
        timeout_options=self.api_options.timeout_options,
        table_admin_timeout_ms=table_admin_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    li_payload: dict[str, Any] = {"listIndexes": {"options": {"explain": True}}}
    logger.info("listIndexes")
    li_response = self._api_commander.request(
        payload=li_payload,
        timeout_context=_TimeoutContext(
            request_ms=_table_admin_timeout_ms, label=_ta_label
        ),
    )
    if "indexes" not in li_response.get("status", {}):
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from listIndexes API command.",
            raw_response=li_response,
        )
    else:
        logger.info("finished listIndexes")
        return [
            TableIndexDescriptor.coerce(index_object)
            for index_object in li_response["status"]["indexes"]
        ]
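Since each descriptor exposes a `name` and a `definition.column` attribute (as the example shows), the returned list lends itself to simple post-processing. A minimal sketch, assuming the indexes of the example above:

>>> # map each indexed column to the name of its index
>>> indexed_columns = {
...     index.definition.column: index.name
...     for index in my_table.list_indexes()
... }
>>> indexed_columns.get("winner")  # expected: 'winner_index'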
def to_async(self: Table[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> AsyncTable[ROW]
-
Create an AsyncTable from this one. Save for the arguments explicitly provided as overrides, everything else is kept identical to this table in the copy (the database is converted into an async object).
Args
embedding_api_key
- optional API key(s) for interacting with the table.
If an embedding service is configured, and this parameter is not None,
each Data API call will include the necessary embedding-related headers
as specified by this parameter. If a string is passed, it translates
into the one "embedding api key" header
(i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
api_options
- any additional options to set for the result, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
the new copy, an AsyncTable instance.
Example
>>> asyncio.run(my_table.to_async().find_one(
...     {"match_id": "fight4"},
...     projection={"winner": True},
... ))
{"pk": 1, "column": "value"}
Expand source code
def to_async(
    self: Table[ROW],
    *,
    embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET,
    api_options: APIOptions | UnsetType = _UNSET,
) -> AsyncTable[ROW]:
    """
    Create an AsyncTable from this one. Save for the arguments explicitly
    provided as overrides, everything else is kept identical to this table
    in the copy (the database is converted into an async object).

    Args:
        embedding_api_key: optional API key(s) for interacting with the table.
            If an embedding service is configured, and this parameter is not None,
            each Data API call will include the necessary embedding-related headers
            as specified by this parameter. If a string is passed, it translates
            into the one "embedding api key" header
            (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`).
            For some vectorize providers/models, if using header-based
            authentication, specialized subclasses of
            `astrapy.authentication.EmbeddingHeadersProvider` should be supplied.
        api_options: any additional options to set for the result, in the form
            of an APIOptions instance (where one can set just the needed
            attributes). In case the same setting is also provided as named
            parameter, the latter takes precedence.

    Returns:
        the new copy, an AsyncTable instance.

    Example:
        >>> asyncio.run(my_table.to_async().find_one(
        ...     {"match_id": "fight4"},
        ...     projection={"winner": True},
        ... ))
        {"pk": 1, "column": "value"}
    """
    arg_api_options = APIOptions(
        embedding_api_key=embedding_api_key,
    )
    final_api_options = self.api_options.with_override(api_options).with_override(
        arg_api_options
    )
    return AsyncTable(
        database=self.database.to_async(),
        name=self.name,
        keyspace=self.keyspace,
        api_options=final_api_options,
    )
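In practice the converted table is usually consumed inside a coroutine. A minimal sketch, reusing the `find_one` call of the example above (the coroutine name and its arguments are illustrative only):

>>> import asyncio
>>>
>>> async def winner_of(table, match_id, round_no):
...     # the AsyncTable shares its configuration with the sync original
...     async_table = table.to_async()
...     return await async_table.find_one(
...         {"match_id": match_id, "round": round_no},
...         projection={"winner": True},
...     )
...
>>> asyncio.run(winner_of(my_table, "fight4", 1))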
def update_one(self, filter: FilterType, update: dict[str, Any], *, general_method_timeout_ms: int | None = None, request_timeout_ms: int | None = None, timeout_ms: int | None = None) ‑> None
-
Update a single row of the table, changing some or all of the columns, with the implicit behaviour of inserting a new row if no match is found.
Args
filter
- a predicate expressing the table primary key in full, i.e. a dictionary defining values for all columns that form the primary key. An example may be {"match_id": "fight4", "round": 1}.
update
- the update prescription to apply to the row, expressed as a dictionary conforming to the Data API syntax. The update operators for tables are $set and $unset (in particular, setting a column to None has the same effect as the $unset operator). Examples are {"$set": {"round": 12}} and {"$unset": {"winner": "", "score": ""}}. Note that the update operation cannot alter the primary key columns. See the Data API documentation for more details.
general_method_timeout_ms
- a timeout, in milliseconds, to impose on the underlying API request. If not provided, this object's defaults apply. (This method issues a single API request, hence all timeout parameters are treated the same.)
request_timeout_ms
- an alias for general_method_timeout_ms.
timeout_ms
- an alias for general_method_timeout_ms.
Examples
>>> from astrapy.data_types import DataAPISet
>>>
>>> # Set a new value for a column
>>> my_table.update_one(
...     {"match_id": "fight4", "round": 1},
...     update={"$set": {"winner": "Winona"}},
... )
>>>
>>> # Set a new value for a column while unsetting another column
>>> my_table.update_one(
...     {"match_id": "fight4", "round": 1},
...     update={"$set": {"winner": None, "score": 24}},
... )
>>>
>>> # Set a 'set' column to empty
>>> my_table.update_one(
...     {"match_id": "fight4", "round": 1},
...     update={"$set": {"fighters": DataAPISet()}},
... )
>>>
>>> # Set a 'set' column to empty using None
>>> my_table.update_one(
...     {"match_id": "fight4", "round": 1},
...     update={"$set": {"fighters": None}},
... )
>>>
>>> # Set a 'set' column to empty using a regular (empty) set
>>> my_table.update_one(
...     {"match_id": "fight4", "round": 1},
...     update={"$set": {"fighters": set()}},
... )
>>>
>>> # Set a 'set' column to empty using $unset
>>> my_table.update_one(
...     {"match_id": "fight4", "round": 1},
...     update={"$unset": {"fighters": None}},
... )
>>>
>>> # A non-existing primary key creates a new row
>>> my_table.update_one(
...     {"match_id": "bar_fight", "round": 4},
...     update={"$set": {"score": 8, "winner": "Jack"}},
... )
>>>
>>> # Delete column values for a row (they'll read as None now)
>>> my_table.update_one(
...     {"match_id": "challenge6", "round": 2},
...     update={"$unset": {"winner": None, "score": None}},
... )
Note
a row created entirely with update operations (as opposed to insertions) may, correspondingly, be deleted by means of an $unset update on all columns.
Expand source code
def update_one(
    self,
    filter: FilterType,
    update: dict[str, Any],
    *,
    general_method_timeout_ms: int | None = None,
    request_timeout_ms: int | None = None,
    timeout_ms: int | None = None,
) -> None:
    """
    Update a single row of the table, changing some or all of the columns,
    with the implicit behaviour of inserting a new row if no match is found.

    Args:
        filter: a predicate expressing the table primary key in full,
            i.e. a dictionary defining values for all columns that form the
            primary key. An example may be `{"match_id": "fight4", "round": 1}`.
        update: the update prescription to apply to the row, expressed
            as a dictionary conforming to the Data API syntax. The update
            operators for tables are `$set` and `$unset` (in particular,
            setting a column to None has the same effect as the $unset
            operator). Examples are `{"$set": {"round": 12}}` and
            `{"$unset": {"winner": "", "score": ""}}`. Note that the update
            operation cannot alter the primary key columns.
            See the Data API documentation for more details.
        general_method_timeout_ms: a timeout, in milliseconds, to impose
            on the underlying API request. If not provided, this object's
            defaults apply. (This method issues a single API request, hence
            all timeout parameters are treated the same.)
        request_timeout_ms: an alias for `general_method_timeout_ms`.
        timeout_ms: an alias for `general_method_timeout_ms`.

    Examples:
        >>> from astrapy.data_types import DataAPISet
        >>>
        >>> # Set a new value for a column
        >>> my_table.update_one(
        ...     {"match_id": "fight4", "round": 1},
        ...     update={"$set": {"winner": "Winona"}},
        ... )
        >>>
        >>> # Set a new value for a column while unsetting another column
        >>> my_table.update_one(
        ...     {"match_id": "fight4", "round": 1},
        ...     update={"$set": {"winner": None, "score": 24}},
        ... )
        >>>
        >>> # Set a 'set' column to empty
        >>> my_table.update_one(
        ...     {"match_id": "fight4", "round": 1},
        ...     update={"$set": {"fighters": DataAPISet()}},
        ... )
        >>>
        >>> # Set a 'set' column to empty using None
        >>> my_table.update_one(
        ...     {"match_id": "fight4", "round": 1},
        ...     update={"$set": {"fighters": None}},
        ... )
        >>>
        >>> # Set a 'set' column to empty using a regular (empty) set
        >>> my_table.update_one(
        ...     {"match_id": "fight4", "round": 1},
        ...     update={"$set": {"fighters": set()}},
        ... )
        >>>
        >>> # Set a 'set' column to empty using $unset
        >>> my_table.update_one(
        ...     {"match_id": "fight4", "round": 1},
        ...     update={"$unset": {"fighters": None}},
        ... )
        >>>
        >>> # A non-existing primary key creates a new row
        >>> my_table.update_one(
        ...     {"match_id": "bar_fight", "round": 4},
        ...     update={"$set": {"score": 8, "winner": "Jack"}},
        ... )
        >>>
        >>> # Delete column values for a row (they'll read as None now)
        >>> my_table.update_one(
        ...     {"match_id": "challenge6", "round": 2},
        ...     update={"$unset": {"winner": None, "score": None}},
        ... )

    Note:
        a row created entirely with update operations (as opposed to
        insertions) may, correspondingly, be deleted by means of an $unset
        update on all columns.
    """
    _request_timeout_ms, _rt_label = _select_singlereq_timeout_gm(
        timeout_options=self.api_options.timeout_options,
        general_method_timeout_ms=general_method_timeout_ms,
        request_timeout_ms=request_timeout_ms,
        timeout_ms=timeout_ms,
    )
    uo_payload = {
        "updateOne": {
            k: v
            for k, v in {
                "filter": filter,
                "update": self._converter_agent.preprocess_payload(update),
            }.items()
            if v is not None
        }
    }
    logger.info(f"updateOne on '{self.name}'")
    uo_response = self._api_commander.request(
        payload=uo_payload,
        timeout_context=_TimeoutContext(
            request_ms=_request_timeout_ms, label=_rt_label
        ),
    )
    logger.info(f"finished updateOne on '{self.name}'")
    if "status" in uo_response:
        # the contents are disregarded and the method just returns:
        return
    else:
        raise UnexpectedDataAPIResponseException(
            text="Faulty response from updateOne API command.",
            raw_response=uo_response,
        )
def with_options(self: Table[ROW], *, embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = (unset), api_options: APIOptions | UnsetType = (unset)) ‑> Table[ROW]
-
Create a clone of this table with some changed attributes.
Args
embedding_api_key
- optional API key(s) for interacting with the table.
If an embedding service is configured, and this parameter is not None,
each Data API call will include the necessary embedding-related headers
as specified by this parameter. If a string is passed, it translates
into the one "embedding api key" header
(i.e. EmbeddingAPIKeyHeaderProvider). For some vectorize providers/models, if using header-based authentication, specialized subclasses of EmbeddingHeadersProvider should be supplied.
api_options
- any additional options to set for the clone, in the form of an APIOptions instance (where one can set just the needed attributes). In case the same setting is also provided as named parameter, the latter takes precedence.
Returns
a new Table instance.
Example
>>> table_with_api_key_configured = my_table.with_options(
...     embedding_api_key="secret-key-0123abcd...",
... )
Expand source code
def with_options(
    self: Table[ROW],
    *,
    embedding_api_key: str | EmbeddingHeadersProvider | UnsetType = _UNSET,
    api_options: APIOptions | UnsetType = _UNSET,
) -> Table[ROW]:
    """
    Create a clone of this table with some changed attributes.

    Args:
        embedding_api_key: optional API key(s) for interacting with the table.
            If an embedding service is configured, and this parameter is not None,
            each Data API call will include the necessary embedding-related headers
            as specified by this parameter. If a string is passed, it translates
            into the one "embedding api key" header
            (i.e. `astrapy.authentication.EmbeddingAPIKeyHeaderProvider`).
            For some vectorize providers/models, if using header-based
            authentication, specialized subclasses of
            `astrapy.authentication.EmbeddingHeadersProvider` should be supplied.
        api_options: any additional options to set for the clone, in the form
            of an APIOptions instance (where one can set just the needed
            attributes). In case the same setting is also provided as named
            parameter, the latter takes precedence.

    Returns:
        a new Table instance.

    Example:
        >>> table_with_api_key_configured = my_table.with_options(
        ...     embedding_api_key="secret-key-0123abcd...",
        ... )
    """
    return self._copy(
        embedding_api_key=embedding_api_key,
        api_options=api_options,
    )
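The precedence rule stated for `api_options` can be made concrete. A minimal sketch, not from the library documentation, in which the same setting arrives both as a named parameter and inside an APIOptions instance (the key strings are placeholders):

>>> from astrapy.api_options import APIOptions
>>>
>>> # both carriers set an embedding API key; per the documented precedence,
>>> # the named parameter overrides the value inside the APIOptions instance
>>> configured_table = my_table.with_options(
...     embedding_api_key="winning-key",
...     api_options=APIOptions(embedding_api_key="overridden-key"),
... )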