VAST Vector Database Quickstart#
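Before you begin, obtain the VAST ADBC driver library (`libadbc_driver_vastdb.so`) from your VAST Field Team.
The query examples below load it from the path set in `VAST_ADBC_DRIVER_PATH`.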
Insert Data#
import pyarrow as pa
import vastdb
from adbc_driver_manager import dbapi
import datetime as dt
# Example parameters -- replace the placeholder values with your own.

# Endpoint and credentials, used by both the SDK session and the ADBC connection.
VASTDB_ENDPOINT = 'XXXXX'
AWS_ACCESS_KEY_ID = 'XXXXX'
AWS_SECRET_ACCESS_KEY = 'XXXXX'

# Location of the example table: bucket -> schema -> table.
BUCKET_NAME = 'my-bucket1'
SCHEMA_NAME = 'my-schema'
TABLE_NAME = 'my-table'

# Path to the ADBC driver shared library passed to dbapi.connect().
VAST_ADBC_DRIVER_PATH = './libadbc_driver_vastdb.so'
# Create the table and insert data using the VAST DB Python SDK.
# SDK documentation: https://github.com/vast-data/vastdb_sdk?tab=readme-ov-file
session = vastdb.connect(
    endpoint=VASTDB_ENDPOINT,
    access=AWS_ACCESS_KEY_ID,
    secret=AWS_SECRET_ACCESS_KEY)

# Schema/table creation and the insert all run inside one transaction block.
with session.transaction() as tx:
    bucket = tx.bucket(BUCKET_NAME)

    # Reuse the schema if it already exists, otherwise create it.
    # fail_if_missing=False makes the lookup return None instead of raising,
    # so the `or` fallback can actually run on a first-time setup.
    schema = (bucket.schema(SCHEMA_NAME, fail_if_missing=False)
              or bucket.create_schema(SCHEMA_NAME))

    # Table layout: an integer id, a fixed-size list of `dimension`
    # non-nullable float32 items (the vector), and a microsecond timestamp.
    dimension = 5
    columns = pa.schema([
        ("id", pa.int64()),
        ("vec", pa.list_(pa.field(name="item", type=pa.float32(), nullable=False), dimension)),
        ('vec_timestamp', pa.timestamp('us')),
    ])

    # Reuse the table if it already exists, otherwise create it
    # (same fail_if_missing=False reasoning as for the schema).
    table = (schema.table(TABLE_NAME, fail_if_missing=False)
             or schema.create_table(TABLE_NAME, columns))

    # Insert a few rows of data; `data` is given column by column, in the
    # same order as the fields of `columns`.
    # NOTE: nothing here checks for existing rows, so re-running this
    # snippet inserts the same three rows again.
    arrow_table = pa.table(schema=columns, data=[
        [1, 2, 3],
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
        [dt.datetime(2024, 4, 10, 12, 34),
         dt.datetime(2024, 4, 11, 12, 34),
         dt.datetime(2024, 4, 13, 12, 34)],
    ])
    table.insert(arrow_table)
Query the Vector Database#
# Query the table using the ADBC driver.
def run_query(query):
    """Run a SQL *query* through the VAST ADBC driver and display the result.

    A new ADBC connection is opened for each call, using the driver path,
    endpoint and credentials defined in the parameters above. The full
    result set is fetched as an Arrow table, converted to a pandas
    DataFrame, and rendered with ``display``. Nothing is returned.

    NOTE(review): ``display`` is not imported in this snippet -- presumably
    it is run in a Jupyter/IPython session where it is available; confirm.
    """
    connection_options = {
        "vast.db.endpoint": VASTDB_ENDPOINT,
        "vast.db.access_key": AWS_ACCESS_KEY_ID,
        "vast.db.secret_key": AWS_SECRET_ACCESS_KEY,
    }
    with dbapi.connect(driver=VAST_ADBC_DRIVER_PATH,
                       db_kwargs=connection_options) as connection:
        with connection.cursor() as cursor:
            cursor.execute(query)
            result = cursor.fetch_arrow_table()
            display(result.to_pandas())
# Qualified, double-quoted SQL identifier of the form "<bucket>/<schema>"."<table>",
# interpolated into every query below.
full_table_name = f'"{BUCKET_NAME}/{SCHEMA_NAME}"."{TABLE_NAME}"'
Select all the rows.#
# Fetch every column of every row in the table.
run_query(
f"SELECT * FROM {full_table_name};"
)
| | id | vec | vec_timestamp |
---|---|---|---|
0 | 1 | [1.0, 2.0, 3.0, 4.0, 5.0] | 2024-04-10 12:34:00 |
1 | 2 | [6.0, 7.0, 8.0, 9.0, 10.0] | 2024-04-11 12:34:00 |
2 | 3 | [11.0, 12.0, 13.0, 14.0, 15.0] | 2024-04-13 12:34:00 |
3 | 1 | [1.0, 2.0, 3.0, 4.0, 5.0] | 2024-04-10 12:34:00 |
4 | 2 | [6.0, 7.0, 8.0, 9.0, 10.0] | 2024-04-11 12:34:00 |
... | ... | ... | ... |
61 | 2 | [6.0, 7.0, 8.0, 9.0, 10.0] | 2024-04-11 12:34:00 |
62 | 3 | [11.0, 12.0, 13.0, 14.0, 15.0] | 2024-04-13 12:34:00 |
63 | 1 | [1.0, 2.0, 3.0, 4.0, 5.0] | 2024-04-10 12:34:00 |
64 | 2 | [6.0, 7.0, 8.0, 9.0, 10.0] | 2024-04-11 12:34:00 |
65 | 3 | [11.0, 12.0, 13.0, 14.0, 15.0] | 2024-04-13 12:34:00 |
66 rows × 3 columns
Euclidean distance#
# Among rows with vec_timestamp later than 2023-04-10 12:34:00, return the
# 2 rows whose `vec` has the smallest array_distance to the 5-element
# query vector (ORDER BY is ascending by default, so nearest first).
run_query(
f"""
SELECT * FROM {full_table_name}
WHERE vec_timestamp > '2023-04-10 12:34:00'
ORDER BY
array_distance(vec, [1.5, 2.5, 3.5, 4.5, 5.5]::FLOAT[5])
LIMIT 2;
"""
)
| | id | vec | vec_timestamp |
---|---|---|---|
0 | 1 | [1.0, 2.0, 3.0, 4.0, 5.0] | 2024-04-10 12:34:00 |
1 | 1 | [1.0, 2.0, 3.0, 4.0, 5.0] | 2024-04-10 12:34:00 |
Cosine distance#
# Among rows with vec_timestamp later than 2023-12-11 11:30:00, return the
# 2 rows whose `vec` has the smallest array_cosine_distance to the
# 5-element query vector (ORDER BY is ascending by default, so nearest first).
run_query(
f"""
SELECT * FROM {full_table_name}
WHERE vec_timestamp > '2023-12-11 11:30:00'
ORDER BY
array_cosine_distance(vec, [1.5, 2.5, 3.5, 4.5, 5.5]::FLOAT[5])
LIMIT 2;
"""
)
| | id | vec | vec_timestamp |
---|---|---|---|
0 | 1 | [1.0, 2.0, 3.0, 4.0, 5.0] | 2024-04-10 12:34:00 |
1 | 1 | [1.0, 2.0, 3.0, 4.0, 5.0] | 2024-04-10 12:34:00 |