VAST Vector Database Quickstart#

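Obtain the VAST ADBC driver shared library (libadbc_driver_vastdb.so) from your VAST field team, and set VAST_ADBC_DRIVER_PATH below to its location (the example assumes it is in the current working directory).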

Insert Data#

import pyarrow as pa
import vastdb
from adbc_driver_manager import dbapi
import datetime as dt

# Example parameters -- replace the placeholder values with your own.

# Endpoint and credentials used by both the VastDB SDK session and the ADBC driver.
VASTDB_ENDPOINT = "XXXXX"
AWS_ACCESS_KEY_ID = "XXXXX"
AWS_SECRET_ACCESS_KEY = "XXXXX"

# Bucket, schema and table that hold the example vectors.
BUCKET_NAME = "my-bucket1"
SCHEMA_NAME = "my-schema"
TABLE_NAME = "my-table"

# Path to the ADBC driver shared library (here: the current working directory).
VAST_ADBC_DRIVER_PATH = "./libadbc_driver_vastdb.so"
# Create the table and insert data using the VastDB SDK. For more information, see https://github.com/vast-data/vastdb_sdk?tab=readme-ov-file

# Open a VastDB SDK session using the endpoint and credentials defined above.
session = vastdb.connect(
    endpoint=VASTDB_ENDPOINT,
    access=AWS_ACCESS_KEY_ID,
    secret=AWS_SECRET_ACCESS_KEY,
)

# Create the schema and table if they do not exist yet, then insert three
# example rows -- all inside a single transaction.
#
# NOTE: every run of this snippet inserts the same three rows again, so
# re-running it leaves duplicate rows in the table.
with session.transaction() as tx:
    bucket = tx.bucket(BUCKET_NAME)

    # Get the schema, or create it on first use. The lookup must be told not
    # to raise when the schema is missing (the SDK raises by default);
    # otherwise the `or` fallback below can never run.
    schema = (bucket.schema(SCHEMA_NAME, fail_if_missing=False)
              or bucket.create_schema(SCHEMA_NAME))

    # Table layout: an integer id, a fixed-size vector of `dimension`
    # non-nullable float32 items, and a microsecond-resolution timestamp.
    dimension = 5
    columns = pa.schema([
        ("id", pa.int64()),
        ("vec", pa.list_(pa.field(name="item", type=pa.float32(), nullable=False), dimension)),
        ("vec_timestamp", pa.timestamp("us")),
    ])

    # Same get-or-create pattern as for the schema above.
    table = (schema.table(TABLE_NAME, fail_if_missing=False)
             or schema.create_table(TABLE_NAME, columns))

    # Insert a few rows of data. `data` is column-oriented: one list per
    # column, in the same order as `columns`.
    arrow_table = pa.table(schema=columns, data=[
        [1, 2, 3],
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
        [dt.datetime(2024, 4, 10, 12, 34),
         dt.datetime(2024, 4, 11, 12, 34),
         dt.datetime(2024, 4, 13, 12, 34)],
    ])
    table.insert(arrow_table)

Query the Vector Database#

# Query the table using the ADBC driver.

def run_query(query):
    """Run *query* through the VAST ADBC driver and display the result.

    A new ADBC connection is opened for every call, using the driver at
    VAST_ADBC_DRIVER_PATH and the endpoint/credentials defined in this file.
    The result set is fetched as an Arrow table, converted to a pandas
    DataFrame and handed to ``display``. Nothing is returned.

    NOTE(review): ``display`` is not defined or imported in this file; it is
    presumably supplied by the notebook environment (IPython/Jupyter).
    """
    connection_options = {
        "vast.db.endpoint": VASTDB_ENDPOINT,
        "vast.db.access_key": AWS_ACCESS_KEY_ID,
        "vast.db.secret_key": AWS_SECRET_ACCESS_KEY,
    }
    # Both context managers are closed on exit: cursor first, then connection.
    with dbapi.connect(
        driver=VAST_ADBC_DRIVER_PATH,
        db_kwargs=connection_options,
    ) as connection, connection.cursor() as cursor:
        cursor.execute(query)
        display(cursor.fetch_arrow_table().to_pandas())
# Quoted identifier used in the SQL below: "<bucket>/<schema>"."<table>".
full_table_name = (
    f'"{BUCKET_NAME}/{SCHEMA_NAME}"'
    f'."{TABLE_NAME}"'
)

Select all the rows.#

# Fetch every row of the example table.
select_all_query = f"SELECT * FROM {full_table_name};"
run_query(select_all_query)
id vec vec_timestamp
0 1 [1.0, 2.0, 3.0, 4.0, 5.0] 2024-04-10 12:34:00
1 2 [6.0, 7.0, 8.0, 9.0, 10.0] 2024-04-11 12:34:00
2 3 [11.0, 12.0, 13.0, 14.0, 15.0] 2024-04-13 12:34:00
3 1 [1.0, 2.0, 3.0, 4.0, 5.0] 2024-04-10 12:34:00
4 2 [6.0, 7.0, 8.0, 9.0, 10.0] 2024-04-11 12:34:00
... ... ... ...
61 2 [6.0, 7.0, 8.0, 9.0, 10.0] 2024-04-11 12:34:00
62 3 [11.0, 12.0, 13.0, 14.0, 15.0] 2024-04-13 12:34:00
63 1 [1.0, 2.0, 3.0, 4.0, 5.0] 2024-04-10 12:34:00
64 2 [6.0, 7.0, 8.0, 9.0, 10.0] 2024-04-11 12:34:00
65 3 [11.0, 12.0, 13.0, 14.0, 15.0] 2024-04-13 12:34:00

66 rows × 3 columns

Euclidean distance#

# Of the rows newer than the given timestamp, show the two whose `vec` is
# closest to the query vector according to array_distance.
euclidean_query = f"""
        SELECT * FROM {full_table_name}
        WHERE vec_timestamp > '2023-04-10 12:34:00'
        ORDER BY
        array_distance(vec, [1.5, 2.5, 3.5, 4.5, 5.5]::FLOAT[5])
        LIMIT 2;
    """
run_query(euclidean_query)
id vec vec_timestamp
0 1 [1.0, 2.0, 3.0, 4.0, 5.0] 2024-04-10 12:34:00
1 1 [1.0, 2.0, 3.0, 4.0, 5.0] 2024-04-10 12:34:00

Cosine distance#

# Of the rows newer than the given timestamp, show the two whose `vec` is
# closest to the query vector according to array_cosine_distance.
cosine_query = f"""
        SELECT * FROM {full_table_name}
        WHERE vec_timestamp > '2023-12-11 11:30:00'
        ORDER BY
        array_cosine_distance(vec, [1.5, 2.5, 3.5, 4.5, 5.5]::FLOAT[5])
        LIMIT 2;
    """
run_query(cosine_query)
id vec vec_timestamp
0 1 [1.0, 2.0, 3.0, 4.0, 5.0] 2024-04-10 12:34:00
1 1 [1.0, 2.0, 3.0, 4.0, 5.0] 2024-04-10 12:34:00