Python SDK Reference#

Important

This notebook is in the process of being migrated to VAST Data Platform Field Docs and will probably not run yet.

Install the boto3 library for S3#

We install boto3, the standard AWS SDK for Python, to move data into the S3-compatible VAST Datastore.

!pip install --quiet boto3 | tail -5

Set up the demo environment#

Adjust Python's behavior for the demo. The import below enables postponed evaluation of type annotations (PEP 563), so annotations are stored as strings rather than evaluated at definition time.

from __future__ import annotations  # Postpone evaluation of type annotations (PEP 563)

Import some utility functions for this demo.

import sys  
sys.path.insert(1, '../misc/')

from vastdb_demo_util import get_connection_vars, upload_to_s3, delete_s3_object, list_objects_in_bucket

Upload S3 Object#

import os
import boto3
from botocore.exceptions import NoCredentialsError

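# Connection variables (local_file_path, bucket_name, aws_access_key_id,
# aws_secret_access_key, s3_endpoint) are assumed to have been populated
# earlier, e.g. via get_connection_vars()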
#upload_to_s3(local_file_path, bucket_name, s3_file_key, aws_access_key_id, aws_secret_access_key, s3_endpoint)

# List objects in the bucket
print("Uploaded Object")
list_objects_in_bucket(bucket_name, aws_access_key_id, aws_secret_access_key, s3_endpoint, prefix='')
Uploaded Object
Objects in bucket vastdb with prefix '':
nyc-taxi/2011-01.data.parquet
nyc-taxi/2011-02.data.parquet
nyc-taxi/2011-03.data.parquet
nyc-taxi/2011-04.data.parquet
nyc-taxi/2011-05.data.parquet
nyc-taxi/2011-06.data.parquet
nyc-taxi/2011-07.data.parquet
nyc-taxi/2011-08.data.parquet
nyc-taxi/2011-09.data.parquet
nyc-taxi/2011-10.data.parquet
nyc-taxi/2011-11.data.parquet
nyc-taxi/2011-12.data.parquet
nyc-taxi/2012-01.data.parquet
nyc-taxi/2012-02.data.parquet
nyc-taxi/2012-03.data.parquet
nyc-taxi/2012-04.data.parquet
nyc-taxi/2012-05.data.parquet
nyc-taxi/2012-06.data.parquet
nyc-taxi/2012-07.data.parquet
nyc-taxi/2012-08.data.parquet
nyc-taxi/2012-09.data.parquet
nyc-taxi/2012-10.data.parquet
nyc-taxi/2012-11.data.parquet
nyc-taxi/2012-12.data.parquet
nyc-taxi/2013-01.data.parquet
nyc-taxi/2013-02.data.parquet
nyc-taxi/2013-03.data.parquet
nyc-taxi/2013-04.data.parquet
nyc-taxi/2013-05.data.parquet
nyc-taxi/2013-06.data.parquet
nyc-taxi/2013-07.data.parquet
nyc-taxi/2013-08.data.parquet
nyc-taxi/2013-09.data.parquet
nyc-taxi/2013-10.data.parquet
nyc-taxi/2013-11.data.parquet
nyc-taxi/2013-12.data.parquet
nyc-taxi/2014-01.data.parquet
nyc-taxi/2014-02.data.parquet
nyc-taxi/2014-03.data.parquet
nyc-taxi/2014-04.data.parquet
nyc-taxi/2014-05.data.parquet
nyc-taxi/2014-06.data.parquet
nyc-taxi/2014-07.data.parquet
nyc-taxi/2014-08.data.parquet
nyc-taxi/2014-09.data.parquet
nyc-taxi/2014-10.data.parquet
nyc-taxi/2014-11.data.parquet
nyc-taxi/2014-12.data.parquet
nyc-taxi/2015-01.data.parquet
nyc-taxi/2015-02.data.parquet
nyc-taxi/2015-03.data.parquet
nyc-taxi/2015-04.data.parquet
nyc-taxi/2015-05.data.parquet
nyc-taxi/2015-06.data.parquet
nyc-taxi/2015-07.data.parquet
nyc-taxi/2015-08.data.parquet
nyc-taxi/2015-09.data.parquet
nyc-taxi/2015-10.data.parquet
nyc-taxi/2015-11.data.parquet
nyc-taxi/2015-12.data.parquet
nyc-taxi/2016-01.data.parquet
nyc-taxi/2016-02.data.parquet
nyc-taxi/2016-03.data.parquet
nyc-taxi/2016-04.data.parquet
nyc-taxi/2016-05.data.parquet
nyc-taxi/2016-06.data.parquet
nyc-taxi/2016-07.data.parquet
nyc-taxi/2016-08.data.parquet
nyc-taxi/2016-09.data.parquet
nyc-taxi/2016-10.data.parquet
nyc-taxi/2016-11.data.parquet
nyc-taxi/2016-12.data.parquet
nyc-taxi/2017-01.data.parquet
nyc-taxi/2017-02.data.parquet
nyc-taxi/2017-03.data.parquet
nyc-taxi/2017-04.data.parquet
nyc-taxi/2017-05.data.parquet
nyc-taxi/2017-06.data.parquet
nyc-taxi/2017-07.data.parquet
nyc-taxi/2017-08.data.parquet
nyc-taxi/2017-09.data.parquet
nyc-taxi/2017-10.data.parquet
nyc-taxi/2017-11.data.parquet
nyc-taxi/2017-12.data.parquet
nyc-taxi/2018-01.data.parquet
nyc-taxi/2018-02.data.parquet
nyc-taxi/2018-03.data.parquet
nyc-taxi/2018-04.data.parquet
nyc-taxi/2018-05.data.parquet
nyc-taxi/2018-06.data.parquet
nyc-taxi/2018-07.data.parquet
nyc-taxi/2018-08.data.parquet
nyc-taxi/2018-09.data.parquet
nyc-taxi/2018-10.data.parquet
nyc-taxi/2018-11.data.parquet
nyc-taxi/2018-12.data.parquet
nyc-taxi/2019-01.data.parquet
nyc-taxi/2019-02.data.parquet
nyc-taxi/2019-03.data.parquet
nyc-taxi/2019-04.data.parquet
nyc-taxi/2019-05.data.parquet
nyc-taxi/2019-06.data.parquet
tabular_schema_table
tpcds/
vastschema/taxi
print("Uploaded Object")
list_objects_in_bucket(bucket_name, aws_access_key_id, aws_secret_access_key, s3_endpoint, prefix='labimage4.png')
Uploaded Object
No objects found in bucket vastdb with prefix 'labimage4.png'.
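
For reference, here is a minimal sketch of what the upload_to_s3 and list_objects_in_bucket helpers might look like. The actual implementations live in vastdb_demo_util under ../misc/ and may differ.

import boto3

def upload_to_s3(local_file_path, bucket_name, s3_file_key,
                 aws_access_key_id, aws_secret_access_key, s3_endpoint):
    # Create an S3 client pointed at the S3-compatible VAST endpoint
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        endpoint_url=s3_endpoint,
        verify=False  # demo environment: skip SSL verification
    )
    s3.upload_file(local_file_path, bucket_name, s3_file_key)

def list_objects_in_bucket(bucket_name, aws_access_key_id,
                           aws_secret_access_key, s3_endpoint, prefix=''):
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        endpoint_url=s3_endpoint,
        verify=False
    )
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    if 'Contents' not in response:
        print(f"No objects found in bucket {bucket_name} with prefix '{prefix}'.")
        return
    print(f"Objects in bucket {bucket_name} with prefix '{prefix}':")
    for obj in response['Contents']:
        print(obj['Key'])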

Add user tags to existing files/objects using S3 API#

import boto3
from botocore.exceptions import NoCredentialsError

def add_tags_to_s3_object(access_key, secret_key, endpoint_url, bucket_name, key, tags):
    # Create an S3 client with custom configurations
    s3 = boto3.client(
        's3',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        endpoint_url=endpoint_url,
        verify=False  # Set to False to disable SSL verification
    )

    # Add tags to the existing S3 object
    s3.put_object_tagging(
        Bucket=bucket_name,
        Key=key,
        Tagging={'TagSet': tags}
    )

if __name__ == "__main__":
    # Replace these values with your AWS credentials, S3 endpoint, bucket name, object key, and desired tags
    aws_access_key = 'BGU86QBVQBX0A71S4PTL'
    aws_secret_key = 'LOUtdwqsgoyUxc5EG+204RRi1gQqbm1wNWJGMbJ0'
    s3_endpoint = 'http://localhost:55555'  # e.g., 'https://s3.example.com'
    bucket_name = 'vastdb'
    object_key = 'nyc-taxi/2015-10.data.parquet'
    user_defined_tags = [
        {'Key': 'Location', 'Value': 'Boston'},
        {'Key': 'Lab_Device_ID', 'Value': '11111'},
        {'Key': 'Type', 'Value': 'xRNA'},
        {'Key': 'Experiment_ID', 'Value': '3333'},
        {'Key': 'Status', 'Value': 'Processing'}
        # Add more tags as needed
    ]

    try:
        add_tags_to_s3_object(aws_access_key, aws_secret_key, s3_endpoint, bucket_name, object_key, user_defined_tags)
        print(f"Tags added successfully to S3 object: s3://{bucket_name}/{object_key}")
    except NoCredentialsError:
        print("AWS credentials not available.")
    except Exception as e:
        print(f"Error adding tags: {e}")
Tags added successfully to S3 object: s3://vastdb/nyc-taxi/2015-10.data.parquet
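
Note that put_object_tagging replaces the object's entire tag set rather than appending to it. To add or change a single tag, first read the current tags with get_object_tagging, modify the returned list, and re-apply the full set.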

Get user tags from existing files/objects using S3 API#

import boto3
from botocore.exceptions import NoCredentialsError

def list_tags_of_s3_object(access_key, secret_key, endpoint_url, bucket_name, key):
    # Create an S3 client with custom configurations
    s3 = boto3.client(
        's3',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        endpoint_url=endpoint_url,
        verify=False  # Set to False to disable SSL verification
    )

    # Get the tags associated with the S3 object
    response = s3.get_object_tagging(
        Bucket=bucket_name,
        Key=key
    )

    return response['TagSet']

if __name__ == "__main__":
    # Replace these values with your AWS credentials, S3 endpoint, bucket name, and object key
    aws_access_key = 'BGU86QBVQBX0A71S4PTL'
    aws_secret_key = 'LOUtdwqsgoyUxc5EG+204RRi1gQqbm1wNWJGMbJ0'
    s3_endpoint = 'http://localhost:55555'  # e.g., 'https://s3.example.com'
    bucket_name = 'vastdb'
    object_key = 'nyc-taxi/2018-06.data.parquet'

    try:
        object_tags = list_tags_of_s3_object(aws_access_key, aws_secret_key, s3_endpoint, bucket_name, object_key)
        
        # Print the tags associated with the S3 object
        print(f"Tags for S3 object: s3://{bucket_name}/{object_key}")
        for tag in object_tags:
            print(f"Key: {tag['Key']}, Value: {tag['Value']}")
            
    except NoCredentialsError:
        print("AWS credentials not available.")
    except Exception as e:
        print(f"Error listing tags: {e}")
Tags for S3 object: s3://vastdb/nyc-taxi/2018-06.data.parquet
Key: Experiment_ID, Value: 3333
Key: Lab_Device_ID, Value: 11111
Key: Location, Value: Basel
Key: Status, Value: Processing
Key: Type, Value: xRNA

Create VAST DB Catalog Session#

from vastdb.api import VastdbApi
import pyarrow as pa

def create_vastdb_session(access_key, secret_key):
    return VastdbApi(host='localhost:55555', access_key=access_key, secret_key=secret_key)


bucket_name='vastdb'
schema_name='python-sdk'
table='pythonsdk'

access_key='BGU86QBVQBX0A71S4PTL'
secret_key='LOUtdwqsgoyUxc5EG+204RRi1gQqbm1wNWJGMbJ0'
vastdb_session = create_vastdb_session(access_key, secret_key)
field_names = ['element_type'] # Only need the element_type field for counting
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
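
All catalog queries in this notebook target the same fixed location: the vast-big-catalog-bucket bucket, the vast_big_catalog_schema schema, and the vast_big_catalog_table table.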

How many elements are in the catalog?#

total_elements = len(df)
print(f"Total elements in the catalog: {total_elements}")
Total elements in the catalog: 345

How many files/objects?#

file_count = len(df[df['element_type'] == 'FILE'])
print(f"Number of files/objects: {file_count}")
Number of files/objects: 163

How many directories?#

dir_count = len(df[df['element_type'] == 'DIR'])
print(f"Number of directories: {dir_count}")
Number of directories: 179

How many database tables?#

table_count = len(df[df['element_type'] == 'TABLE'])
print(f"Number of database tables: {table_count}")
Number of database tables: 2

What are all of the elements on my system anyway?#

distinct_elements = df['element_type'].unique()
print("Distinct element types on the system:")
print(distinct_elements)
Distinct element types on the system:
['FILE' 'DIR' 'SCHEMA' 'TABLE']
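
The per-type counts computed above can also be obtained in a single pass with pandas value_counts (a convenience on the returned DataFrame, not an SDK feature):

print(df['element_type'].value_counts())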

Simplified example of count of elements returned from parallel execution#

The query_iterator method executes a query on a database table iteratively, returning results in chunks as PyArrow RecordBatches. This enables efficient handling of large datasets by processing data in smaller, manageable segments.

def query_and_count_elements(session, bucket, schema, table, field_names):
    elements_count = 0

    for record_batch in session.query_iterator(bucket, schema, table, field_names=field_names, num_sub_splits=8):
        elements_count += len(record_batch)

    return elements_count

# Query Parameters
field_names = ['element_type']  # Only need the element_type field for counting

# Perform the query
total_elements = query_and_count_elements(
    vastdb_session, 'vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', field_names
)
print(f"Total elements in the catalog: {total_elements}")
Total elements in the catalog: 345

Simple Filtering#

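As the examples in this and the following sections show, filters is a dict that maps a column name to a list of predicate strings of the form '<operator> <value>' (e.g. eq, gt, lt). Multiple predicates for the same column are combined with OR.
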
import time
import datetime

date_str = '2023-11-24'
pattern = '%Y-%m-%d'
epoch = int(time.mktime(time.strptime(date_str, pattern)))

filters = {
    'owner_name': ['eq trinos3'],
    'size': ['gt 50000'],
    'creation_time': [f'gt {epoch}'],
    'tag_Location': ['eq Basel']
}

field_names = ['name', 'creation_time', 'uid', 'owner_name', 'size', 'user_metadata', 'user_tags', 'tag_Location']

table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)

df = table.to_pandas()
display(df)
name creation_time uid owner_name size user_metadata user_tags tag_Location
0 2018-06.data.parquet 2024-03-05 15:03:21.610385219 99999 trinos3 141083662 [(s3cmd-attrs, atime:1709650655/ctime:17096506... [(Experiment_ID, 3333), (Lab_Device_ID, 11111)... Basel
1 2019-02.data.parquet 2024-03-05 15:03:28.300958652 99999 trinos3 120274946 [(s3cmd-attrs, atime:1709650666/ctime:17096506... [(Experiment_ID, 3333), (Lab_Device_ID, 11111)... Basel
display(df.head(1)['user_metadata'].values)
array([list([('s3cmd-attrs', 'atime:1709650655/ctime:1709650656/gid:1000/gname:vastdata/md5:76b073e4d79f2a77cb19d443e0b6d062/mode:33261/mtime:1709650656/uid:1000/uname:vastdata')])],
      dtype=object)
display(df.head(1)['user_tags'].values)
array([list([('Experiment_ID', '3333'), ('Lab_Device_ID', '11111'), ('Location', 'Basel'), ('Status', 'Processing'), ('Type', 'xRNA')])],
      dtype=object)

Query for Specific File Types Across Different Users#

field_names = ['uid', 'owner_name', 'element_type']
filters = {
    'element_type': ['eq FILE', 'eq TABLE', 'eq DIR'],
    'uid': ['eq 500', 'eq 1000']
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
display(df)
uid owner_name element_type
0 1000 vastdata FILE
1 1000 vastdata FILE
2 1000 vastdata FILE
3 1000 vastdata FILE
4 1000 vastdata FILE
... ... ... ...
477787 1000 vastdata FILE
477788 1000 vastdata FILE
477789 1000 vastdata FILE
477790 1000 vastdata FILE
477791 1000 vastdata FILE

477792 rows × 3 columns

Query for Objects Based on User and Specific Extensions#

field_names = ['uid', 'extension', 'size']
filters = {
    'uid': ['eq 1000', 'eq 555'],
    'extension': ['eq log', 'eq ldb']  # looking for log and ldb files
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
display(df)
uid extension size
0 1000 log 45511
1 1000 log 4050387
2 1000 log 1730
3 1000 log 104
4 1000 log 240974
5 1000 log 47233
6 1000 log 73391
7 1000 log 77396
8 1000 log 47334
9 1000 log 40836
10 1000 log 31460
11 1000 log 1
12 1000 log 20002
13 1000 log 16541
14 1000 log 30185
15 1000 log 16649
16 1000 log 65283
17 1000 log 16591
18 1000 log 17126
19 1000 log 2297
20 1000 log 6602790
21 1000 log 2421070
22 1000 log 30947
23 1000 log 112871
24 1000 log 963022
25 1000 log 12696
26 1000 log 53285
27 1000 log 30947

Query for Specific File Types with Size Constraints#

field_names = ['element_type', 'size', 'name']
filters = {
    'element_type': ['eq FILE'],
    'size': ['gt 50000', 'lt 1000000']  # NOTE: predicates in a list are OR'd (size > 50 KB OR size < 1 MB), not a 50 KB-1 MB range, as the output below shows
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
display(df)
element_type size name
0 FILE 442558237 2012-02.data.parquet
1 FILE 16777216 upload
2 FILE 146758259 2018-04.data.parquet
3 FILE 144467145 2018-10.data.parquet
4 FILE 390453487 2013-09.data.parquet
... ... ... ...
158 FILE 67108864 upload
159 FILE 346955670 2014-01.data.parquet
160 FILE 376498312 2013-01.data.parquet
161 FILE 145782225 2017-11.data.parquet
162 FILE 67108864 upload

163 rows × 3 columns
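
Because the two size predicates are OR'd, a true 50 KB to 1 MB range has to be enforced client-side, for example by refining the returned DataFrame in pandas:

df_between = df[(df['size'] > 50000) & (df['size'] < 1000000)]
print(f"{len(df_between)} files between 50 KB and 1 MB")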

Query for Large TABLE Objects by Specific Users#

field_names = ['uid', 'owner_name', 'size', 'element_type']
filters = {
    'uid': ['eq 555'],
    'element_type': ['eq TABLE'],
    'size': ['gt 10000000']  # greater than 10 MB
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
print(df)

Timestamp Filtering#

Query by birthdate: VAST uses a “creation_time” column to record when a new element is created. The query below outputs all objects linked into the namespace after noon on September 1st; it does not output files that have merely been moved to a new path.

NOTE: The same method can be applied to access time (atime), modification time (mtime), and metadata-update time (ctime).

# i.e: SELECT CONCAT(parent_path, name) FROM vast_big_catalog_table WHERE creation_time > TIMESTAMP '2023-09-01 12:00:01'

import pandas as pd

# Set the timestamp for comparison
timestamp_birthdate = pd.Timestamp('2023-09-01 12:00:01')

# Convert the timestamp to an integer
timestamp_birthdate_int = int(timestamp_birthdate.timestamp())

# Query the database
field_names = ['creation_time', 'parent_path', 'name']
filters = {'creation_time': [f'gt {timestamp_birthdate_int}']}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()

# Refine to the exact timestamp (the pushed-down filter works on whole epoch seconds) and build full paths
df_filtered = df[df['creation_time'] > timestamp_birthdate].copy()
df_filtered['full_path'] = df_filtered['parent_path'] + df_filtered['name']

# Print result
print("Objects created after 2023-09-01 12:00:01:")
display(df_filtered['full_path'])
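
The same pattern applies to the other time columns mentioned above. For example, to find objects accessed after the same cutoff (assuming atime accepts the same epoch-seconds predicates):

filters = {'atime': [f'gt {timestamp_birthdate_int}']}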

Reporting#

Simple queries can report basic statistics on a section of the namespace. The example below summarizes files of a certain type (element_type FILE) located under a certain path (here /nyc-taxi).

import numpy as np

# Query the database
field_names = ['uid', 'used', 'size']
filters = {
    'search_path': ['eq /nyc-taxi'],
    'element_type': ['eq FILE']
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()

# Check if DataFrame is empty
if df.empty:
    print("No data returned from query. Please check filters and field names.")
else:
    # Perform aggregations
    users_count = df['uid'].nunique()
    files_count = len(df)
    kb_used_sum = df['used'].sum() / 1000
    avg_size_kb = df['size'].mean() / 1000

    # Formatting results
    formatted_results = {
        'users': f"{users_count:,d}",
        'Files': f"{files_count:,d}",
        'KB_Used': f"{kb_used_sum:,.0f}",
        'Avg_Size_KB': f"{avg_size_kb:,.2f}"
    }

    # Print formatted results
    print("Aggregated Results:")
    print(formatted_results)
No data returned from query. Please check filters and field names.

Capacity Grouping & Usage report#

Here’s a report on all the users on the system: get files across the whole system (‘/’), group by owner_name, and compute the file count, total and average size in kilobytes, oldest creation time, and most recent access time for each file owner. Note: display is an IPython function that renders results as a table.

from IPython.display import display
import pandas as pd
import numpy as np

# Querying the database
filters = {
    'element_type': ['eq FILE'],
    'search_path': ['eq /']
}
field_names = ['owner_name', 'used', 'size', 'creation_time', 'atime']

table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
pd.options.display.max_columns = None

# Aggregating data
aggregated_data = df.groupby('owner_name').agg(
    Files=('owner_name', 'count'),
    KB_Used=('used', lambda x: np.sum(x)/1000),
    Avg_Size_KB=('size', lambda x: np.mean(x)/1000),
    Oldest_data=('creation_time', 'min'),
    Last_access=('atime', 'max')
)

# Formatting results
aggregated_data['Files'] = aggregated_data['Files'].apply(lambda x: f"{x:,d}")
aggregated_data['KB_Used'] = aggregated_data['KB_Used'].apply(lambda x: f"{x:,.0f}")
aggregated_data['Avg_Size_KB'] = aggregated_data['Avg_Size_KB'].apply(lambda x: f"{x:,.2f}")

display(aggregated_data)
Files KB_Used Avg_Size_KB Oldest_data Last_access
owner_name
0 40 0 0.00 2024-03-05 15:21:31.916288773 2024-03-05 15:25:42.213137875
trinos3 123 30,963,037 251,732.01 2024-03-05 15:00:39.594759193 2024-03-05 15:25:42.864619921

Catalog Snapshots Comparisons#

You can access catalog snapshots by navigating the schema space: querying a specific schema directory (representing the snapshot) within the bucket returns the catalog as it existed at that point in time. The most obvious use of snapshot comparisons is delete detection, followed by move detection. The script below compares the current state with a specific historical snapshot, identifying files present in the current table but not in the snapshot, based on their element_type and search_path.

def query_table(schema):
    table = vastdb_session.query('vast-big-catalog-bucket', schema, 'vast_big_catalog_table', filters=filters, num_sub_splits=8)
    df = table.to_pandas()
    df['full_path'] = df['parent_path'] + df['name']
    return set(df['full_path'])

# Query Filters
filters = {
    'element_type': ['eq FILE'],
    'search_path': ['eq /']
}

# Query the current table and the snapshot
current_set = query_table('vast_big_catalog_schema')
snapshot_set = query_table('.snapshot/bc_table_2023-12-10_13_53_36/vast_big_catalog_schema')

# Find differences (Current Table vs Snapshot)
difference = current_set - snapshot_set

# Output
if difference:
    print(f"[INFO] Found {len(difference)} files in the current table but not in the snapshot:")
    for item in difference:
        print(item)
else:
    print("[INFO] No differences found")