Catalog - Python SDK Examples
!pip install --quiet vastdb
TODO: This example needs to be improved.
See: snowch/vast-docker-compose-examples#11
Catalog Snapshot Comparisons
You can access catalog snapshots by navigating the schema space. The most obvious use of snapshot comparison is delete detection, followed by move detection. Delete detection – what the query returns: the script compares the current state with a specific historical snapshot, identifying files that are present in the current table but not in the snapshot, based on their element_type and search_path. Access to snapshots: a snapshot is accessed by querying a specific schema directory (representing the snapshot) within the bucket.
# Fetch the catalog snapshots inside a transaction and print each one's name.
# `snapshots` stays bound afterwards so later cells can reuse the list.
with session.transaction() as tx:
    snapshots = tx.catalog_snapshots()
    for snap in snapshots:
        print(snap.name)
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_08_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_13_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_18_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_23_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_28_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_33_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_38_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_43_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_48_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_53_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_58_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_22_03_29
vast-big-catalog-bucket/.snapshot/beckie_internal_snap-2025-01-22_21_58_29
vast-big-catalog-bucket/.snapshot/beckie_internal_snap-2025-01-22_22_03_29
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_22_03_29_UTC
62
import re
from datetime import datetime  # needed here: strptime is called in this cell

# Snapshot names embed a timestamp of the form YYYY-MM-DD_HH_MM_SS.
timestamp_pattern = r"(\d{4}-\d{2}-\d{2}_\d{2}_\d{2}_\d{2})"

timestamps = []           # raw timestamp strings, in listing order
datetime_timestamps = []  # the same timestamps parsed into datetime objects
dated_snapshots = []      # (parsed timestamp, snapshot) pairs

# Keep every parsed timestamp together with the snapshot it came from, so the
# earliest/latest *snapshot* can be reported rather than whatever happens to
# sit first/last in listing order. Names without a timestamp are skipped.
for snapshot in snapshots:
    match = re.search(timestamp_pattern, snapshot.name)
    if match:
        parsed = datetime.strptime(match.group(1), "%Y-%m-%d_%H_%M_%S")
        timestamps.append(match.group(1))
        datetime_timestamps.append(parsed)
        dated_snapshots.append((parsed, snapshot))

if not dated_snapshots:
    raise ValueError("No snapshot name contains a YYYY-MM-DD_HH_MM_SS timestamp")

# NOTE(review): only some snapshot names carry a `_UTC` suffix; the timestamps
# are compared here as naive values - confirm they all share one time zone.
# Compare on the timestamp only; snapshot objects may not be orderable.
first_date, first_snapshot = min(dated_snapshots, key=lambda pair: pair[0])
last_date, last_snapshot = max(dated_snapshots, key=lambda pair: pair[0])

print("First snapshot:", first_date, first_snapshot.name)
print("Last snapshot:", last_date, last_snapshot.name)
First snapshot: 2025-01-22 18:08:29 vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_08_29
Last snapshot: 2025-01-22 22:03:29 vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_22_03_29_UTC
import time
from datetime import datetime
from ibis import _
# TODO: move to with tx
# Cut-off instant, in whole seconds since the epoch, derived from the earliest
# snapshot timestamp. query_table() reads it as a module-level value.
# NOTE(review): first_date is a naive datetime, so .timestamp() interprets it
# in the machine's local time zone - confirm that matches the snapshot names.
date_obj = first_date
epoch_seconds = int(date_obj.timestamp())


def query_table():
    """Query two catalog snapshots for files on either side of the cut-off.

    Opens a transaction, lists the catalog snapshots, and reads the
    ``parent_path`` and ``name`` columns of rows whose ``element_type`` is
    ``'FILE'`` and whose ``search_path`` is ``'/'``:

    * from ``snapshots[0]``: rows with ``mtime <= epoch_seconds``
    * from ``snapshots[-1]``: rows with ``mtime > epoch_seconds``

    NOTE(review): ``snapshots[0]`` / ``snapshots[-1]`` are the first and last
    entries in listing order, which is not necessarily chronological -
    confirm these are the snapshots that should be compared.

    Returns:
        A ``(df_0, df_1)`` tuple of pandas DataFrames, one per snapshot.

    Raises:
        RuntimeError: if fewer than two snapshots are available.
    """
    # `ibis.literal` needs the module itself; the file only imports `_`.
    import ibis

    with session.transaction() as tx:
        snapshots = tx.catalog_snapshots()

        # Fail fast, before building predicates or touching the catalog.
        if len(snapshots) < 2:
            raise RuntimeError(f"Need at least two snapshots. Found {len(snapshots)}")

        cutoff = ibis.literal(epoch_seconds, type='timestamp')
        is_root_file = (_.element_type == 'FILE') & (_.search_path == '/')
        predicate_0 = is_root_file & (_.mtime <= cutoff)
        predicate_1 = is_root_file & (_.mtime > cutoff)

        columns = ['parent_path', 'name']

        df_0 = tx.catalog(snapshot=snapshots[0]).select(columns=columns, predicate=predicate_0).read_all().to_pandas()
        df_1 = tx.catalog(snapshot=snapshots[-1]).select(columns=columns, predicate=predicate_1).read_all().to_pandas()

    return df_0, df_1


df_0, df_1 = query_table()
df_0
parent_path | name | |
---|---|---|
0 | /dm/s3/endpoint/dm02/r1/d4/ | r1-f5141 |
1 | /dm/s3/endpoint/dm02/r48/d3/ | r48-f3212 |
2 | /dm/s3/endpoint/dm01/r57/d0/ | r57-f3403 |
3 | /scotth/shb1/X1kib/r23/d3/ | r23-f30 |
4 | /dm/s3/endpoint/dm04/r13/d1/ | r13-f2549 |
... | ... | ... |
84594047 | /sven/jan_test/elbencho/r10/d0/ | r10-f6 |
84594048 | /sven/jan_test/elbencho/r21/d0/ | r21-f4 |
84594049 | /sven/jan_test/elbencho/r74/d0/ | r74-f7 |
84594050 | /sven/jan_test/elbencho/r87/d0/ | r87-f5 |
84594051 | /sven/jan_test/elbencho/r64/d0/ | r64-f3 |
84594052 rows × 2 columns
df_1
parent_path | name |
---|
# Build the set of full paths (parent_path followed by name) for each result.
paths_0 = set(df_0['parent_path'] + df_0['name'])
paths_1 = set(df_1['parent_path'] + df_1['name'])

# Paths that appear in the first result but are missing from the second.
differences = paths_0.difference(paths_1)
differences

if not differences:
    print("\n[INFO] No differences found")
else:
    print(f"\n[INFO] Found {len(differences)} files in first snapshot but not in second:")
    for path in sorted(differences):
        print(path)