Catalog - Python SDK Examples#

!pip install --quiet vastdb

S3 Metadata (Tags)#

! s3cmd_configure.sh # custom script to set up s3cmd connection details
! s3cmd ls s3://csnow-bucket/nyt/ | awk 'NR<=10' # just show a few rows (10)
2025-01-22 14:57    481274128  s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
2024-12-09 15:54    458846485  s3://csnow-bucket/nyt/yellow_tripdata_2009-02.parquet
2024-12-09 15:55    498225013  s3://csnow-bucket/nyt/yellow_tripdata_2009-03.parquet
2024-12-09 15:56    494119681  s3://csnow-bucket/nyt/yellow_tripdata_2009-04.parquet
2024-12-09 15:56    515203538  s3://csnow-bucket/nyt/yellow_tripdata_2009-05.parquet
2024-12-09 15:57    491071155  s3://csnow-bucket/nyt/yellow_tripdata_2009-06.parquet
2024-12-09 15:57    471955931  s3://csnow-bucket/nyt/yellow_tripdata_2009-07.parquet
2024-12-09 15:58    477035993  s3://csnow-bucket/nyt/yellow_tripdata_2009-08.parquet
2024-12-09 15:58    488439674  s3://csnow-bucket/nyt/yellow_tripdata_2009-09.parquet
2024-12-09 15:59    543738743  s3://csnow-bucket/nyt/yellow_tripdata_2009-10.parquet

Remove previous user tags#

! s3cmd modify --remove-header='x-amz-meta-foo' s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
modify: 's3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet'  [1 of 1]
! s3cmd info s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet (object):
   File size: 481274128
   Last mod:  Wed, 22 Jan 2025 15:52:43 GMT
   MIME type: binary/octet-stream
   Storage:   STANDARD
   MD5 sum:   d41d8cd98f00b204e9800998ecf8427e-58
   SSE:       none
   Policy:    none
   CORS:      none
   ACL:       Chris Snow: FULL_CONTROL

Add user tags#

! s3cmd modify --add-header=x-amz-meta-foo:bar s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
modify: 's3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet'  [1 of 1]
! s3cmd info s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet (object):
   File size: 481274128
   Last mod:  Wed, 22 Jan 2025 15:52:45 GMT
   MIME type: binary/octet-stream
   Storage:   STANDARD
   MD5 sum:   d41d8cd98f00b204e9800998ecf8427e-58
   SSE:       none
   Policy:    none
   CORS:      none
   ACL:       Chris Snow: FULL_CONTROL
   x-amz-meta-foo: bar
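
The same metadata update can also be made from Python instead of s3cmd. Below is a minimal boto3 sketch (boto3 is assumed to be installed; the endpoint and credentials are placeholders for your own S3 connection details, and the bucket/key are the object used above):

import boto3

# hypothetical boto3 client for the same S3 endpoint that s3cmd was configured with
s3 = boto3.client(
    "s3",
    endpoint_url="http://your-vast-s3-endpoint",  # assumption: replace with your endpoint
    aws_access_key_id="YOUR_ACCESS_KEY",          # assumption: replace with your keys
    aws_secret_access_key="YOUR_SECRET_KEY",
)

bucket = "csnow-bucket"
key = "nyt/yellow_tripdata_2009-01.parquet"

# S3 has no in-place metadata edit: like `s3cmd modify`, this copies the object
# onto itself with a replacement metadata set (stored as x-amz-meta-foo: bar)
s3.copy_object(
    Bucket=bucket,
    Key=key,
    CopySource={"Bucket": bucket, "Key": key},
    Metadata={"foo": "bar"},
    MetadataDirective="REPLACE",
)

# verify, roughly equivalent to `s3cmd info`
print(s3.head_object(Bucket=bucket, Key=key)["Metadata"])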

Create VAST DB Catalog Session#

import os

VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

print(f"""
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={VASTDB_ACCESS_KEY[-4:]}
VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
---
""")
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
---
import pyarrow as pa
import vastdb

session = vastdb.connect(
    endpoint=VASTDB_ENDPOINT,
    access=VASTDB_ACCESS_KEY,
    secret=VASTDB_SECRET_KEY)
field_names = ['element_type'] # Only need the element_type field for counting

with session.transaction() as tx:

    # we will work with PyArrow Table (pa.Table) functionality
    pa_table = tx.catalog().select(columns=field_names).read_all()

    # you can also work with a pandas DataFrame, but pa_table is probably quicker
    pdf = pa_table.to_pandas()

How many elements are in the catalog?#

total_elements = pa_table.num_rows
print(f"Total elements in the catalog: {total_elements}")
Total elements in the catalog: 84836469

How many files/objects?#

file_count = pa_table.filter(pa.compute.field("element_type") == "FILE").num_rows
print(f"Number of files/objects: {file_count}")
Number of files/objects: 84692655

How many directories?#

dir_count = pa_table.filter(pa.compute.field("element_type") == "DIR").num_rows
print(f"Number of directories: {dir_count}")
Number of directories: 143671

How many Database tables?#

table_count = pa_table.filter(pa.compute.field("element_type") == "TABLE").num_rows
print(f"Number of database tables: {table_count}")
Number of database tables: 25

What are all of the elements on my system anyway?#

group_counts = pa.TableGroupBy(pa_table,"element_type").aggregate([])
group_counts.to_pandas()
element_type
0 FILE
1 DIR
2 SYMLINK
3 SCHEMA
4 TABLE

Alternative count approach#

group_counts = pa.TableGroupBy(pa_table,"element_type").aggregate([([], "count_all")])
group_counts.to_pandas()
element_type count_all
0 FILE 84692655
1 DIR 143671
2 SYMLINK 106
3 TABLE 25
4 SCHEMA 12
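
If you only need the per-type counts, PyArrow's value_counts compute function gives the same information without an explicit group-by. A minimal sketch, reusing the pa_table loaded above:

import pyarrow.compute as pc

# value_counts returns a struct array of {'values': element_type, 'counts': n}
for entry in pc.value_counts(pa_table["element_type"]).to_pylist():
    print(entry["values"], entry["counts"])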

Simplified example of count of elements returned from parallel execution#

The reader returned by select() executes the query iteratively, returning results in chunks as PyArrow RecordBatches. This enables efficient handling of large datasets by processing data in smaller, manageable segments. The simplified example below counts the elements returned from this chunked execution.

def query_and_count_elements(session, field_names):

    with session.transaction() as tx:
        # batch reader
        reader = tx.catalog().select(columns=field_names)

        elements_count = 0
        for record_batch in reader:
            elements_count += record_batch.num_rows

        return elements_count

# Query Parameters
field_names = ['element_type']  # Only need the element_type field for counting

# Perform the query
total_elements = query_and_count_elements(session, field_names)
print(f"Total elements in the catalog: {total_elements}")
Total elements in the catalog: 84836469

Simple Filtering#

  • first filter by pushing predicates down to the database to search only for changes since today

import ibis
import time
import pyarrow.compute as pc
from datetime import datetime
from ibis import _


# today's date - the predicate below should pick up only changes made today
date_str = time.strftime('%Y-%m-%d')

# Convert date string to epoch timestamp (in seconds)
epoch_seconds = int(time.mktime(time.strptime(date_str, '%Y-%m-%d')))

# Create an Ibis literal with epoch time and the correct data type
predicate = (_.mtime >= ibis.literal(epoch_seconds, type='timestamp'))

field_names = ['name', 'creation_time', 'uid', 'owner_name', 'size', 'user_metadata', 'user_tags']

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
name creation_time uid owner_name size user_metadata user_tags
0 file_15624.txt 2025-01-22 14:45:21.141552984 5069 John Gorski 20480 None None
1 file_2386.txt 2025-01-22 14:46:03.517637761 5069 John Gorski 20480 None None
2 file_21513.txt 2025-01-22 14:45:35.738343786 5069 John Gorski 20480 None None
3 subfolder_1 2025-01-22 14:45:52.032227101 5069 John Gorski 4096 None None
4 file_14682.txt 2025-01-22 14:46:06.120785539 5069 John Gorski 20480 None None
... ... ... ... ... ... ... ...
84177 file_27536.txt 2025-01-22 15:21:04.826954108 5069 John Gorski 20480 None None
84178 file_15986.txt 2025-01-22 15:20:28.161570268 5069 John Gorski 20480 None None
84179 file_29581.txt 2025-01-22 15:19:00.046412901 5069 John Gorski 20480 None None
84180 file_8016.txt 2025-01-22 15:19:17.964053479 5069 John Gorski 20480 None None
84181 file_9210.txt 2025-01-22 15:20:16.693873580 5069 John Gorski 20480 None None

84182 rows × 7 columns

  • then post-filter the returned dataset for rows where the user tags are not null

  • we should see the file we updated using s3cmd

df[df['user_tags'].notnull()]
name creation_time uid owner_name size user_metadata user_tags
24735 yellow_tripdata_2009-01.parquet 2024-12-09 15:54:27.143172336 5102 Chris Snow 481274128 [(foo, bar)] [(key1, value1), (key2, value2)]
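
The user_metadata and user_tags columns are map-typed, so they come back in pandas as lists of (key, value) tuples. A small sketch (assuming that representation) converts them to dictionaries for easier lookup:

tagged = df[df['user_tags'].notnull()].copy()

# map-typed columns arrive in pandas as lists of (key, value) tuples
tagged['user_tags'] = tagged['user_tags'].map(dict)
tagged['user_metadata'] = tagged['user_metadata'].map(
    lambda kv: dict(kv) if kv is not None else None)

tagged[['name', 'user_metadata', 'user_tags']]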

Query for Specific File Types Across Different Users#

field_names = ['uid', 'owner_name', 'element_type']
predicate = \
    ((_.element_type == 'FILE') | (_.element_type == 'TABLE') | (_.element_type == 'DIR')) & \
    ((_.uid == 500) | (_.uid == 1000))

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
uid owner_name element_type
0 1000 vastdata FILE
1 1000 vastdata FILE
2 1000 vastdata FILE
3 1000 vastdata FILE
4 1000 vastdata FILE
... ... ... ...
40386704 1000 vastdata FILE
40386705 1000 vastdata FILE
40386706 1000 vastdata FILE
40386707 1000 vastdata FILE
40386708 1000 vastdata FILE

40386709 rows × 3 columns

Query for Objects Based on User and Specific Extensions#

field_names = ['uid', 'extension', 'size']
predicate = \
    ((_.extension == 'log') | (_.extension == 'ldb')) & \
    ((_.uid == 555) | (_.uid == 1000))

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
uid extension size
0 1000 log 69641674
1 1000 log 115
2 1000 log 60046685
3 1000 log 115
4 1000 log 39745
... ... ... ...
52808 1000 log 0
52809 1000 log 0
52810 1000 log 0
52811 1000 log 72921728
52812 1000 log 39765

52813 rows × 3 columns

Query for Specific File Types with Size Constraints#

field_names = ['element_type', 'size', 'name']
predicate = \
    (_.element_type == 'FILE') & \
    ((_.size > 50000) & (_.size < 1000000)) # size between 50 KB and 1 MB

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
element_type size name
0 FILE 128609 NA19191.seg.called.merged
1 FILE 182032 Theilman.mp4
2 FILE 172816 out-of-townish.exr
3 FILE 231184 beflagged.json
4 FILE 401168 headgate.xml
... ... ... ...
9830564 FILE 131072 r96-f243
9830565 FILE 131072 r168-f156
9830566 FILE 131072 r168-f222
9830567 FILE 131072 r108-f75
9830568 FILE 131072 r229-f47

9830569 rows × 3 columns

Query for Large TABLE Objects by Specific Users#

field_names = ['uid', 'owner_name', 'size', 'element_type']
predicate = \
    (_.uid == 5102) & \
    (_.element_type == 'TABLE') & \
    (_.size > 10000000) # greater than 10 MB


with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
uid owner_name size element_type
0 5102 Chris Snow 295359979974 TABLE
1 5102 Chris Snow 30638775201 TABLE
2 5102 Chris Snow 225006991 TABLE
3 5102 Chris Snow 193123235385 TABLE
4 5102 Chris Snow 144481791 TABLE

Timestamp Filtering#

Query by birth date: VAST uses a “creation_time” column to indicate when a new element is created. The query below outputs all objects created after noon on January 1st, 2025. It will not output files that have merely been moved to a new path.

NOTE: The same method can be applied to access time (atime), modification time (mtime) and metadata-update time (ctime); see the atime sketch after the example below.

import pandas as pd
from ibis import _

# increase display width
pd.set_option('display.max_colwidth', 1000)  # Limit string length to 1000 characters
pd.set_option('display.width', 1000)  # Set the total display width to 1000 characters

# Original timestamp filtering
timestamp_birthdate = pd.Timestamp('2025-01-01 12:00:01')

# Modern predicate approach
predicate = (_.creation_time > timestamp_birthdate) 

field_names = ['creation_time', 'parent_path', 'name']

with session.transaction() as tx:
    table = tx.catalog().select(
        columns=field_names,
        predicate=predicate
    ).read_all()
    
    df = table.to_pandas()
    
# Add full path column
df['full_path'] = df['parent_path'] + df['name']

# Display results
print("Objects created after 2025-01-01 12:00:01:")
df['full_path']
Objects created after 2025-01-01 12:00:01:
0                                                                                                                                                      /lukes3/small_16k/r96/d0/r96-f5
1                                                                                                                                                    /lukes3/small_16k/r44/d3/r44-f115
2                                                                                                                                                    /lukes3/small_16k/r14/d9/r14-f195
3                                                                                                                                                    /lukes3/small_16k/r32/d8/r32-f212
4                                                                                                                                                    /lukes3/small_16k/r94/d0/r94-f253
                                                                                              ...                                                                                     
4490756    /venable/deep_filesystem/subfolder_1/subfolder_2/subfolder_2/subfolder_2/subfolder_2/subfolder_1/subfolder_2/subfolder_1/subfolder_1/subfolder_1/subfolder_1/file_30544.txt
4490757    /venable/deep_filesystem/subfolder_2/subfolder_2/subfolder_2/subfolder_2/subfolder_1/subfolder_2/subfolder_2/subfolder_1/subfolder_2/subfolder_2/subfolder_1/file_25566.txt
4490758    /venable/deep_filesystem/subfolder_2/subfolder_1/subfolder_1/subfolder_1/subfolder_1/subfolder_1/subfolder_2/subfolder_2/subfolder_2/subfolder_1/subfolder_2/file_31592.txt
4490759     /venable/deep_filesystem/subfolder_1/subfolder_1/subfolder_1/subfolder_1/subfolder_2/subfolder_1/subfolder_1/subfolder_1/subfolder_1/subfolder_2/subfolder_1/file_2372.txt
4490760    /venable/deep_filesystem/subfolder_1/subfolder_2/subfolder_1/subfolder_2/subfolder_1/subfolder_2/subfolder_1/subfolder_2/subfolder_2/subfolder_2/subfolder_2/file_14637.txt
Name: full_path, Length: 4490761, dtype: object
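
As noted above, the same pattern works for the other time columns. A minimal sketch (same session, assuming the catalog's atime column) that finds files not accessed since the start of 2025:

import pandas as pd
from ibis import _

# files whose last access time is older than the cutoff
cutoff = pd.Timestamp('2025-01-01 00:00:00')
predicate = (_.element_type == 'FILE') & (_.atime < cutoff)

field_names = ['atime', 'parent_path', 'name', 'size']

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    stale_df = table.to_pandas()

print(f"Files not accessed since {cutoff}: {len(stale_df)}")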

Reporting#

Simple queries that report basic statistics on a section of the namespace - for example, summarizing files of a certain type (FILE), belonging to a specific user (uid=555), and located under a certain path (/parquet-files-bucket).

The previous query returned:

Objects created after 2025-01-01 12:00:01:

0  /lukes3/small_16k/r96/d0/r96-f5
1  /lukes3/small_16k/r44/d3/r44-f115
...

Let’s use the /lukes3 search path.

import numpy as np

predicate = (_.search_path == '/lukes3') & (_.element_type == 'FILE')
field_names = ['uid', 'used', 'size']

with session.transaction() as tx:
    table = tx.catalog().select(
        columns=field_names,
        predicate=predicate
    ).read_all()
    df = table.to_pandas()

if df.empty:
    print("No data returned from query. Please check filters and field names.")
else:
    formatted_results = {
        'users': f"{df['uid'].nunique():,d}",
        'Files': f"{len(df):,d}",
        'KB_Used': f"{(df['used'].sum() / 1000):,.0f}",
        'Avg_Size_KB': f"{(df['size'].mean() / 1000):,.2f}"
    }
    print("Aggregated Results:")
    print(formatted_results)
Aggregated Results:
{'users': '1', 'Files': '655,360', 'KB_Used': '10,737,418', 'Avg_Size_KB': '16.38'}

Capacity Grouping & Usage report#

Here’s a report on all the users on the system: get files across the whole system (‘/’), group by owner_name, and compute the file count, total and average size in kilobytes, oldest creation time, and most recent access time for each file owner. Note - display is an IPython function which renders results in table format.

from IPython.display import display
import pandas as pd
import numpy as np

predicate = (_.element_type == 'FILE') & (_.search_path == '/')
field_names = ['owner_name', 'used', 'size', 'creation_time', 'atime']

with session.transaction() as tx:
    table = tx.catalog().select(
        columns=field_names,
        predicate=predicate
    ).read_all()
    df = table.to_pandas()

pd.options.display.max_columns = None

aggregated_data = df.groupby('owner_name').agg(
    Files=('owner_name', 'count'),
    KB_Used=('used', lambda x: np.sum(x)/1000),
    Avg_Size_KB=('size', lambda x: np.mean(x)/1000),
    Oldest_data=('creation_time', 'min'),
    Last_access=('atime', 'max')
)

# Format numeric columns separately
aggregated_data['Files'] = aggregated_data['Files'].map(lambda x: f"{int(x):,d}")
aggregated_data['KB_Used'] = aggregated_data['KB_Used'].map(lambda x: f"{x:,.0f}")
aggregated_data['Avg_Size_KB'] = aggregated_data['Avg_Size_KB'].map(lambda x: f"{x:,.2f}")

display(aggregated_data)
Files KB_Used Avg_Size_KB Oldest_data Last_access
owner_name
0 15,471,800 14,186,299,798 916.91 2024-11-13 01:24:02.260109671 2025-01-22 03:38:06.346023569
65534 161 171,798,692 1,067,072.62 2024-12-19 12:48:08.385283415 2024-12-23 15:19:24.340852233
Chris Snow 291 24,002,206 82,481.81 2024-11-19 12:53:55.710455713 2024-12-09 16:32:40.094875938
David Petika 70 219 3.13 2024-11-13 12:02:56.050907794 2024-12-24 16:11:46.504352451
Edwin Nadar 1 0 0.01 2025-01-16 09:24:52.648151168 2025-01-16 09:24:52.668121034
... ... ... ... ... ...
vperfsanity-james 71,072 54,768,920 770.61 2024-11-12 20:54:25.382017166 2024-11-12 17:13:30.648724050
vperfsanity-sven 126,440 96,829,335 765.81 2024-11-12 21:03:31.871820181 2024-11-12 17:48:59.729557892
wcash-flow 146,852 113,762,566 774.67 2024-11-12 18:41:59.497452192 2024-11-12 18:36:38.979644341
wong.tran-flow 117,288 90,381,655 770.60 2024-11-12 23:38:24.134942342 2024-11-12 18:05:46.249753724
yogi.berra-flow 12,372 9,464,880 765.02 2024-11-12 19:10:25.253624458 2024-11-12 18:12:11.261864289

174 rows × 5 columns
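
A common follow-up is ranking owners by consumption. A small sketch that reuses the unformatted df from the query above to list the top ten owners by raw bytes used:

# top 10 owners by total bytes used (before any string formatting)
top_usage = (
    df.groupby('owner_name')['used']
      .sum()
      .sort_values(ascending=False)
      .head(10)
)
print(top_usage)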

TODO: This example needs to be improved.#

See: snowch/vast-docker-compose-examples#11

Catalog Snapshot Comparisons#

You can access catalog snapshots by navigating the schema space. The most obvious use of snapshot comparisons is delete detection, followed by move detection. The delete-detection script below compares the current state with a specific historical snapshot, identifying files present in the current table but not in the snapshot, based on their element_type and search_path. Access to a snapshot works by querying a specific schema directory (representing the snapshot) within the bucket.

with session.transaction() as tx:
    snapshots = tx.catalog_snapshots()
    for snapshot in snapshots:
        print(snapshot.name)
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_08_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_13_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_18_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_23_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_28_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_33_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_38_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_43_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_48_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_53_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_58_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_22_03_29
vast-big-catalog-bucket/.snapshot/beckie_internal_snap-2025-01-22_21_58_29
vast-big-catalog-bucket/.snapshot/beckie_internal_snap-2025-01-22_22_03_29
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_22_03_29_UTC
62
import re

timestamp_pattern = r"(\d{4}-\d{2}-\d{2}_\d{2}_\d{2}_\d{2})"
timestamps = []

for snapshot in snapshots:
    match = re.search(timestamp_pattern, snapshot.name)
    if match:
        timestamps.append(match.group(1))

# Convert to datetime objects for sorting
datetime_timestamps = [datetime.strptime(ts, "%Y-%m-%d_%H_%M_%S") for ts in timestamps]

# Get the first and last dates
first_date = min(datetime_timestamps)
last_date = max(datetime_timestamps)
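# note: snapshot names are not guaranteed to sort chronologically, so first_date / last_date
# may not match snapshots[0] / snapshots[-1] in the prints below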

print("First snapshot:", first_date, snapshots[0].name)
print("Last snapshot:", last_date, snapshots[-1].name)
First snapshot: 2025-01-22 18:08:29 vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_08_29
Last snapshot: 2025-01-22 22:03:29 vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_22_03_29_UTC
import time
from datetime import datetime
from ibis import _

# TODO: move to with tx
date_obj = first_date
epoch_seconds = int(date_obj.timestamp())

def query_table():

    with session.transaction() as tx:
        snapshots = tx.catalog_snapshots()

        predicate_0 = (_.element_type == 'FILE') & \
                    (_.search_path == '/') & \
                    (_.mtime <= ibis.literal(epoch_seconds, type='timestamp'))
        
        predicate_1 = (_.element_type == 'FILE') & \
                    (_.search_path == '/') & \
                    (_.mtime > ibis.literal(epoch_seconds, type='timestamp'))
        
        columns = ['parent_path', 'name']

        if len(snapshots) < 2:
            raise Exception(f"Need at least two snapshots. Found {len(snapshots)}")

        df_0 = tx.catalog(snapshot=snapshots[0]).select(columns=columns, predicate=predicate_0).read_all().to_pandas()
        df_1 = tx.catalog(snapshot=snapshots[-1]).select(columns=columns, predicate=predicate_1).read_all().to_pandas()

    return df_0, df_1
df_0, df_1 = query_table()
df_0
parent_path name
0 /dm/s3/endpoint/dm02/r1/d4/ r1-f5141
1 /dm/s3/endpoint/dm02/r48/d3/ r48-f3212
2 /dm/s3/endpoint/dm01/r57/d0/ r57-f3403
3 /scotth/shb1/X1kib/r23/d3/ r23-f30
4 /dm/s3/endpoint/dm04/r13/d1/ r13-f2549
... ... ...
84594047 /sven/jan_test/elbencho/r10/d0/ r10-f6
84594048 /sven/jan_test/elbencho/r21/d0/ r21-f4
84594049 /sven/jan_test/elbencho/r74/d0/ r74-f7
84594050 /sven/jan_test/elbencho/r87/d0/ r87-f5
84594051 /sven/jan_test/elbencho/r64/d0/ r64-f3

84594052 rows × 2 columns

df_1
parent_path name
paths_0 = set(df_0['parent_path'] + df_0['name'])
paths_1 = set(df_1['parent_path'] + df_1['name'])

differences = paths_0 - paths_1
differences
if differences:
    print(f"\n[INFO] Found {len(differences)} files in first snapshot but not in second:")
    for item in sorted(differences):
        print(item)
else:
    print("\n[INFO] No differences found")