Catalog - Python SDK Examples#

!pip install --quiet vastdb

S3 Metadata (Tags)#

! s3cmd_configure.sh # custom script to set up s3cmd connection details
! s3cmd ls s3://csnow-bucket/nyt/ | awk 'NR<=10' # just show a few rows (10)
2025-01-22 14:57    481274128  s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
2024-12-09 15:54    458846485  s3://csnow-bucket/nyt/yellow_tripdata_2009-02.parquet
2024-12-09 15:55    498225013  s3://csnow-bucket/nyt/yellow_tripdata_2009-03.parquet
2024-12-09 15:56    494119681  s3://csnow-bucket/nyt/yellow_tripdata_2009-04.parquet
2024-12-09 15:56    515203538  s3://csnow-bucket/nyt/yellow_tripdata_2009-05.parquet
2024-12-09 15:57    491071155  s3://csnow-bucket/nyt/yellow_tripdata_2009-06.parquet
2024-12-09 15:57    471955931  s3://csnow-bucket/nyt/yellow_tripdata_2009-07.parquet
2024-12-09 15:58    477035993  s3://csnow-bucket/nyt/yellow_tripdata_2009-08.parquet
2024-12-09 15:58    488439674  s3://csnow-bucket/nyt/yellow_tripdata_2009-09.parquet
2024-12-09 15:59    543738743  s3://csnow-bucket/nyt/yellow_tripdata_2009-10.parquet

Remove previous user tags#

! s3cmd modify --remove-header='x-amz-meta-foo' s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
modify: 's3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet'  [1 of 1]
! s3cmd info s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet (object):
   File size: 481274128
   Last mod:  Wed, 22 Jan 2025 15:52:43 GMT
   MIME type: binary/octet-stream
   Storage:   STANDARD
   MD5 sum:   d41d8cd98f00b204e9800998ecf8427e-58
   SSE:       none
   Policy:    none
   CORS:      none
   ACL:       Chris Snow: FULL_CONTROL

Add user tags#

! s3cmd modify --add-header=x-amz-meta-foo:bar s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
modify: 's3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet'  [1 of 1]
! s3cmd info s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet (object):
   File size: 481274128
   Last mod:  Wed, 22 Jan 2025 15:52:45 GMT
   MIME type: binary/octet-stream
   Storage:   STANDARD
   MD5 sum:   d41d8cd98f00b204e9800998ecf8427e-58
   SSE:       none
   Policy:    none
   CORS:      none
   ACL:       Chris Snow: FULL_CONTROL
   x-amz-meta-foo: bar
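
The same metadata update can also be made from Python instead of s3cmd. Below is a minimal boto3 sketch (boto3 is assumed to be installed; the endpoint and credentials are placeholders for your own S3 connection details, and the bucket/key are the object used above):

import boto3

# hypothetical boto3 client for the same S3 endpoint that s3cmd was configured with
s3 = boto3.client(
    "s3",
    endpoint_url="http://your-vast-s3-endpoint",  # assumption: replace with your endpoint
    aws_access_key_id="YOUR_ACCESS_KEY",          # assumption: replace with your keys
    aws_secret_access_key="YOUR_SECRET_KEY",
)

bucket = "csnow-bucket"
key = "nyt/yellow_tripdata_2009-01.parquet"

# S3 has no in-place metadata edit: like `s3cmd modify`, this copies the object
# onto itself with a replacement metadata set (stored as x-amz-meta-foo: bar)
s3.copy_object(
    Bucket=bucket,
    Key=key,
    CopySource={"Bucket": bucket, "Key": key},
    Metadata={"foo": "bar"},
    MetadataDirective="REPLACE",
)

# verify, roughly equivalent to `s3cmd info`
print(s3.head_object(Bucket=bucket, Key=key)["Metadata"])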

Create VAST DB Catalog Session#

import os

VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

print(f"""
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={VASTDB_ACCESS_KEY[-4:]}
VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
---
""")
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
---
import pyarrow as pa
import vastdb

session = vastdb.connect(
    endpoint=VASTDB_ENDPOINT,
    access=VASTDB_ACCESS_KEY,
    secret=VASTDB_SECRET_KEY)
field_names = ['element_type'] # Only need the element_type field for counting

with session.transaction() as tx:

    # we will work with PyArrow Table (pa.Table) functionality
    pa_table = tx.catalog().select(columns=field_names).read_all()

    # you can also work with a pandas DataFrame, but pa_table is probably quicker
    pdf = pa_table.to_pandas()

How many elements are in the catalog?#

total_elements = pa_table.num_rows
print(f"Total elements in the catalog: {total_elements}")
Total elements in the catalog: 84836469

How many files/objects?#

file_count = pa_table.filter(pa.compute.field("element_type") == "FILE").num_rows
print(f"Number of files/objects: {file_count}")
Number of files/objects: 84692655

How many directories?#

dir_count = pa_table.filter(pa.compute.field("element_type") == "DIR").num_rows
print(f"Number of directories: {dir_count}")
Number of directories: 143671

How many Database tables?#

table_count = pa_table.filter(pa.compute.field("element_type") == "TABLE").num_rows
print(f"Number of database tables: {table_count}")
Number of database tables: 25

What are all of the elements on my system anyway?#

group_counts = pa.TableGroupBy(pa_table,"element_type").aggregate([])
group_counts.to_pandas()
element_type
0 FILE
1 DIR
2 SYMLINK
3 SCHEMA
4 TABLE

Alternative count approach#

group_counts = pa.TableGroupBy(pa_table,"element_type").aggregate([([], "count_all")])
group_counts.to_pandas()
element_type count_all
0 FILE 84692655
1 DIR 143671
2 SYMLINK 106
3 TABLE 25
4 SCHEMA 12
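
If you only need the per-type counts, PyArrow's value_counts compute function gives the same information without an explicit group-by. A minimal sketch, reusing the pa_table loaded above:

import pyarrow.compute as pc

# value_counts returns a struct array of {'values': element_type, 'counts': n}
for entry in pc.value_counts(pa_table["element_type"]).to_pylist():
    print(entry["values"], entry["counts"])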

Simplified example of count of elements returned from parallel execution#

The reader returned by select() executes the query iteratively, returning results in chunks as PyArrow RecordBatches. This enables efficient handling of large datasets by processing data in smaller, manageable segments. The simplified example below counts the elements returned from this chunked execution.

def query_and_count_elements(session, field_names):

    with session.transaction() as tx:
        # batch reader
        reader = tx.catalog().select(columns=field_names)

        elements_count = 0
        for record_batch in reader:
            elements_count += record_batch.num_rows

        return elements_count

# Query Parameters
field_names = ['element_type']  # Only need the element_type field for counting

# Perform the query
total_elements = query_and_count_elements(session, field_names)
print(f"Total elements in the catalog: {total_elements}")
Total elements in the catalog: 84836469

Simple Filtering#

  • first filter by pushing predicates down to the database to search only for changes since today

import ibis
import time
import pyarrow.compute as pc
from datetime import datetime
from ibis import _


# today's date - the predicate below should pick up only changes made today
date_str = time.strftime('%Y-%m-%d')

# Convert date string to epoch timestamp (in seconds)
epoch_seconds = int(time.mktime(time.strptime(date_str, '%Y-%m-%d')))

# Create an Ibis literal with epoch time and the correct data type
predicate = (_.mtime >= ibis.literal(epoch_seconds, type='timestamp'))

field_names = ['name', 'creation_time', 'uid', 'owner_name', 'size', 'user_metadata', 'user_tags']

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
name creation_time uid owner_name size user_metadata user_tags
0 file_15624.txt 2025-01-22 14:45:21.141552984 5069 John Gorski 20480 None None
1 file_2386.txt 2025-01-22 14:46:03.517637761 5069 John Gorski 20480 None None
2 file_21513.txt 2025-01-22 14:45:35.738343786 5069 John Gorski 20480 None None
3 subfolder_1 2025-01-22 14:45:52.032227101 5069 John Gorski 4096 None None
4 file_14682.txt 2025-01-22 14:46:06.120785539 5069 John Gorski 20480 None None
... ... ... ... ... ... ... ...
84177 file_27536.txt 2025-01-22 15:21:04.826954108 5069 John Gorski 20480 None None
84178 file_15986.txt 2025-01-22 15:20:28.161570268 5069 John Gorski 20480 None None
84179 file_29581.txt 2025-01-22 15:19:00.046412901 5069 John Gorski 20480 None None
84180 file_8016.txt 2025-01-22 15:19:17.964053479 5069 John Gorski 20480 None None
84181 file_9210.txt 2025-01-22 15:20:16.693873580 5069 John Gorski 20480 None None

84182 rows × 7 columns

  • then post-filter the returned dataset for rows where the user tags are not null

  • we should see the file we updated using s3cmd

df[df['user_tags'].notnull()]
name creation_time uid owner_name size user_metadata user_tags
24735 yellow_tripdata_2009-01.parquet 2024-12-09 15:54:27.143172336 5102 Chris Snow 481274128 [(foo, bar)] [(key1, value1), (key2, value2)]
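
The user_metadata and user_tags columns are map-typed, so they come back in pandas as lists of (key, value) tuples. A small sketch (assuming that representation) converts them to dictionaries for easier lookup:

tagged = df[df['user_tags'].notnull()].copy()

# map-typed columns arrive in pandas as lists of (key, value) tuples
tagged['user_tags'] = tagged['user_tags'].map(dict)
tagged['user_metadata'] = tagged['user_metadata'].map(
    lambda kv: dict(kv) if kv is not None else None)

tagged[['name', 'user_metadata', 'user_tags']]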

Query for Specific File Types Across Different Users#

field_names = ['uid', 'owner_name', 'element_type']
predicate = \
    ((_.element_type == 'FILE') | (_.element_type == 'TABLE') | (_.element_type == 'DIR')) & \
    ((_.uid == 500) | (_.uid == 1000))

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
uid owner_name element_type
0 1000 vastdata FILE
1 1000 vastdata FILE
2 1000 vastdata FILE
3 1000 vastdata FILE
4 1000 vastdata FILE
... ... ... ...
40386704 1000 vastdata FILE
40386705 1000 vastdata FILE
40386706 1000 vastdata FILE
40386707 1000 vastdata FILE
40386708 1000 vastdata FILE

40386709 rows × 3 columns

Query for Objects Based on User and Specific Extensions#

field_names = ['uid', 'extension', 'size']
predicate = \
    ((_.extension == 'log') | (_.extension == 'ldb')) & \
    ((_.uid == 555) | (_.uid == 1000))

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
uid extension size
0 1000 log 69641674
1 1000 log 115
2 1000 log 60046685
3 1000 log 115
4 1000 log 39745
... ... ... ...
52808 1000 log 0
52809 1000 log 0
52810 1000 log 0
52811 1000 log 72921728
52812 1000 log 39765

52813 rows × 3 columns

Query for Specific File Types with Size Constraints#

field_names = ['element_type', 'size', 'name']
predicate = \
    (_.element_type == 'FILE') & \
    ((_.size > 50000) & (_.size < 1000000)) # size between 50 KB and 1 MB

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
element_type size name
0 FILE 128609 NA19191.seg.called.merged
1 FILE 182032 Theilman.mp4
2 FILE 172816 out-of-townish.exr
3 FILE 231184 beflagged.json
4 FILE 401168 headgate.xml
... ... ... ...
9830564 FILE 131072 r96-f243
9830565 FILE 131072 r168-f156
9830566 FILE 131072 r168-f222
9830567 FILE 131072 r108-f75
9830568 FILE 131072 r229-f47

9830569 rows × 3 columns

Query for Large TABLE Objects by Specific Users#

field_names = ['uid', 'owner_name', 'size', 'element_type']
predicate = \
    (_.uid == 5102) & \
    (_.element_type == 'TABLE') & \
    (_.size > 10000000) # greater than 10 MB


with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df
uid owner_name size element_type
0 5102 Chris Snow 295359979974 TABLE
1 5102 Chris Snow 30638775201 TABLE
2 5102 Chris Snow 225006991 TABLE
3 5102 Chris Snow 193123235385 TABLE
4 5102 Chris Snow 144481791 TABLE

Timestamp Filtering#

Query by birth date: VAST uses a “creation_time” column to indicate when a new element is created. The query below outputs all objects created after noon on January 1st, 2025. It will not output files that have merely been moved to a new path.

NOTE: The same method can be applied to access time (atime), modification time (mtime) and metadata-update time (ctime); see the atime sketch after the example below.

import pandas as pd
from ibis import _

# increase display width
pd.set_option('display.max_colwidth', 1000)  # Limit string length to 1000 characters
pd.set_option('display.width', 1000)  # Set the total display width to 1000 characters

# Original timestamp filtering
timestamp_birthdate = pd.Timestamp('2025-01-01 12:00:01')

# Modern predicate approach
predicate = (_.creation_time > timestamp_birthdate) 

field_names = ['creation_time', 'parent_path', 'name']

with session.transaction() as tx:
    table = tx.catalog().select(
        columns=field_names,
        predicate=predicate
    ).read_all()
    
    df = table.to_pandas()
    
# Add full path column
df['full_path'] = df['parent_path'] + df['name']

# Display results
print("Objects created after 2025-01-01 12:00:01:")
df['full_path']
Objects created after 2025-01-01 12:00:01:
0                                                                                                                                                      /lukes3/small_16k/r96/d0/r96-f5
1                                                                                                                                                    /lukes3/small_16k/r44/d3/r44-f115
2                                                                                                                                                    /lukes3/small_16k/r14/d9/r14-f195
3                                                                                                                                                    /lukes3/small_16k/r32/d8/r32-f212
4                                                                                                                                                    /lukes3/small_16k/r94/d0/r94-f253
                                                                                              ...                                                                                     
4490756    /venable/deep_filesystem/subfolder_1/subfolder_2/subfolder_2/subfolder_2/subfolder_2/subfolder_1/subfolder_2/subfolder_1/subfolder_1/subfolder_1/subfolder_1/file_30544.txt
4490757    /venable/deep_filesystem/subfolder_2/subfolder_2/subfolder_2/subfolder_2/subfolder_1/subfolder_2/subfolder_2/subfolder_1/subfolder_2/subfolder_2/subfolder_1/file_25566.txt
4490758    /venable/deep_filesystem/subfolder_2/subfolder_1/subfolder_1/subfolder_1/subfolder_1/subfolder_1/subfolder_2/subfolder_2/subfolder_2/subfolder_1/subfolder_2/file_31592.txt
4490759     /venable/deep_filesystem/subfolder_1/subfolder_1/subfolder_1/subfolder_1/subfolder_2/subfolder_1/subfolder_1/subfolder_1/subfolder_1/subfolder_2/subfolder_1/file_2372.txt
4490760    /venable/deep_filesystem/subfolder_1/subfolder_2/subfolder_1/subfolder_2/subfolder_1/subfolder_2/subfolder_1/subfolder_2/subfolder_2/subfolder_2/subfolder_2/file_14637.txt
Name: full_path, Length: 4490761, dtype: object
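
As noted above, the same pattern works for the other time columns. A minimal sketch (same session, assuming the catalog's atime column) that finds files not accessed since the start of 2025:

import pandas as pd
from ibis import _

# files whose last access time is older than the cutoff
cutoff = pd.Timestamp('2025-01-01 00:00:00')
predicate = (_.element_type == 'FILE') & (_.atime < cutoff)

field_names = ['atime', 'parent_path', 'name', 'size']

with session.transaction() as tx:
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    stale_df = table.to_pandas()

print(f"Files not accessed since {cutoff}: {len(stale_df)}")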

Reporting#

Simple queries that report basic statistics on a section of the namespace - for example, summarizing files of a certain type (FILE), belonging to a specific user (uid=555), and located under a certain path (/parquet-files-bucket).

The previous query returned:

Objects created after 2025-01-01 12:00:01:

0  /lukes3/small_16k/r96/d0/r96-f5
1  /lukes3/small_16k/r44/d3/r44-f115
...

Let’s use the /lukes3 search path.

import numpy as np

predicate = (_.search_path == '/lukes3') & (_.element_type == 'FILE')
field_names = ['uid', 'used', 'size']

with session.transaction() as tx:
    table = tx.catalog().select(
        columns=field_names,
        predicate=predicate
    ).read_all()
    df = table.to_pandas()

if df.empty:
    print("No data returned from query. Please check filters and field names.")
else:
    formatted_results = {
        'users': f"{df['uid'].nunique():,d}",
        'Files': f"{len(df):,d}",
        'KB_Used': f"{(df['used'].sum() / 1000):,.0f}",
        'Avg_Size_KB': f"{(df['size'].mean() / 1000):,.2f}"
    }
    print("Aggregated Results:")
    print(formatted_results)
Aggregated Results:
{'users': '1', 'Files': '655,360', 'KB_Used': '10,737,418', 'Avg_Size_KB': '16.38'}

Capacity Grouping & Usage report#

Here’s a report on all the users on the system: get files across the whole system (‘/’), group by owner_name, and compute the file count, total and average size in kilobytes, oldest creation time, and most recent access time for each file owner. Note - display is an IPython function which renders results in table format.

from IPython.display import display
import pandas as pd
import numpy as np

predicate = (_.element_type == 'FILE') & (_.search_path == '/')
field_names = ['owner_name', 'used', 'size', 'creation_time', 'atime']

with session.transaction() as tx:
    table = tx.catalog().select(
        columns=field_names,
        predicate=predicate
    ).read_all()
    df = table.to_pandas()

pd.options.display.max_columns = None

aggregated_data = df.groupby('owner_name').agg(
    Files=('owner_name', 'count'),
    KB_Used=('used', lambda x: np.sum(x)/1000),
    Avg_Size_KB=('size', lambda x: np.mean(x)/1000),
    Oldest_data=('creation_time', 'min'),
    Last_access=('atime', 'max')
)

# Format numeric columns separately
aggregated_data['Files'] = aggregated_data['Files'].map(lambda x: f"{int(x):,d}")
aggregated_data['KB_Used'] = aggregated_data['KB_Used'].map(lambda x: f"{x:,.0f}")
aggregated_data['Avg_Size_KB'] = aggregated_data['Avg_Size_KB'].map(lambda x: f"{x:,.2f}")

display(aggregated_data)
Files KB_Used Avg_Size_KB Oldest_data Last_access
owner_name
0 15,471,800 14,186,299,798 916.91 2024-11-13 01:24:02.260109671 2025-01-22 03:38:06.346023569
65534 161 171,798,692 1,067,072.62 2024-12-19 12:48:08.385283415 2024-12-23 15:19:24.340852233
Chris Snow 291 24,002,206 82,481.81 2024-11-19 12:53:55.710455713 2024-12-09 16:32:40.094875938
David Petika 70 219 3.13 2024-11-13 12:02:56.050907794 2024-12-24 16:11:46.504352451
Edwin Nadar 1 0 0.01 2025-01-16 09:24:52.648151168 2025-01-16 09:24:52.668121034
... ... ... ... ... ...
vperfsanity-james 71,072 54,768,920 770.61 2024-11-12 20:54:25.382017166 2024-11-12 17:13:30.648724050
vperfsanity-sven 126,440 96,829,335 765.81 2024-11-12 21:03:31.871820181 2024-11-12 17:48:59.729557892
wcash-flow 146,852 113,762,566 774.67 2024-11-12 18:41:59.497452192 2024-11-12 18:36:38.979644341
wong.tran-flow 117,288 90,381,655 770.60 2024-11-12 23:38:24.134942342 2024-11-12 18:05:46.249753724
yogi.berra-flow 12,372 9,464,880 765.02 2024-11-12 19:10:25.253624458 2024-11-12 18:12:11.261864289

174 rows × 5 columns
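
A common follow-up is ranking owners by consumption. A small sketch that reuses the unformatted df from the query above to list the top ten owners by raw bytes used:

# top 10 owners by total bytes used (before any string formatting)
top_usage = (
    df.groupby('owner_name')['used']
      .sum()
      .sort_values(ascending=False)
      .head(10)
)
print(top_usage)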

TODO: This example needs to be improved.#

See: snowch/vast-docker-compose-examples#11

Catalog Snapshot Comparisons#

You can access catalog snapshots by navigating the schema space. The most obvious use of snapshot comparisons is delete detection, followed by move detection. The delete-detection script below compares the current state with a specific historical snapshot, identifying files present in the current table but not in the snapshot, based on their element_type and search_path. Access to a snapshot works by querying a specific schema directory (representing the snapshot) within the bucket.

with session.transaction() as tx:
    snapshots = tx.catalog_snapshots()
    for snapshot in snapshots:
        print(snapshot.name)
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_08_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_13_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_18_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_23_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_28_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_33_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_38_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_43_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_48_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_53_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_58_29
vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_22_03_29
vast-big-catalog-bucket/.snapshot/beckie_internal_snap-2025-01-22_21_58_29
vast-big-catalog-bucket/.snapshot/beckie_internal_snap-2025-01-22_22_03_29
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_18_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_19_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_20_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_03_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_08_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_13_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_18_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_23_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_28_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_33_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_38_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_43_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_48_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_53_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_21_58_29_UTC
vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_22_03_29_UTC
62
import re

timestamp_pattern = r"(\d{4}-\d{2}-\d{2}_\d{2}_\d{2}_\d{2})"
timestamps = []

for snapshot in snapshots:
    match = re.search(timestamp_pattern, snapshot.name)
    if match:
        timestamps.append(match.group(1))

# Convert to datetime objects for sorting
datetime_timestamps = [datetime.strptime(ts, "%Y-%m-%d_%H_%M_%S") for ts in timestamps]

# Get the first and last dates
first_date = min(datetime_timestamps)
last_date = max(datetime_timestamps)
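# note: snapshot names are not guaranteed to sort chronologically, so first_date / last_date
# may not match snapshots[0] / snapshots[-1] in the prints below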

print("First snapshot:", first_date, snapshots[0].name)
print("Last snapshot:", last_date, snapshots[-1].name)
First snapshot: 2025-01-22 18:08:29 vast-big-catalog-bucket/.snapshot/bc_table_2025-01-22_21_08_29
Last snapshot: 2025-01-22 22:03:29 vast-big-catalog-bucket/.snapshot/big_catalog_2025-01-22_22_03_29_UTC
import time
from datetime import datetime
from ibis import _

# TODO: move to with tx
date_obj = first_date
epoch_seconds = int(date_obj.timestamp())

def query_table():

    with session.transaction() as tx:
        snapshots = tx.catalog_snapshots()

        predicate_0 = (_.element_type == 'FILE') & \
                    (_.search_path == '/') & \
                    (_.mtime <= ibis.literal(epoch_seconds, type='timestamp'))
        
        predicate_1 = (_.element_type == 'FILE') & \
                    (_.search_path == '/') & \
                    (_.mtime > ibis.literal(epoch_seconds, type='timestamp'))
        
        columns = ['parent_path', 'name']

        if len(snapshots) < 2:
            raise Exception(f"Need at least two snapshots. Found {len(snapshots)}")

        df_0 = tx.catalog(snapshot=snapshots[0]).select(columns=columns, predicate=predicate_0).read_all().to_pandas()
        df_1 = tx.catalog(snapshot=snapshots[-1]).select(columns=columns, predicate=predicate_1).read_all().to_pandas()

    return df_0, df_1
df_0, df_1 = query_table()
df_0
parent_path name
0 /dm/s3/endpoint/dm02/r1/d4/ r1-f5141
1 /dm/s3/endpoint/dm02/r48/d3/ r48-f3212
2 /dm/s3/endpoint/dm01/r57/d0/ r57-f3403
3 /scotth/shb1/X1kib/r23/d3/ r23-f30
4 /dm/s3/endpoint/dm04/r13/d1/ r13-f2549
... ... ...
84594047 /sven/jan_test/elbencho/r10/d0/ r10-f6
84594048 /sven/jan_test/elbencho/r21/d0/ r21-f4
84594049 /sven/jan_test/elbencho/r74/d0/ r74-f7
84594050 /sven/jan_test/elbencho/r87/d0/ r87-f5
84594051 /sven/jan_test/elbencho/r64/d0/ r64-f3

84594052 rows × 2 columns

df_1
parent_path name
paths_0 = set(df_0['parent_path'] + df_0['name'])
paths_1 = set(df_1['parent_path'] + df_1['name'])

differences = paths_0 - paths_1
differences
if differences:
    print(f"\n[INFO] Found {len(differences)} files in first snapshot but not in second:")
    for item in sorted(differences):
        print(item)
else:
    print("\n[INFO] No differences found")