Column Management#

Important

This notebook is in the process of being migrated to Vast Data Platform Field Docs. It will probably not run yet.

See also

The Vast DB SDK API Documentation is available here.

Adding a column in VAST is a transactional metadata operation that does not result in any data updates or allocations in main storage. Since VAST-DB is a columnar data store, there is no impact on subsequent inserts or updates but there is also no provision for default values during column addition. Column removals are transactional and will operate similarly to data delete operations. The column is tombstoned and becomes immediately inaccessible. Async tasks then take over and rewrite/unlink data chunks as necessary in main storage. A column removal can imply a lot of background activity, similar to a large delete, relative to the amount of data in that column (sparsity, data size, etc). Note that this asynchronous activity is budgeted for by the system to minimize impact.

Install sdk and connect to Vast DB#

Install vastdb library.

!pip install --quiet vastdb
# Change these variables to reflect your environment, E.g. 
#
# ENDPOINT = 'http://your_vast_endpoint:12345'
# DATABASE_NAME = 'your_db'
# ACCESS_KEY = 'your_access_key'
# SECRET_KEY = 'your_secret_key'
# DATABASE_SCHEMA = 'your_database_schema'
#
# This will be created:
# TABLE_NAME='TEMPORARY_TABLE'

Connect to Vast DB

import vastdb

# Open the session that every transaction in this notebook runs against.
session = vastdb.connect(endpoint=ENDPOINT, access=ACCESS_KEY, secret=SECRET_KEY)

Column Management API#

columns#

  • Usage: List all columns of a table.

  • Parameters:

    • No parameters

Create a table for our column management examples.

Hide code cell content
import pyarrow as pa
from vastdb.errors import TableExists

# Table schemas (not to be confused with the database schema) are defined
# using PyArrow (pa).
ARROW_SCHEMA = pa.schema([('column1', pa.int32()), ('column2', pa.string())])


with session.transaction() as tx:
    bucket = tx.bucket(DATABASE_NAME)

    # Bind `schema` before the lookup so the check below still works (instead
    # of raising NameError) when the lookup itself raises.
    schema = None
    try:
        schema = bucket.schema(name=DATABASE_SCHEMA, fail_if_missing=False)
        print(schema)
    except Exception as e:
        print("Schema doesn't exist:", e)

    if schema:
        try:
            table = schema.create_table(table_name=TABLE_NAME, columns=ARROW_SCHEMA)
            print(f"Table created: {table.name}")
        except TableExists as e:
            # Expected when the notebook is re-run; the existing table is reused.
            print("Couldn't create table because it already exists:", e)
        except Exception as e:
            print("Couldn't create table:", e)
    else:
        # Say why nothing was created rather than skipping silently.
        print(f"Skipping table creation: schema {DATABASE_SCHEMA} is not available.")

Create a utility function to print columns.

Hide code cell content
def print_columns(database_name, schema_name, table_name):
    """Print the columns of a table, or a notice if the table is not found.

    Args:
        database_name: Name of the bucket (database) to look in.
        schema_name: Name of the database schema that holds the table.
        table_name: Name of the table whose columns are printed.
    """
    with session.transaction() as tx:
        bucket = tx.bucket(database_name)
        table = bucket.schema(schema_name).table(name=table_name, fail_if_missing=False)
        if not table:
            print(f"Couldn't find the table {table_name}.")
        else:
            print(table.columns())
print_columns(DATABASE_NAME, DATABASE_SCHEMA, TABLE_NAME)

add_column#

  • Usage: Add new columns to an existing table.

  • Parameters:

    • new_column (Apache Arrow Schema): Schema of the columns to add.

# verify the table exists
print_columns(DATABASE_NAME, DATABASE_SCHEMA, TABLE_NAME)
# let's create a utility method to check if a column exists
def column_exists(table, column_name):
    """Return True if ``table`` has a column named ``column_name``.

    Args:
        table: Table object exposing ``columns()``, or a falsy value
            (e.g. ``None``) when the table was not found.
        column_name: Name of the column to look for.

    Returns:
        True when the lookup succeeds; False when ``table`` is falsy or the
        lookup raises ``KeyError``.

    Any other exception raised while listing or looking up columns
    propagates to the caller unchanged.
    """
    if not table:
        return False
    try:
        # field(column_name) is a pyarrow method that raises KeyError for an
        # unknown name:
        # https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html#pyarrow.Schema.field
        table.columns().field(column_name)
    except KeyError:
        return False
    return True
import pyarrow as pa

# Columns to add are described with a PyArrow schema, just like the table itself.
NEW_COLUMN_NAME = 'new_column'
NEW_COLUMNS = pa.schema([(NEW_COLUMN_NAME, pa.int64())])

with session.transaction() as tx:
    schema = tx.bucket(DATABASE_NAME).schema(DATABASE_SCHEMA)
    table = schema.table(name=TABLE_NAME, fail_if_missing=False)
    if not table:
        print(f"Couldn't find the table {TABLE_NAME}.")
    else:
        try:
            # Only add the column when it is not there yet, so the cell can be re-run.
            if not column_exists(table, NEW_COLUMN_NAME):
                print(f"Adding column to {table.name}")
                table.add_column(new_column=NEW_COLUMNS)
            else:
                print("Skipping.  Column already exists.")
        except Exception as e:
            print("Couldn't add column - verify that it doesn't already exist")
            print(e)

# Show the resulting columns.
print_columns(DATABASE_NAME, DATABASE_SCHEMA, TABLE_NAME)

rename_column#

  • Usage: Rename a column in a table.

  • Parameters:

    • current_column_name (str): The name of the column.

    • new_column_name (str, optional): New column name (default is an empty string "").

Alter and rename the Column#

# Rename CUR_COLUMN_NAME to NEW_COLUMN_NAME.
CUR_COLUMN_NAME = 'new_column'
NEW_COLUMN_NAME = 'renamed_new_column'

with session.transaction() as tx:
    schema = tx.bucket(DATABASE_NAME).schema(DATABASE_SCHEMA)
    table = schema.table(name=TABLE_NAME, fail_if_missing=False)
    if table:
        try:
            if column_exists(table, NEW_COLUMN_NAME):
                # The target name is already taken (e.g. the cell was re-run).
                print("Skipping.  Column already exists.")
            elif not column_exists(table, CUR_COLUMN_NAME):
                # Nothing to rename: the source column is not in the table.
                print(f"Skipping.  Column {CUR_COLUMN_NAME} doesn't exist.")
            else:
                print(f"Renaming column in {table.name}")
                table.rename_column(current_column_name=CUR_COLUMN_NAME, new_column_name=NEW_COLUMN_NAME)
        except Exception as e:
            print("Couldn't rename column - verify that it exists and that the new name isn't already taken")
            print(e)
    else:
        print(f"Couldn't find the table {TABLE_NAME}.")

Validate that the name of the column has changed#

print_columns(DATABASE_NAME, DATABASE_SCHEMA, TABLE_NAME)

drop_column#

  • Usage: Remove columns from a table.

  • Parameters:

    • column_to_drop (Apache Arrow Schema): Schema of the columns to remove.

# The column to drop is described with a PyArrow schema (name and type).
COLUMN_NAME = 'renamed_new_column'
COLUMN_TO_DROP = pa.schema([(COLUMN_NAME, pa.int64())])

with session.transaction() as tx:
    schema = tx.bucket(DATABASE_NAME).schema(DATABASE_SCHEMA)
    table = schema.table(name=TABLE_NAME, fail_if_missing=False)
    if table:
        try:
            if not column_exists(table, COLUMN_NAME):
                # Already gone (e.g. the cell was re-run) - nothing to do.
                print("Skipping.  Column doesn't exist.")
            else:
                print(f"Dropping column from {table.name}")
                table.drop_column(column_to_drop=COLUMN_TO_DROP)
        except Exception as e:
            print("Couldn't drop column - verify that it exists")
            print(e)
    else:
        print(f"Couldn't find the table {TABLE_NAME}.")