Data Validation Examples
This page shows practical examples of data validation with Pavise.
CSV Data Validation
Validate data loaded from CSV files:
from typing import Protocol, Annotated
from pavise.pandas import DataFrame
from pavise.exceptions import ValidationError
from pavise.validators import Range, Regex
import pandas as pd
class UserDataSchema(Protocol):
    """Column schema for the user records validated in this example.

    Each annotation names a column and the type expected for it; the
    ``Annotated`` metadata attaches a Pavise validator to that column.
    """

    user_id: int
    name: str
    # Ages are limited to the 0..150 range (bound inclusivity is defined by
    # Pavise's Range validator).
    age: Annotated[int, Range(0, 150)]
    # Shape check for "local@domain.tld"; not a full RFC 5322 validation.
    email: Annotated[str, Regex(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')]
# Load from CSV
raw_df = pd.read_csv("users.csv")

# Validate
try:
    # Only the validating constructor sits inside the try, so the handler
    # below cannot mask errors from unrelated statements.
    validated_df = DataFrame[UserDataSchema](raw_df)
except ValidationError as e:
    print(f"Validation failed: {e}")
else:
    # Reached only when no ValidationError was raised.
    print("Data is valid!")
Database Query Validation
Validate data from database queries:
from typing import Protocol, Annotated, Literal
from pavise.pandas import DataFrame
from pavise.validators import Range
import pandas as pd
import sqlalchemy
import datetime
class OrderSchema(Protocol):
    """Column schema for the rows returned by the orders query below."""

    order_id: int
    customer_id: int
    # Closed set of allowed status strings.
    status: Literal["pending", "processing", "shipped", "delivered"]
    # Lower bound of 0.0 with no finite upper bound.
    amount: Annotated[float, Range(0.0, float('inf'))]
    created_at: datetime.datetime
# Query database
engine = sqlalchemy.create_engine("sqlite:///orders.db")
query = "SELECT * FROM orders WHERE created_at > '2024-01-01'"
# SQLite has no native datetime type, so a raw-string query returns
# created_at as text; parse it here so the column holds real datetimes
# before it is checked against OrderSchema.created_at.
raw_df = pd.read_sql(query, engine, parse_dates=["created_at"])

# Validate
validated_df = DataFrame[OrderSchema](raw_df)
API Response Validation
Validate data from external APIs:
from typing import Protocol, Annotated
from pavise.polars import DataFrame
from pavise.exceptions import ValidationError
from pavise.validators import Unique, Range
import polars as pl
import requests
class APIProductSchema(Protocol):
    """Column schema for the product records fetched from the API below."""

    # Unique() marks the identifier column as duplicate-free.
    id: Annotated[int, Unique()]
    name: str
    # Lower bound of 0.0 with no finite upper bound.
    price: Annotated[float, Range(0.0, float('inf'))]
    in_stock: bool
# Fetch from API
# requests applies no timeout by default, so an unresponsive server would
# block this call forever; bound the wait explicitly.
response = requests.get("https://api.example.com/products", timeout=10)
# Fail fast on 4xx/5xx responses so an error payload is never parsed and
# validated as if it were product data.
response.raise_for_status()
data = response.json()

# Convert to polars DataFrame
df = pl.DataFrame(data)

# Validate with strict mode (no extra fields allowed)
try:
    validated_df = DataFrame[APIProductSchema](df, strict=True)
except ValidationError as e:
    print(f"API contract violation: {e}")
ETL Pipeline Validation
Validate data at different stages of an ETL pipeline:
from typing import Protocol, Annotated
from pavise.pandas import DataFrame
from pavise.validators import Range, MinLen
import pandas as pd
# Raw data schema
class RawDataSchema(Protocol):
    """Columns expected in the raw input before any cleaning."""

    id: int
    raw_value: str
# Cleaned data schema
class CleanedDataSchema(Protocol):
    """Columns expected after the transform step has run."""

    id: int
    # MinLen(1) rejects empty strings.
    value: Annotated[str, MinLen(1)]
    # Normalized values are limited to the 0.0..1.0 range.
    normalized_value: Annotated[float, Range(0.0, 1.0)]
# Extract: read the raw file, then validate it against the raw schema
# before any transformation is attempted.
raw_df = pd.read_csv("raw_data.csv")
raw_frame_type = DataFrame[RawDataSchema]
validated_raw = raw_frame_type(raw_df)
# Transform
def clean_data(df: DataFrame[RawDataSchema]) -> DataFrame[CleanedDataSchema]:
    """Transform validated raw rows into the cleaned schema.

    Args:
        df: Frame already validated against ``RawDataSchema``.

    Returns:
        A frame validated against ``CleanedDataSchema``. ``raw_value`` is
        replaced by ``value`` (whitespace-stripped) and ``normalized_value``
        (each string's length divided by the longest length in the frame).
    """
    # Work on a copy so the column assignments below are not made on the
    # caller's frame.
    cleaned = df.copy()
    cleaned["value"] = cleaned["raw_value"].str.strip()
    # Compute the lengths once; they serve as both numerator and the source
    # of the maximum used for scaling.
    lengths = cleaned["value"].str.len()
    cleaned["normalized_value"] = lengths / lengths.max()
    cleaned = cleaned.drop(columns=["raw_value"])
    return DataFrame[CleanedDataSchema](cleaned)
# Load: run the transform, then write the validated result to disk
# without the index column.
output_path = "cleaned_data.csv"
cleaned_df = clean_data(validated_raw)
cleaned_df.to_csv(output_path, index=False)
Handling Validation Errors
Gracefully handle validation errors in production:
import datetime
import logging
from typing import Protocol

import pandas as pd
from pavise.exceptions import ValidationError
from pavise.pandas import DataFrame
# Module-level logger named after this module; used for error reporting below.
logger = logging.getLogger(__name__)
class TransactionSchema(Protocol):
    """Column schema for the transaction records validated in this example."""

    transaction_id: int
    amount: float
    # Requires the datetime module to be imported by the enclosing snippet.
    timestamp: datetime.datetime
def process_transactions(file_path: str) -> None:
    """Load a transactions CSV, validate it, and process the valid rows.

    A ``ValidationError`` is logged and swallowed so one bad file does not
    stop the caller; any other error is logged and re-raised.

    Args:
        file_path: Path of the CSV file to read.

    Raises:
        Exception: Any error other than ``ValidationError``, re-raised
            unchanged after logging. This includes the ``ValueError`` that
            ``read_csv`` raises when the ``timestamp`` column is absent.
    """
    try:
        # read_csv leaves date columns as plain strings unless told to parse
        # them, which could never satisfy the datetime annotation on
        # TransactionSchema.timestamp.
        raw_df = pd.read_csv(file_path, parse_dates=["timestamp"])
        validated_df = DataFrame[TransactionSchema](raw_df)
        # Process valid data
        process_valid_transactions(validated_df)
    except ValidationError as e:
        # Lazy %-style arguments: the message is only formatted if the
        # record is actually emitted.
        logger.error("Validation failed in %s: %s", file_path, e)
        # Handle validation error - maybe clean and retry
    except Exception as e:
        logger.error("Unexpected error in %s: %s", file_path, e)
        raise
def process_valid_transactions(df: DataFrame[TransactionSchema]) -> None:
    """Process transactions that have already passed schema validation.

    This is a placeholder in the example; the body intentionally does nothing.

    Args:
        df: Frame validated against ``TransactionSchema``.
    """
    # Process validated data
    pass