# Testing
The AI Ingredient Scanner includes a comprehensive test suite covering unit tests, integration tests, end-to-end validation, and performance benchmarks.
## Test Architecture
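The suite lives under `tests/`, one file per category (the same files are referenced throughout this page):

```text
tests/
├── conftest.py           # shared fixtures and test helpers
├── test_agents.py        # unit tests for agents
├── test_tools.py         # unit tests for tools
├── test_api.py           # REST API integration tests
├── test_workflow.py      # LangGraph workflow tests
├── test_e2e.py           # end-to-end pipeline tests
├── test_performance.py   # benchmarks and timing checks
└── test_edge_cases.py    # boundary conditions and error handling
```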
## Running Tests

### Quick Start
```bash
# Activate the virtual environment
source venv/bin/activate

# Run all tests with coverage
pytest tests/ -v --cov

# Run a specific test file
pytest tests/test_agents.py -v

# Run with verbose output and short tracebacks
pytest tests/ -v --tb=short

# Run only fast tests (exclude performance tests)
pytest tests/ -m "not slow"
```
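For a line-by-line view of what is and is not covered, pytest-cov can also write an HTML report:

```bash
# Generate a browsable coverage report in htmlcov/
pytest tests/ --cov --cov-report=html
```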
### Test Configuration

The project uses `pytest.ini` for configuration:
```ini
[pytest]
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
addopts = -v --cov=config --cov=agents --cov=tools --cov=state --cov=graph --cov=services --cov-report=term-missing --cov-fail-under=70
filterwarnings =
    ignore::DeprecationWarning
```
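The `--cov-fail-under=70` option makes the entire run fail when total coverage drops below 70%, so the coverage floor is enforced on every local run as well as in CI.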
## Test Categories

### Unit Tests

#### Agent Tests (`test_agents.py`)

Tests for individual agent functions:
```python
from unittest.mock import patch

from agents.research import (
    _create_unknown_ingredient,
    has_research_data,
    research_ingredients,
)


class TestResearchAgent:
    """Tests for the Research Agent."""

    def test_create_unknown_ingredient(self):
        """Test creation of an unknown-ingredient record."""
        result = _create_unknown_ingredient("mystery_ingredient")

        assert result["name"] == "mystery_ingredient"
        assert result["source"] == "unknown"
        assert result["confidence"] == 0.0

    def test_has_research_data_false(self, base_state):
        """Test has_research_data returns False when empty."""
        assert has_research_data(base_state) is False

    @patch("agents.research.lookup_ingredient")
    @patch("agents.research.grounded_ingredient_search")
    def test_research_ingredients_fallback(self, mock_search, mock_lookup, base_state):
        """Test research falls back to grounded search."""
        # Qdrant returns a low-confidence record (create_ingredient is a
        # conftest.py helper), so the grounded search result wins
        mock_lookup.return_value = create_ingredient(confidence=0.5)
        mock_search.return_value = create_ingredient(confidence=0.8)

        result = research_ingredients(base_state)

        assert result["ingredient_data"][0]["source"] == "google_search"
```
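Note the mock argument order in `test_research_ingredients_fallback`: stacked `@patch` decorators are applied bottom-up, so the lowest decorator (`grounded_ingredient_search`) supplies the first mock parameter.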
#### Tool Tests (`test_tools.py`)

Tests for safety scoring, allergen matching, and search tools:
```python
import pytest

# Import paths assumed from the repository layout; fragrance_ingredient,
# sensitive_profile, and profile are fixtures defined in conftest.py
from tools.safety_scorer import RiskLevel, calculate_risk_score, classify_risk_level
from tools.allergen_matcher import check_allergen_match


class TestSafetyScorer:
    """Tests for safety scoring functions."""

    def test_calculate_risk_sensitive_skin_fragrance(
        self, fragrance_ingredient, sensitive_profile
    ):
        """Test risk increases for fragrance with sensitive skin."""
        risk = calculate_risk_score(fragrance_ingredient, sensitive_profile)

        # 0.4 base + 0.3 sensitive-skin modifier; approx() sidesteps float noise
        assert risk == pytest.approx(0.7)

    def test_classify_risk_level_boundaries(self):
        """Test risk classification at the boundaries."""
        assert classify_risk_level(0.29) == RiskLevel.LOW
        assert classify_risk_level(0.30) == RiskLevel.MEDIUM
        assert classify_risk_level(0.60) == RiskLevel.HIGH


class TestAllergenMatcher:
    """Tests for allergen matching functions."""

    def test_check_allergen_match_positive(self, profile):
        """Test a positive allergen match (profile lists a milk allergy)."""
        ingredient = create_ingredient(
            name="whey protein",
            safety_notes="Derived from milk",
        )

        is_match, allergy = check_allergen_match(ingredient, profile)

        assert is_match is True
        assert allergy == "milk"
```
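These tests imply the matching contract: an ingredient matches when one of the profile's allergy terms appears in its name or safety notes. A minimal sketch of that logic (the real `check_allergen_match` in `tools/allergen_matcher.py` may be more sophisticated):

```python
def check_allergen_match(ingredient, profile):
    """Return (is_match, matched_allergy) for the first allergy term found."""
    haystack = f"{ingredient['name']} {ingredient.get('safety_notes', '')}".lower()
    for allergy in profile.allergies:
        if allergy.lower() in haystack:
            return True, allergy
    return False, None
```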
### Integration Tests

#### API Tests (`test_api.py`)

Tests for REST API endpoints:
```python
class TestHealthEndpoints:
    """Tests for health check endpoints."""

    def test_root_endpoint(self, client):
        """Test root endpoint returns OK status."""
        response = client.get("/")

        assert response.status_code == 200
        assert response.json()["status"] == "ok"

    def test_health_endpoint(self, client):
        """Test health endpoint returns healthy status."""
        response = client.get("/health")

        assert response.status_code == 200


class TestAnalyzeEndpoint:
    """Tests for the /analyze endpoint."""

    def test_analyze_missing_ingredients(self, client):
        """Test that empty ingredients returns an error."""
        response = client.post("/analyze", json={"ingredients": ""})

        assert response.status_code == 400
```
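The `client` fixture used throughout these tests is not shown above; a minimal version wraps the FastAPI app in Starlette's in-process `TestClient` (the `api.main` import path is an assumption):

```python
import pytest
from fastapi.testclient import TestClient

from api.main import app  # assumed location of the FastAPI app


@pytest.fixture
def client() -> TestClient:
    """In-process HTTP client; no real network I/O."""
    return TestClient(app)
```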
#### Workflow Tests (`test_workflow.py`)

Tests for LangGraph workflow execution:
```python
from unittest.mock import patch

# Import paths assumed from the repository layout
from graph import run_analysis
from state.schema import ValidationResult


class TestWorkflowExecution:
    """Tests for workflow execution."""

    @patch("agents.research.research_ingredients")
    @patch("agents.analysis.analyze_ingredients")
    @patch("agents.critic.validate_report")
    def test_happy_path(self, mock_critic, mock_analysis, mock_research):
        """Test successful workflow execution."""
        # state_with_ingredients / state_with_report / state_approved are
        # pre-built WorkflowState values from test fixtures
        mock_research.return_value = state_with_ingredients
        mock_analysis.return_value = state_with_report
        mock_critic.return_value = state_approved

        result = run_analysis(
            session_id="test",
            product_name="Test",
            ingredients=["water"],
            allergies=[],
            skin_type="normal",
            expertise="beginner",
        )

        assert result["critic_feedback"]["result"] == ValidationResult.APPROVED
```
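One caveat: `patch` replaces a name where it is looked up, not where it is defined. If `graph.py` imports the agents with `from agents.research import research_ingredients`, the targets above would need to be `graph.research_ingredients` (and likewise for the other two) for the mocks to take effect.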
### End-to-End Tests (`test_e2e.py`)

Full pipeline validation with mocked external services:
```python
class TestEndToEndWorkflow:
    """E2E tests for the complete analysis workflow."""

    def test_complete_workflow_happy_path(self, mock_llm_responses):
        """Test the complete workflow from start to finish."""
        result = run_analysis(
            session_id="e2e-test-001",
            product_name="Test Moisturizer",
            ingredients=["Water", "Glycerin", "Vitamin E"],
            allergies=[],
            skin_type="normal",
            expertise="beginner",
        )

        # Verify the workflow completed successfully
        assert result.get("error") is None
        assert result.get("analysis_report") is not None
        assert result.get("critic_feedback") is not None

        # Verify routing history shows the complete flow
        history = result.get("routing_history", [])
        assert "research" in history
        assert "analysis" in history
        assert "critic" in history

    def test_complete_workflow_with_allergen(self, mock_llm_responses):
        """Test the workflow detects and flags user allergens."""
        result = run_analysis(
            session_id="e2e-test-002",
            product_name="Scented Lotion",
            ingredients=["Water", "Fragrance", "Glycerin"],
            allergies=["fragrance"],
            skin_type="sensitive",
            expertise="beginner",
        )

        # Verify the allergen was flagged
        report = result.get("analysis_report", {})
        assert len(report.get("allergen_warnings", [])) > 0
```
### Performance Tests (`test_performance.py`)

Benchmarks and timing validation:
```python
import concurrent.futures
import time


class TestAPIPerformance:
    """Performance tests for API endpoints."""

    def test_health_endpoint_response_time(self, client):
        """Health endpoint should respond within 100 ms."""
        # perf_counter() is monotonic; time.time() can jump with clock changes
        start = time.perf_counter()
        response = client.get("/health")
        elapsed = time.perf_counter() - start

        assert response.status_code == 200
        assert elapsed < 0.1

    def test_concurrent_health_checks(self, client):
        """Multiple concurrent health checks should all succeed."""
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(client.get, "/health") for _ in range(10)]
            results = [f.result() for f in concurrent.futures.as_completed(futures)]

        assert all(r.status_code == 200 for r in results)
```
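Assuming the `client` fixture is an in-process `TestClient` (as sketched earlier), these timings measure application code only, with no real network latency, so treat the thresholds as smoke checks rather than production SLOs.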
### Edge Case Tests (`test_edge_cases.py`)

Boundary conditions and error handling:
```python
class TestUnicodeAndSpecialCharacters:
    """Tests for Unicode and special character handling."""

    def test_unicode_ingredient_names(self, client):
        """Unicode ingredient names should be processed."""
        response = client.post("/analyze", json={
            "ingredients": "水, グリセリン, 비타민E",
        })

        assert response.status_code == 200

    def test_emoji_in_product_name(self, client):
        """Emoji in the product name should be handled."""
        response = client.post("/analyze", json={
            "product_name": "Glow Cream ✨",
            "ingredients": "Water",
        })

        assert response.status_code == 200
```
## Test Fixtures

Shared fixtures in `conftest.py`:
```python
from unittest.mock import patch

import pytest

from state.schema import ExpertiseLevel, SkinType, UserProfile  # paths assumed


@pytest.fixture
def base_user_profile() -> UserProfile:
    """Create a base user profile with default values."""
    return UserProfile(
        allergies=[],
        skin_type=SkinType.NORMAL,
        expertise=ExpertiseLevel.BEGINNER,
    )


@pytest.fixture
def mock_llm_services():
    """Mock all external LLM services."""
    with patch("agents.research.lookup_ingredient") as mock_lookup, \
         patch("agents.research.grounded_ingredient_search") as mock_search, \
         patch("agents.analysis._generate_llm_analysis") as mock_analysis, \
         patch("agents.critic._run_multi_gate_validation") as mock_critic:
        # Default behaviors
        mock_lookup.return_value = None
        mock_search.side_effect = lambda name: create_test_ingredient(name)
        mock_analysis.return_value = "## Analysis\n\nSafe for use."
        mock_critic.return_value = {
            "completeness_ok": True,
            "format_ok": True,
            "allergens_ok": True,
            "consistency_ok": True,
            "tone_ok": True,
            "failed_gates": [],
            "feedback": "Approved",
        }
        yield {
            "lookup": mock_lookup,
            "search": mock_search,
            "analysis": mock_analysis,
            "critic": mock_critic,
        }
```
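A test opts into these defaults by declaring the fixture as a parameter, and can override individual mocks through the returned dict:

```python
def test_qdrant_hit(mock_llm_services):
    # Override the default Qdrant miss for this test only
    mock_llm_services["lookup"].return_value = create_test_ingredient("water")
```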
## Coverage Report

Current test coverage: 83% (191 tests passing).
| Module | Coverage |
|---|---|
| `agents/analysis.py` | 94% |
| `agents/research.py` | 92% |
| `agents/supervisor.py` | 88% |
| `agents/critic.py` | 79% |
| `tools/safety_scorer.py` | 100% |
| `tools/allergen_matcher.py` | 100% |
| `tools/grounded_search.py` | 86% |
| `state/schema.py` | 100% |
| `graph.py` | 100% |
| `config/settings.py` | 100% |
## Test Markers

Custom markers for test categorization:
```python
# conftest.py: pytest_configure is a pytest hook, not a fixture
def pytest_configure(config):
    config.addinivalue_line("markers", "slow: marks tests as slow")
    config.addinivalue_line("markers", "integration: integration tests")
    config.addinivalue_line("markers", "e2e: end-to-end tests")
    config.addinivalue_line("markers", "performance: performance tests")
```
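Registering markers this way keeps pytest from emitting `PytestUnknownMarkWarning` for unknown marks; the same declarations could instead live in a `markers =` section of `pytest.ini`.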
Usage:

```bash
# Run only E2E tests
pytest -m e2e

# Skip slow tests
pytest -m "not slow"

# Run only performance tests
pytest -m performance
```
## Continuous Integration

### GitHub Actions Workflow
```yaml
name: Tests

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest pytest-cov

      - name: Run tests
        env:
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
        run: |
          pytest tests/ --cov=. --cov-fail-under=70
```
## Best Practices

### 1. Mock External Services

Always mock LLM and database calls:
```python
from unittest.mock import patch


@patch("agents.research.lookup_ingredient")
@patch("agents.research.grounded_ingredient_search")
def test_with_mocks(self, mock_search, mock_lookup):
    # Decorators apply bottom-up: mock_search wraps grounded_ingredient_search
    mock_lookup.return_value = create_test_ingredient("water")
    # Test logic here
```
### 2. Use Fixtures for Common Setup
```python
@pytest.fixture
def base_state(base_user_profile) -> WorkflowState:
    # Depend on the base_user_profile fixture rather than calling it directly
    # (pytest forbids calling fixture functions)
    return WorkflowState(
        session_id="test",
        product_name="Test",
        raw_ingredients=["water"],
        user_profile=base_user_profile,
        ingredient_data=[],
        analysis_report=None,
        critic_feedback=None,
        retry_count=0,
        routing_history=[],
        error=None,
    )
```
### 3. Test Edge Cases

- Empty inputs
- Unicode characters
- Boundary values (see the parametrized sketch below)
- Error conditions
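For boundary values in particular, `pytest.mark.parametrize` keeps all the cases in one table. A sketch reusing `classify_risk_level` from the tool tests above (import path assumed):

```python
import pytest

from tools.safety_scorer import RiskLevel, classify_risk_level  # path assumed


@pytest.mark.parametrize(
    ("score", "expected"),
    [
        (0.0, RiskLevel.LOW),
        (0.29, RiskLevel.LOW),    # just below the medium threshold
        (0.30, RiskLevel.MEDIUM),
        (0.60, RiskLevel.HIGH),
    ],
)
def test_classify_risk_level(score, expected):
    assert classify_risk_level(score) == expected
```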
### 4. Performance Boundaries
- API response times < 100ms for health checks
- Batch processing scales sub-linearly
- Concurrent requests handled correctly