Files
annotations/.cursor/skills/security/evals/security-testing.yaml
T
Oleksandr Bezdieniezhnykh 9e7dc290db Refactor annotation tool from WPF desktop app to .NET API
Replace the WPF desktop application (Azaion.Suite, Azaion.Annotator,
Azaion.Common, Azaion.Inference, Azaion.Loader, Azaion.LoaderUI,
Azaion.Dataset, Azaion.Test) with a standalone .NET Web API in src/.

Made-with: Cursor
2026-03-25 04:40:03 +02:00

790 lines
23 KiB
YAML

# =============================================================================
# AQE Skill Evaluation Test Suite: Security Testing v1.0.0
# =============================================================================
#
# Comprehensive evaluation suite for the security-testing skill per ADR-056.
# Tests OWASP Top 10 2021 detection, severity classification, remediation
# quality, and cross-model consistency.
#
# Schema: .claude/skills/.validation/schemas/skill-eval.schema.json
# Validator: .claude/skills/security-testing/scripts/validate-config.json
#
# Coverage:
# - OWASP A01:2021 - Broken Access Control (IDOR, path traversal)
# - OWASP A02:2021 - Cryptographic Failures
# - OWASP A03:2021 - Injection (SQL, XSS, SSTI)
# - OWASP A07:2021 - Identification and Authentication Failures
# - OWASP A08:2021 - Software and Data Integrity Failures (insecure deserialization)
# - Negative tests (no false positives on secure code)
#
# =============================================================================
# Skill identifier and version string for this evaluation suite.
skill: security-testing
version: 1.0.0
# Folded scalar: the indented lines below are joined into one paragraph.
# The body must be indented deeper than the key or it is not part of the value.
description: >
  Comprehensive evaluation suite for the security-testing skill.
  Tests OWASP Top 10 2021 detection capabilities, CWE classification accuracy,
  CVSS scoring, severity classification, and remediation quality.
  Supports multi-model testing and integrates with ReasoningBank for
  continuous improvement.
# =============================================================================
# Multi-Model Configuration
# =============================================================================
# Models the suite is run against; each entry's role is noted above it.
models_to_test:
# Primary model (high accuracy expected)
- claude-3.5-sonnet
# Fast model (minimum quality threshold)
- claude-3-haiku
# Cross-vendor validation
- gpt-4o
# =============================================================================
# MCP Integration Configuration
# =============================================================================
# MCP integration switches: which learning hooks run around the evals and
# which agents receive the results.
mcp_integration:
  enabled: true
  namespace: skill-validation
  # Query existing security patterns before running evals
  query_patterns: true
  # Track each test outcome for learning feedback loop
  track_outcomes: true
  # Store successful patterns after evals complete
  store_patterns: true
  # Share learning with fleet coordinator agents
  share_learning: true
  # Update quality gate with validation metrics
  update_quality_gate: true
  # Target agents for learning distribution
  # NOTE(review): nested here because it follows the mcp_integration settings;
  # confirm against the skill-eval schema.
  target_agents:
  - qe-learning-coordinator
  - qe-queen-coordinator
  - qe-security-scanner
  - qe-security-auditor
# =============================================================================
# ReasoningBank Learning Configuration
# =============================================================================
# Pattern-storage settings for the learning loop: what is stored, for how
# long, and the confidence floor for storing it.
learning:
  store_success_patterns: true
  store_failure_patterns: true
  pattern_ttl_days: 90
  min_confidence_to_store: 0.7
  cross_model_comparison: true
# =============================================================================
# Result Format Configuration
# =============================================================================
# Output switches for the evaluation report.
result_format:
  json_output: true
  markdown_report: true
  include_raw_output: false
  include_timing: true
  include_token_usage: true
# =============================================================================
# Environment Setup
# =============================================================================
# Tools, environment variables and fixture files needed before the suite runs.
setup:
  required_tools:
  - jq  # JSON parsing (required)
  - npm  # Dependency audit (optional but recommended)
  # Values are quoted so they stay strings rather than YAML booleans.
  environment_variables:
    SECURITY_SCAN_DEPTH: "deep"
    OWASP_ENABLED: "true"
    SEVERITY_THRESHOLD: "medium"
  fixtures:
  - name: vulnerable_express_app
    path: fixtures/vulnerable-express-app.js
    # Deliberately vulnerable sample. The SQL route declares `:id` so that
    # `req.params.id` actually carries attacker-controlled input.
    content: |
      const express = require('express');
      const app = express();
      // SQL Injection vulnerability
      app.get('/user/:id', (req, res) => {
        const query = `SELECT * FROM users WHERE id = ${req.params.id}`;
        db.query(query);
      });
      // XSS vulnerability
      app.get('/profile', (req, res) => {
        res.send(`<h1>Hello ${req.query.name}</h1>`);
      });
      // Path Traversal vulnerability
      app.get('/file', (req, res) => {
        const path = './uploads/' + req.query.filename;
        res.sendFile(path);
      });
# =============================================================================
# TEST CASES
# =============================================================================
test_cases:
# ---------------------------------------------------------------------------
# CATEGORY: SQL Injection (OWASP A03:2021)
# ---------------------------------------------------------------------------
# tc001: vulnerable sample - a route parameter is interpolated into SQL text.
- id: tc001_sql_injection_string_concat
  description: "Detect SQL injection via string concatenation in Node.js"
  category: injection
  priority: critical
  input:
    # The route declares `:id`; without it `req.params.id` is never populated
    # and the sample would not carry user input into the query.
    code: |
      const express = require('express');
      const mysql = require('mysql');
      const app = express();
      app.get('/api/users/:id', (req, res) => {
        const userId = req.params.id;
        const query = `SELECT * FROM users WHERE id = ${userId}`;
        db.query(query, (err, results) => {
          res.json(results);
        });
      });
    context:
      language: javascript
      framework: express
      environment: production
  expected_output:
    must_contain:
    - "SQL injection"
    - "parameterized"
    # A bare "secure" would also match "insecure" and phrases such as
    # "secure alternative", which a correct finding is expected to contain.
    must_not_contain:
    - "no vulnerabilities"
    - "code is secure"
    must_match_regex:
    - "CWE-89|CWE-564"
    # Pinned to the 2021 edition, as in the other test cases.
    - "A03:2021"
    severity_classification: critical
    finding_count:
      min: 1
      max: 3
    recommendation_count:
      min: 1
  validation:
    schema_check: true
    keyword_match_threshold: 0.8
    reasoning_quality_min: 0.7
    # Weights sum to 1.0.
    grading_rubric:
      completeness: 0.3
      accuracy: 0.5
      actionability: 0.2
  timeout_ms: 30000
# tc002: safe counterpart of tc001 - placeholder query with a parsed integer id.
- id: tc002_sql_injection_parameterized_safe
  description: "Verify parameterized queries are NOT flagged as vulnerable"
  category: injection
  priority: high
  input:
    code: |
      app.get('/api/users', (req, res) => {
        const userId = parseInt(req.params.id, 10);
        db.query('SELECT * FROM users WHERE id = ?', [userId], (err, results) => {
          res.json(results);
        });
      });
    context:
      language: javascript
      framework: express
  expected_output:
    must_contain:
    - "parameterized"
    - "secure"
    # NOTE(review): if matching is plain substring, a correct answer such as
    # "parameterized queries prevent SQL injection" or "not vulnerable" would
    # trip these entries - confirm the matcher's semantics.
    must_not_contain:
    - "SQL injection"
    - "critical"
    - "vulnerable"
    severity_classification: info
    finding_count:
      max: 1
  validation:
    schema_check: true
    keyword_match_threshold: 0.7
    allow_partial: true
# ---------------------------------------------------------------------------
# CATEGORY: Cross-Site Scripting (OWASP A03:2021)
# ---------------------------------------------------------------------------
# tc003: a query-string value is written into an HTML response unescaped.
- id: tc003_xss_reflected_html_output
  description: "Detect reflected XSS in unescaped HTML output"
  category: injection
  priority: critical
  input:
    code: |
      app.get('/profile', (req, res) => {
        const name = req.query.name;
        res.send(`
          <html>
            <body>
              <h1>Welcome, ${name}!</h1>
              <p>Your profile has been loaded.</p>
            </body>
          </html>
        `);
      });
    context:
      language: javascript
      framework: express
  expected_output:
    must_contain:
    - "XSS"
    - "cross-site scripting"
    - "sanitize"
    - "escape"
    must_match_regex:
    - "CWE-79"
    severity_classification: high
    finding_count:
      min: 1
  validation:
    schema_check: true
    keyword_match_threshold: 0.8
    reasoning_quality_min: 0.75
# tc004: a URL parameter is assigned to innerHTML in client-side code.
- id: tc004_xss_dom_based_innerhtml
  description: "Detect DOM-based XSS via innerHTML assignment"
  category: injection
  priority: high
  input:
    code: |
      // Client-side JavaScript
      const params = new URLSearchParams(window.location.search);
      const message = params.get('msg');
      document.getElementById('output').innerHTML = message;
    context:
      language: javascript
      framework: vanilla
      environment: production
  expected_output:
    must_contain:
    - "DOM"
    - "XSS"
    - "innerHTML"
    - "textContent"
    must_match_regex:
    - "CWE-79"
    severity_classification: high
  validation:
    schema_check: true
    keyword_match_threshold: 0.7
# ---------------------------------------------------------------------------
# CATEGORY: Authentication Failures (OWASP A07:2021)
# ---------------------------------------------------------------------------
# tc005: password, API key and database URL embedded as string literals.
- id: tc005_hardcoded_credentials
  description: "Detect hardcoded credentials and API keys"
  category: authentication
  priority: critical
  input:
    code: |
      const ADMIN_PASSWORD = 'admin123';
      const API_KEY = 'sk-1234567890abcdef';
      const DATABASE_URL = 'postgres://admin:password123@localhost/db';
      app.post('/login', (req, res) => {
        if (req.body.password === ADMIN_PASSWORD) {
          req.session.isAdmin = true;
          res.send('Login successful');
        }
      });
    context:
      language: javascript
      framework: express
  expected_output:
    must_contain:
    - "hardcoded"
    - "credentials"
    - "secret"
    - "environment variable"
    must_match_regex:
    - "CWE-798|CWE-259"
    severity_classification: critical
    # The sample contains three embedded secrets; at least two must be reported.
    finding_count:
      min: 2
  validation:
    schema_check: true
    keyword_match_threshold: 0.8
    reasoning_quality_min: 0.8
# tc006: passwords hashed with MD5 via crypto.createHash.
- id: tc006_weak_password_hashing
  description: "Detect weak password hashing algorithms (MD5, SHA1)"
  category: authentication
  priority: high
  input:
    code: |
      const crypto = require('crypto');
      function hashPassword(password) {
        return crypto.createHash('md5').update(password).digest('hex');
      }
      function verifyPassword(password, hash) {
        return hashPassword(password) === hash;
      }
    context:
      language: javascript
      framework: nodejs
  expected_output:
    must_contain:
    - "MD5"
    - "weak"
    - "bcrypt"
    - "argon2"
    must_match_regex:
    - "CWE-327|CWE-328|CWE-916"
    severity_classification: high
    finding_count:
      min: 1
  validation:
    schema_check: true
    keyword_match_threshold: 0.8
# ---------------------------------------------------------------------------
# CATEGORY: Broken Access Control (OWASP A01:2021)
# ---------------------------------------------------------------------------
# tc007: read and delete routes act on a user id with no ownership check.
# The queries themselves are parameterized, so access control is the only flaw.
- id: tc007_idor_missing_authorization
  description: "Detect IDOR vulnerability with missing authorization check"
  category: authorization
  priority: critical
  input:
    code: |
      app.get('/api/users/:id/profile', (req, res) => {
        // No authorization check - any user can access any profile
        const userId = req.params.id;
        db.query('SELECT * FROM profiles WHERE user_id = ?', [userId])
          .then(profile => res.json(profile));
      });
      app.delete('/api/users/:id', (req, res) => {
        // No check if requesting user owns this account
        db.query('DELETE FROM users WHERE id = ?', [req.params.id]);
        res.send('User deleted');
      });
    context:
      language: javascript
      framework: express
  expected_output:
    must_contain:
    - "authorization"
    - "access control"
    - "IDOR"
    - "ownership"
    must_match_regex:
    - "CWE-639|CWE-284|CWE-862"
    - "A01:2021"
    severity_classification: critical
  validation:
    schema_check: true
    keyword_match_threshold: 0.7
# ---------------------------------------------------------------------------
# CATEGORY: Cryptographic Failures (OWASP A02:2021)
# ---------------------------------------------------------------------------
# tc008: data encrypted and decrypted with the DES cipher.
- id: tc008_weak_encryption_des
  description: "Detect use of weak encryption algorithms (DES, RC4)"
  category: cryptography
  priority: high
  input:
    code: |
      const crypto = require('crypto');
      function encryptData(data, key) {
        const cipher = crypto.createCipher('des', key);
        return cipher.update(data, 'utf8', 'hex') + cipher.final('hex');
      }
      function decryptData(data, key) {
        const decipher = crypto.createDecipher('des', key);
        return decipher.update(data, 'hex', 'utf8') + decipher.final('utf8');
      }
    context:
      language: javascript
      framework: nodejs
  expected_output:
    must_contain:
    - "DES"
    - "weak"
    - "deprecated"
    - "AES"
    must_match_regex:
    - "CWE-327|CWE-328"
    - "A02:2021"
    severity_classification: high
  validation:
    schema_check: true
    keyword_match_threshold: 0.7
# tc009: the raw password is inserted into the users table without hashing.
- id: tc009_plaintext_password_storage
  description: "Detect plaintext password storage"
  category: cryptography
  priority: critical
  input:
    code: |
      class User {
        constructor(email, password) {
          this.email = email;
          this.password = password; // Stored in plaintext!
        }
        save() {
          db.query('INSERT INTO users (email, password) VALUES (?, ?)',
            [this.email, this.password]);
        }
      }
    context:
      language: javascript
      framework: nodejs
  expected_output:
    must_contain:
    - "plaintext"
    - "password"
    - "hash"
    - "bcrypt"
    must_match_regex:
    - "CWE-256|CWE-312"
    - "A02:2021"
    severity_classification: critical
  validation:
    schema_check: true
    keyword_match_threshold: 0.8
# ---------------------------------------------------------------------------
# CATEGORY: Path Traversal (Related to A01:2021)
# ---------------------------------------------------------------------------
# tc010: user-supplied file names are appended to a base directory unchecked.
- id: tc010_path_traversal_file_access
  description: "Detect path traversal vulnerability in file access"
  category: injection
  priority: critical
  input:
    # The second route declares `:name`; without it `req.params.name` is never
    # populated and the read would not depend on user input.
    code: |
      const fs = require('fs');
      app.get('/download', (req, res) => {
        const filename = req.query.file;
        const filepath = './uploads/' + filename;
        res.sendFile(filepath);
      });
      app.get('/read/:name', (req, res) => {
        const content = fs.readFileSync('./data/' + req.params.name);
        res.send(content);
      });
    context:
      language: javascript
      framework: express
  expected_output:
    must_contain:
    - "path traversal"
    - "directory traversal"
    - "../"
    - "sanitize"
    must_match_regex:
    - "CWE-22|CWE-23"
    severity_classification: critical
  validation:
    schema_check: true
    keyword_match_threshold: 0.7
# ---------------------------------------------------------------------------
# CATEGORY: Negative Tests (No False Positives)
# ---------------------------------------------------------------------------
# tc011: negative test - helmet, rate limiting, email validation, bcrypt
# hashing and a parameterized insert; no serious finding is expected.
- id: tc011_secure_code_no_false_positives
  description: "Verify secure code is NOT flagged as vulnerable"
  category: negative
  priority: critical
  input:
    code: |
      const express = require('express');
      const helmet = require('helmet');
      const rateLimit = require('express-rate-limit');
      const bcrypt = require('bcrypt');
      const validator = require('validator');
      const app = express();
      app.use(helmet());
      app.use(rateLimit({ windowMs: 15 * 60 * 1000, max: 100 }));
      app.post('/api/users', async (req, res) => {
        const { email, password } = req.body;
        // Input validation
        if (!validator.isEmail(email)) {
          return res.status(400).json({ error: 'Invalid email' });
        }
        // Secure password hashing
        const hashedPassword = await bcrypt.hash(password, 12);
        // Parameterized query
        await db.query(
          'INSERT INTO users (email, password) VALUES ($1, $2)',
          [email, hashedPassword]
        );
        res.status(201).json({ message: 'User created' });
      });
    context:
      language: javascript
      framework: express
      environment: production
  expected_output:
    must_contain:
    - "secure"
    - "best practice"
    # NOTE(review): if matching is plain substring, praise such as "the
    # parameterized query prevents SQL injection" would trip these entries -
    # confirm the matcher's semantics.
    must_not_contain:
    - "SQL injection"
    - "XSS"
    - "critical vulnerability"
    - "high severity"
    finding_count:
      max: 2  # Allow informational findings only
  validation:
    schema_check: true
    keyword_match_threshold: 0.6
    allow_partial: true
# tc012: negative test - bcrypt comparison, uniform error messages and a JWT
# signed with a secret read from the environment.
- id: tc012_secure_auth_implementation
  description: "Verify secure authentication is recognized as safe"
  category: negative
  priority: high
  input:
    code: |
      const bcrypt = require('bcrypt');
      const jwt = require('jsonwebtoken');
      async function login(email, password) {
        const user = await User.findByEmail(email);
        if (!user) {
          return { error: 'Invalid credentials' };
        }
        const match = await bcrypt.compare(password, user.passwordHash);
        if (!match) {
          return { error: 'Invalid credentials' };
        }
        const token = jwt.sign(
          { userId: user.id },
          process.env.JWT_SECRET,
          { expiresIn: '1h' }
        );
        return { token };
      }
    context:
      language: javascript
      framework: nodejs
  expected_output:
    # NOTE(review): "jwt" is lowercase while a response will usually write
    # "JWT" - confirm keyword matching is case-insensitive.
    must_contain:
    - "bcrypt"
    - "jwt"
    - "secure"
    # NOTE(review): "the secret is not hardcoded" or "not vulnerable" would
    # trip these entries under plain substring matching - confirm.
    must_not_contain:
    - "vulnerable"
    - "critical"
    - "hardcoded"
    severity_classification: info
  validation:
    schema_check: true
    allow_partial: true
# ---------------------------------------------------------------------------
# CATEGORY: Python Security (Multi-language Support)
# ---------------------------------------------------------------------------
# tc013: a Flask query-string value is formatted into SQL with an f-string.
- id: tc013_python_sql_injection
  description: "Detect SQL injection in Python Flask application"
  category: injection
  priority: critical
  input:
    # The function body is indented so the sample is syntactically valid Python.
    code: |
      from flask import Flask, request
      import sqlite3
      app = Flask(__name__)
      @app.route('/user')
      def get_user():
          user_id = request.args.get('id')
          conn = sqlite3.connect('users.db')
          cursor = conn.cursor()
          cursor.execute(f"SELECT * FROM users WHERE id = {user_id}")
          return str(cursor.fetchone())
    context:
      language: python
      framework: flask
  expected_output:
    must_contain:
    - "SQL injection"
    - "parameterized"
    - "f-string"
    must_match_regex:
    - "CWE-89"
    severity_classification: critical
    finding_count:
      min: 1
  validation:
    schema_check: true
    keyword_match_threshold: 0.7
# tc014: a request parameter is passed directly to render_template_string.
- id: tc014_python_ssti_jinja
  description: "Detect Server-Side Template Injection in Jinja2"
  category: injection
  priority: critical
  input:
    # The function body is indented so the sample is syntactically valid Python.
    code: |
      from flask import Flask, request, render_template_string
      app = Flask(__name__)
      @app.route('/render')
      def render():
          template = request.args.get('template')
          return render_template_string(template)
    context:
      language: python
      framework: flask
  expected_output:
    must_contain:
    - "SSTI"
    - "template injection"
    - "render_template_string"
    - "Jinja2"
    must_match_regex:
    - "CWE-94|CWE-1336"
    severity_classification: critical
  validation:
    schema_check: true
    keyword_match_threshold: 0.7
# tc015: the raw request body is passed to pickle.loads.
- id: tc015_python_pickle_deserialization
  description: "Detect insecure deserialization with pickle"
  category: injection
  priority: critical
  input:
    # The function body is indented so the sample is syntactically valid Python.
    code: |
      import pickle
      from flask import Flask, request
      app = Flask(__name__)
      @app.route('/load')
      def load_data():
          data = request.get_data()
          obj = pickle.loads(data)
          return str(obj)
    context:
      language: python
      framework: flask
  expected_output:
    must_contain:
    - "pickle"
    - "deserialization"
    - "untrusted"
    - "RCE"
    must_match_regex:
    - "CWE-502"
    - "A08:2021"
    severity_classification: critical
  validation:
    schema_check: true
    keyword_match_threshold: 0.7
# =============================================================================
# SUCCESS CRITERIA
# =============================================================================
# Suite-level thresholds that decide whether an evaluation run passes.
success_criteria:
  # Overall pass rate (90% of tests must pass)
  pass_rate: 0.9
  # Critical tests must ALL pass (100%)
  critical_pass_rate: 1.0
  # Average reasoning quality score
  avg_reasoning_quality: 0.75
  # Maximum suite execution time (5 minutes)
  max_execution_time_ms: 300000
  # Maximum variance between model results (15%)
  cross_model_variance: 0.15
# =============================================================================
# METADATA
# =============================================================================
# Authorship and a coverage summary for the suite.
metadata:
  author: "qe-security-auditor"
  # Dates are quoted so they stay strings instead of parsed timestamps.
  created: "2026-02-02"
  last_updated: "2026-02-02"
  # The A03 list names only the injection types that have a test case in this
  # file (SQL, XSS, SSTI); no command-injection case is defined.
  coverage_target: >
    OWASP Top 10 2021: A01 (Broken Access Control), A02 (Cryptographic Failures),
    A03 (Injection - SQL, XSS, SSTI), A07 (Authentication Failures),
    A08 (Software Integrity - Deserialization). Covers JavaScript/Node.js
    Express apps and Python Flask apps. 15 test cases with 90% pass rate
    requirement and 100% critical pass rate.