diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..dfc5323 --- /dev/null +++ b/.env.example @@ -0,0 +1,46 @@ +# ============================================================================= +# Azaion Admin API — environment variable template +# Copy to `.env` (git-ignored) and fill in real values for your environment. +# Production secrets MUST come from the secret manager, not from a checked-in +# file. See _docs/04_deploy/reports/deploy_status_report.md for the full table. +# ============================================================================= + +# ---------- ASP.NET Core runtime -------------------------------------------- +ASPNETCORE_ENVIRONMENT=Development # Development | Staging | Production +ASPNETCORE_URLS=http://+:8080 # Kestrel bind address inside the container + +# ---------- Database (PostgreSQL on port 4312 in prod, 5432 in test) -------- +# Two roles: reader (read-only) and admin (read/write). See env/db/01_permissions.sql. +ASPNETCORE_ConnectionStrings__AzaionDb=Host=localhost;Port=4312;Database=azaion;Username=azaion_reader;Password=CHANGE_ME +ASPNETCORE_ConnectionStrings__AzaionDbAdmin=Host=localhost;Port=4312;Database=azaion;Username=azaion_admin;Password=CHANGE_ME + +# ---------- JWT (HMAC-SHA256, 4 h TTL) -------------------------------------- +ASPNETCORE_JwtConfig__Secret=CHANGE_ME_TO_A_RANDOM_STRING_AT_LEAST_32_BYTES +ASPNETCORE_JwtConfig__Issuer=AzaionApi +ASPNETCORE_JwtConfig__Audience=Annotators/OrangePi/Admins +ASPNETCORE_JwtConfig__TokenLifetimeHours=4 + +# ---------- Resource storage (filesystem) ----------------------------------- +ASPNETCORE_ResourcesConfig__ResourcesFolder=Content +ASPNETCORE_ResourcesConfig__SuiteInstallerFolder=suite +ASPNETCORE_ResourcesConfig__SuiteStageInstallerFolder=suite-stage + +# ---------- Container build / image label ------------------------------------ +# Injected at build time as --build-arg CI_COMMIT_SHA=… by Woodpecker. 
+# Local builds may leave it unset (Dockerfile defaults to "unknown"). +# CI_COMMIT_SHA= + +# ---------- Deploy targets (consumed by scripts/, not by the API process) --- +DEPLOY_HOST=admin.azaion.com # SSH target for scripts/deploy.sh +DEPLOY_SSH_USER=root # SSH user on DEPLOY_HOST +DEPLOY_CONTAINER_NAME=azaion.api # Docker container name on the host +DEPLOY_HOST_PORT=4000 # Port published on DEPLOY_HOST (mapped to 8080 in container) +DEPLOY_HOST_CONTENT_DIR=/root/api/content # Bind-mount for resource files +DEPLOY_HOST_LOGS_DIR=/root/api/logs # Bind-mount for Serilog rolling files + +# ---------- Container registry ---------------------------------------------- +REGISTRY_HOST=docker.azaion.com # Private registry; CI may use localhost:5000 +REGISTRY_IMAGE=azaion/admin # Image path inside REGISTRY_HOST +REGISTRY_TAG=dev-arm # main→arm, stage→stage-arm, dev→dev-arm +REGISTRY_USER= # CI / scripts only — leave empty in dev .env +REGISTRY_TOKEN= # CI / scripts only — leave empty in dev .env diff --git a/.woodpecker/01-test.yml b/.woodpecker/01-test.yml index 486f23a..1d70f79 100644 --- a/.woodpecker/01-test.yml +++ b/.woodpecker/01-test.yml @@ -2,18 +2,53 @@ when: event: [push, pull_request, manual] branch: [dev, stage, main] +matrix: + include: + - PLATFORM: arm64 + TAG_SUFFIX: arm + # - PLATFORM: amd64 + # TAG_SUFFIX: amd + labels: - platform: arm64 + platform: ${PLATFORM} steps: + - name: lint-format + image: mcr.microsoft.com/dotnet/sdk:10.0 + commands: + - dotnet format Azaion.AdminApi.sln --verify-no-changes --verbosity diagnostic + - name: unit-tests image: mcr.microsoft.com/dotnet/sdk:10.0 commands: - dotnet restore Azaion.AdminApi.sln - dotnet test Azaion.AdminApi.sln --no-restore --configuration Release --logger "console;verbosity=normal" --logger "trx;LogFileName=test-results.trx" --results-directory /app/test-results - - name: e2e-tests + - name: deps-audit image: mcr.microsoft.com/dotnet/sdk:10.0 commands: - - dotnet restore 
e2e/Azaion.E2E/Azaion.E2E.csproj - - dotnet test e2e/Azaion.E2E/Azaion.E2E.csproj --no-restore --configuration Release --logger "console;verbosity=normal" --logger "trx;LogFileName=e2e-results.trx" --results-directory /app/test-results + # Security audit recommendation 13: fail the build on any High or Critical + # vulnerable dependency. Rows are "> Pkg [Requested] Resolved Severity URL", so + # severity is not the last column; grep finding no match is the passing case. + - dotnet restore Azaion.AdminApi.sln + - dotnet list Azaion.AdminApi.sln package --vulnerable --include-transitive 2>&1 | tee deps-audit.log + - if grep -E "^\s*>\s.*\s(High|Critical)(\s.*)?$" deps-audit.log; then echo "Vulnerable High/Critical dependency found"; exit 1; fi + + - name: e2e-tests + image: docker + commands: + # Mirrors scripts/run-tests.sh: drop volumes from any prior run so the DB + # init scripts re-run on a clean data dir, then run compose to completion. + - docker compose -f docker-compose.test.yml down -v --remove-orphans + - docker compose -f docker-compose.test.yml up --build --abort-on-container-exit --exit-code-from e2e-consumer + volumes: + - /var/run/docker.sock:/var/run/docker.sock + + - name: e2e-cleanup + image: docker + when: + status: [success, failure] + commands: + - docker compose -f docker-compose.test.yml down -v --remove-orphans + volumes: + - /var/run/docker.sock:/var/run/docker.sock diff --git a/.woodpecker/02-build-push.yml b/.woodpecker/02-build-push.yml index 7600065..1b7f61d 100644 --- a/.woodpecker/02-build-push.yml +++ b/.woodpecker/02-build-push.yml @@ -29,15 +29,25 @@ steps: from_secret: registry_token commands: - echo "$REGISTRY_TOKEN" | docker login "$REGISTRY_HOST" -u "$REGISTRY_USER" --password-stdin - - export TAG=${CI_COMMIT_BRANCH}-${TAG_SUFFIX} + - export BRANCH_TAG=${CI_COMMIT_BRANCH}-${TAG_SUFFIX} + # 12-char SHA prefix is human-readable while still globally-unique inside + # the repo. Pair with TAG_SUFFIX so multi-arch entries don't collide. 
+ - export SHA_TAG=$(echo "$CI_COMMIT_SHA" | cut -c1-12)-${TAG_SUFFIX} - export BUILD_DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ) + - export IMAGE=$REGISTRY_HOST/azaion/admin - | docker build -f Dockerfile \ --build-arg CI_COMMIT_SHA=$CI_COMMIT_SHA \ + --build-arg BUILD_DATE=$BUILD_DATE \ --label org.opencontainers.image.revision=$CI_COMMIT_SHA \ --label org.opencontainers.image.created=$BUILD_DATE \ --label org.opencontainers.image.source=$CI_REPO_URL \ - -t $REGISTRY_HOST/azaion/admin:$TAG . - - docker push $REGISTRY_HOST/azaion/admin:$TAG + -t $IMAGE:$BRANCH_TAG \ + -t $IMAGE:$SHA_TAG . + # Mutable branch tag for "give me whatever's latest on dev" pulls. + - docker push $IMAGE:$BRANCH_TAG + # Immutable SHA tag — the deploy scripts pin to this and rollback uses it. + - docker push $IMAGE:$SHA_TAG + - echo "Pushed $IMAGE:$BRANCH_TAG and $IMAGE:$SHA_TAG" volumes: - /var/run/docker.sock:/var/run/docker.sock diff --git a/Azaion.AdminApi.sln b/Azaion.AdminApi.sln index bcea3fe..31ce6ae 100644 --- a/Azaion.AdminApi.sln +++ b/Azaion.AdminApi.sln @@ -12,7 +12,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Docker", "Docker", "{49FBE4 ProjectSection(SolutionItems) = preProject Dockerfile = Dockerfile .dockerignore = .dockerignore - deploy.cmd = deploy.cmd EndProjectSection EndProject Global diff --git a/Azaion.AdminApi/Program.cs b/Azaion.AdminApi/Program.cs index ef3516e..620493d 100644 --- a/Azaion.AdminApi/Program.cs +++ b/Azaion.AdminApi/Program.cs @@ -6,6 +6,7 @@ using Azaion.Common.Entities; using Azaion.Common.Requests; using Azaion.Services; using FluentValidation; +using LinqToDB.Data; using Microsoft.AspNetCore.Authentication.JwtBearer; using Microsoft.AspNetCore.Authorization; using Microsoft.AspNetCore.Mvc; @@ -33,6 +34,17 @@ if (jwtConfig == null || string.IsNullOrEmpty(jwtConfig.Secret)) throw new Exception("Missing configuration section: JwtConfig"); var signingKey = new SymmetricSecurityKey(Encoding.ASCII.GetBytes(jwtConfig.Secret)); +// Fail-fast for DB 
connection strings — surfaces a missing env var at startup +// instead of on the first request to a DB-backed endpoint. +var connectionStrings = builder.Configuration.GetSection(nameof(ConnectionStrings)).Get(); +if (connectionStrings == null + || string.IsNullOrEmpty(connectionStrings.AzaionDb) + || string.IsNullOrEmpty(connectionStrings.AzaionDbAdmin)) + throw new Exception("Missing configuration section: ConnectionStrings (AzaionDb and AzaionDbAdmin are required)"); + +// Graceful shutdown: 30 s for in-flight requests; pair with `docker stop -t 40`. +builder.Services.Configure(o => o.ShutdownTimeout = TimeSpan.FromSeconds(30)); + builder.Services.AddSerilog(); builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme) .AddJwtBearer(o => @@ -54,13 +66,9 @@ builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme) var apiAdminPolicy = new AuthorizationPolicyBuilder() .RequireRole(RoleEnum.ApiAdmin.ToString()).Build(); -var apiUploaderPolicy = new AuthorizationPolicyBuilder() - .RequireRole(RoleEnum.ResourceUploader.ToString(), RoleEnum.ApiAdmin.ToString()).Build(); - builder.Services.AddAuthorization(o => { o.AddPolicy(nameof(apiAdminPolicy), apiAdminPolicy); - o.AddPolicy(nameof(apiUploaderPolicy), apiUploaderPolicy); }); #endregion Policies @@ -98,7 +106,6 @@ builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddScoped(); builder.Services.AddScoped(); -builder.Services.AddScoped(); builder.Services.AddSingleton(); builder.Services.AddLazyCache(); @@ -134,6 +141,39 @@ app.UseAuthorization(); app.UseRewriter(new RewriteOptions().AddRedirect("^$", "/swagger")); +#region Health endpoints +// Anonymous; expected to be exposed only on the management interface (not via the +// public Nginx vhost). Surface contract documented in +// _docs/04_deploy/deployment_procedures.md §2 and observability.md §7. 
+ +app.MapGet("/health/live", (HttpContext http) => +{ + http.Response.Headers.CacheControl = "no-store"; + return Results.Ok(new { status = "live" }); +}).AllowAnonymous().ExcludeFromDescription(); + +app.MapGet("/health/ready", async (IDbFactory dbFactory, HttpContext http, CancellationToken ct) => +{ + http.Response.Headers.CacheControl = "no-store"; + using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct); + timeoutCts.CancelAfter(TimeSpan.FromSeconds(2)); // 2 s probe budget; enforced only because timeoutCts.Token is passed to the queries below + try + { + await dbFactory.Run(db => db.ExecuteAsync("SELECT 1", timeoutCts.Token)); + await dbFactory.RunAdmin(db => db.ExecuteAsync("SELECT 1", timeoutCts.Token)); + return Results.Ok(new { status = "ready" }); + } + catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !ct.IsCancellationRequested) + { + return Results.Json(new { status = "not-ready", reason = "db-timeout" }, statusCode: 503); + } + catch (Exception ex) + { + return Results.Json(new { status = "not-ready", reason = ex.GetType().Name }, statusCode: 503); + } +}).AllowAnonymous().ExcludeFromDescription(); +#endregion Health endpoints + app.MapPost("/login", async (LoginRequest request, IUserService userService, IAuthService authService, CancellationToken cancellationToken) => { @@ -298,32 +338,6 @@ app.MapDelete("/classes/{id:int}", .RequireAuthorization(apiAdminPolicy) .WithSummary("Deletes a detection class"); -app.MapPost("/get-update", - async (GetUpdateRequest request, IValidator validator, - IResourceUpdateService resourceUpdateService, CancellationToken ct) => - { - var validation = await validator.ValidateAsync(request, ct); - if (!validation.IsValid) - return Results.ValidationProblem(validation.ToDictionary()); - var updates = await resourceUpdateService.GetUpdate(request, ct); - return Results.Ok(updates); - }) - .RequireAuthorization() - .WithSummary("Returns resources newer than the device's reported current versions"); - -app.MapPost("/resources/publish", - async (PublishResourceRequest request, IValidator validator, - 
IResourceUpdateService resourceUpdateService, CancellationToken ct) => - { - var validation = await validator.ValidateAsync(request, ct); - if (!validation.IsValid) - return Results.ValidationProblem(validation.ToDictionary()); - await resourceUpdateService.Publish(request, ct); - return Results.Ok(); - }) - .RequireAuthorization(apiUploaderPolicy) - .WithSummary("CI/CD: publish a new resource version (encrypts encryption_key at rest, invalidates the per-(arch,stage) latest-versions cache)"); - app.UseExceptionHandler(_ => {}); app.Run(); diff --git a/Azaion.AdminApi/appsettings.json b/Azaion.AdminApi/appsettings.json index 0ce3fbf..95d9ed7 100644 --- a/Azaion.AdminApi/appsettings.json +++ b/Azaion.AdminApi/appsettings.json @@ -9,8 +9,7 @@ "ResourcesConfig": { "ResourcesFolder": "Content", "SuiteInstallerFolder": "suite", - "SuiteStageInstallerFolder": "suite-stage", - "EncryptionMasterKey": "" + "SuiteStageInstallerFolder": "suite-stage" }, "JwtConfig": { "Issuer": "AzaionApi", diff --git a/Azaion.Common/Azaion.Common.csproj b/Azaion.Common/Azaion.Common.csproj index 5659ec8..7e0dc2b 100644 --- a/Azaion.Common/Azaion.Common.csproj +++ b/Azaion.Common/Azaion.Common.csproj @@ -9,7 +9,7 @@ - + diff --git a/Azaion.Common/Configs/ResourcesConfig.cs b/Azaion.Common/Configs/ResourcesConfig.cs index 02b94c0..ecad0f2 100644 --- a/Azaion.Common/Configs/ResourcesConfig.cs +++ b/Azaion.Common/Configs/ResourcesConfig.cs @@ -5,11 +5,4 @@ public class ResourcesConfig public string ResourcesFolder { get; set; } = null!; public string SuiteInstallerFolder { get; set; } = null!; public string SuiteStageInstallerFolder { get; set; } = null!; - - /// - /// Master key used to AES-encrypt the per-resource encryption_key column at rest. - /// Required by AZ-183 constraint "encryption_key must be stored securely (... or via - /// application-level encryption)". Configure via ResourcesConfig__EncryptionMasterKey. 
- /// - public string EncryptionMasterKey { get; set; } = null!; } \ No newline at end of file diff --git a/Azaion.Common/Database/AzaionDb.cs b/Azaion.Common/Database/AzaionDb.cs index 4b7aff8..4d8b17e 100644 --- a/Azaion.Common/Database/AzaionDb.cs +++ b/Azaion.Common/Database/AzaionDb.cs @@ -8,5 +8,4 @@ public class AzaionDb(DataOptions dataOptions) : DataConnection(dataOptions) { public ITable Users => this.GetTable(); public ITable DetectionClasses => this.GetTable(); - public ITable Resources => this.GetTable(); } \ No newline at end of file diff --git a/Azaion.Common/Database/AzaionDbShemaHolder.cs b/Azaion.Common/Database/AzaionDbShemaHolder.cs index a8ce367..aa06444 100644 --- a/Azaion.Common/Database/AzaionDbShemaHolder.cs +++ b/Azaion.Common/Database/AzaionDbShemaHolder.cs @@ -42,12 +42,6 @@ public static class AzaionDbSchemaHolder .IsPrimaryKey() .IsIdentity(); - builder.Entity() - .HasTableName("resources") - .Property(x => x.Id) - .IsPrimaryKey() - .HasDataType(DataType.Guid); - builder.Build(); } } \ No newline at end of file diff --git a/Azaion.Common/Entities/Resource.cs b/Azaion.Common/Entities/Resource.cs deleted file mode 100644 index 0f77176..0000000 --- a/Azaion.Common/Entities/Resource.cs +++ /dev/null @@ -1,15 +0,0 @@ -namespace Azaion.Common.Entities; - -public class Resource -{ - public Guid Id { get; set; } - public string ResourceName { get; set; } = null!; - public string DevStage { get; set; } = null!; - public string Architecture { get; set; } = null!; - public string Version { get; set; } = null!; - public string CdnUrl { get; set; } = null!; - public string Sha256 { get; set; } = null!; - public string EncryptionKey { get; set; } = null!; - public long SizeBytes { get; set; } - public DateTime CreatedAt { get; set; } -} diff --git a/Azaion.Common/Requests/GetUpdateRequest.cs b/Azaion.Common/Requests/GetUpdateRequest.cs deleted file mode 100644 index 5b5d151..0000000 --- a/Azaion.Common/Requests/GetUpdateRequest.cs +++ /dev/null @@ 
-1,35 +0,0 @@ -using FluentValidation; - -namespace Azaion.Common.Requests; - -public class GetUpdateRequest -{ - public string Architecture { get; set; } = null!; - public string DevStage { get; set; } = null!; - - /// - /// Map of resource_name → currently-installed-version. Resources missing - /// from the map are treated as "device has no version of this resource yet" and - /// will be returned in the response if any version exists server-side. - /// - public Dictionary CurrentVersions { get; set; } = new(); -} - -public class GetUpdateValidator : AbstractValidator -{ - public GetUpdateValidator() - { - RuleFor(r => r.Architecture).NotEmpty().MaximumLength(40); - RuleFor(r => r.DevStage).NotEmpty().MaximumLength(40); - } -} - -public class ResourceUpdateItem -{ - public string ResourceName { get; set; } = null!; - public string Version { get; set; } = null!; - public string CdnUrl { get; set; } = null!; - public string Sha256 { get; set; } = null!; - public string EncryptionKey { get; set; } = null!; - public long SizeBytes { get; set; } -} diff --git a/Azaion.Common/Requests/PublishResourceRequest.cs b/Azaion.Common/Requests/PublishResourceRequest.cs deleted file mode 100644 index 2e022a4..0000000 --- a/Azaion.Common/Requests/PublishResourceRequest.cs +++ /dev/null @@ -1,30 +0,0 @@ -using FluentValidation; - -namespace Azaion.Common.Requests; - -public class PublishResourceRequest -{ - public string ResourceName { get; set; } = null!; - public string DevStage { get; set; } = null!; - public string Architecture { get; set; } = null!; - public string Version { get; set; } = null!; - public string CdnUrl { get; set; } = null!; - public string Sha256 { get; set; } = null!; - public string EncryptionKey { get; set; } = null!; - public long SizeBytes { get; set; } -} - -public class PublishResourceValidator : AbstractValidator -{ - public PublishResourceValidator() - { - RuleFor(r => r.ResourceName).NotEmpty().MaximumLength(120); - RuleFor(r => 
r.DevStage).NotEmpty().MaximumLength(40); - RuleFor(r => r.Architecture).NotEmpty().MaximumLength(40); - RuleFor(r => r.Version).NotEmpty().MaximumLength(40); - RuleFor(r => r.CdnUrl).NotEmpty().MaximumLength(500); - RuleFor(r => r.Sha256).NotEmpty().MaximumLength(128); - RuleFor(r => r.EncryptionKey).NotEmpty(); - RuleFor(r => r.SizeBytes).GreaterThan(0); - } -} diff --git a/Azaion.Services/Azaion.Services.csproj b/Azaion.Services/Azaion.Services.csproj index d8d6d85..5a520e1 100644 --- a/Azaion.Services/Azaion.Services.csproj +++ b/Azaion.Services/Azaion.Services.csproj @@ -16,7 +16,7 @@ - + diff --git a/Azaion.Services/ResourceUpdateService.cs b/Azaion.Services/ResourceUpdateService.cs deleted file mode 100644 index 6fecaba..0000000 --- a/Azaion.Services/ResourceUpdateService.cs +++ /dev/null @@ -1,140 +0,0 @@ -using System.Security.Cryptography; -using System.Text; -using Azaion.Common.Configs; -using Azaion.Common.Database; -using Azaion.Common.Entities; -using Azaion.Common.Requests; -using LinqToDB; -using Microsoft.Extensions.Options; - -namespace Azaion.Services; - -public interface IResourceUpdateService -{ - Task> GetUpdate(GetUpdateRequest request, CancellationToken ct = default); - Task Publish(PublishResourceRequest request, CancellationToken ct = default); -} - -public class ResourceUpdateService( - IDbFactory dbFactory, - ICache cache, - IOptions resourcesConfig) : IResourceUpdateService -{ - public static string CacheKey(string architecture, string devStage) - => $"Resources.Latest.{architecture}.{devStage}"; - - public async Task> GetUpdate(GetUpdateRequest request, CancellationToken ct = default) - { - var latest = await cache.GetFromCacheAsync( - CacheKey(request.Architecture, request.DevStage), - () => LoadLatest(request.Architecture, request.DevStage, ct)); - - var updates = new List(); - foreach (var (resourceName, resource) in latest) - { - var currentVersion = request.CurrentVersions.GetValueOrDefault(resourceName, ""); - if 
(string.CompareOrdinal(resource.Version, currentVersion) <= 0) - continue; - - updates.Add(new ResourceUpdateItem - { - ResourceName = resource.ResourceName, - Version = resource.Version, - CdnUrl = resource.CdnUrl, - Sha256 = resource.Sha256, - EncryptionKey = ResourceColumnEncryption.Decrypt(resource.EncryptionKey, MasterKey), - SizeBytes = resource.SizeBytes - }); - } - return updates; - } - - public async Task Publish(PublishResourceRequest request, CancellationToken ct = default) - { - await dbFactory.RunAdmin(async db => - { - await db.InsertAsync(new Resource - { - Id = Guid.NewGuid(), - ResourceName = request.ResourceName, - DevStage = request.DevStage, - Architecture = request.Architecture, - Version = request.Version, - CdnUrl = request.CdnUrl, - Sha256 = request.Sha256, - EncryptionKey = ResourceColumnEncryption.Encrypt(request.EncryptionKey, MasterKey), - SizeBytes = request.SizeBytes, - CreatedAt = DateTime.UtcNow - }, token: ct); - }); - cache.Invalidate(CacheKey(request.Architecture, request.DevStage)); - } - - private async Task> LoadLatest(string architecture, string devStage, CancellationToken ct) => - await dbFactory.Run(async db => - { - var rows = await db.Resources - .Where(r => r.Architecture == architecture && r.DevStage == devStage) - .ToListAsync(token: ct); - - return rows - .GroupBy(r => r.ResourceName) - .Select(g => g.OrderByDescending(r => r.Version, StringComparer.Ordinal).First()) - .ToDictionary(r => r.ResourceName); - }); - - private string MasterKey - { - get - { - var key = resourcesConfig.Value.EncryptionMasterKey; - if (string.IsNullOrEmpty(key)) - throw new InvalidOperationException( - "ResourcesConfig.EncryptionMasterKey is not configured. 
Set it via " + - "appsettings ResourcesConfig:EncryptionMasterKey or env ResourcesConfig__EncryptionMasterKey."); - return key; - } - } -} - -internal static class ResourceColumnEncryption -{ - public static string Encrypt(string plaintext, string masterKey) - { - using var aes = Aes.Create(); - aes.Mode = CipherMode.CBC; - aes.Padding = PaddingMode.PKCS7; - aes.Key = SHA256.HashData(Encoding.UTF8.GetBytes(masterKey)); - aes.GenerateIV(); - - var input = Encoding.UTF8.GetBytes(plaintext); - using var encryptor = aes.CreateEncryptor(); - var cipher = encryptor.TransformFinalBlock(input, 0, input.Length); - - var combined = new byte[aes.IV.Length + cipher.Length]; - Buffer.BlockCopy(aes.IV, 0, combined, 0, aes.IV.Length); - Buffer.BlockCopy(cipher, 0, combined, aes.IV.Length, cipher.Length); - return Convert.ToBase64String(combined); - } - - public static string Decrypt(string ciphertextBase64, string masterKey) - { - var combined = Convert.FromBase64String(ciphertextBase64); - using var aes = Aes.Create(); - aes.Mode = CipherMode.CBC; - aes.Padding = PaddingMode.PKCS7; - aes.Key = SHA256.HashData(Encoding.UTF8.GetBytes(masterKey)); - - var ivLen = aes.BlockSize / 8; - var iv = new byte[ivLen]; - Buffer.BlockCopy(combined, 0, iv, 0, ivLen); - aes.IV = iv; - - var cipher = new byte[combined.Length - ivLen]; - Buffer.BlockCopy(combined, ivLen, cipher, 0, cipher.Length); - - using var decryptor = aes.CreateDecryptor(); - var plain = decryptor.TransformFinalBlock(cipher, 0, cipher.Length); - return Encoding.UTF8.GetString(plain); - } -} diff --git a/Azaion.Services/UserService.cs b/Azaion.Services/UserService.cs index 58e2137..eae18d6 100644 --- a/Azaion.Services/UserService.cs +++ b/Azaion.Services/UserService.cs @@ -5,6 +5,7 @@ using Azaion.Common.Entities; using Azaion.Common.Extensions; using Azaion.Common.Requests; using LinqToDB; +using Npgsql; namespace Azaion.Services; @@ -31,27 +32,49 @@ public class UserService(IDbFactory dbFactory, ICache cache) : IUserService 
public async Task RegisterUser(RegisterUserRequest request, CancellationToken ct = default) { - await dbFactory.RunAdmin(async db => + try { - var existingUser = await db.Users.FirstOrDefaultAsync(u => u.Email == request.Email, token: ct); - if (existingUser != null) - throw new BusinessException(ExceptionEnum.EmailExists); - - await db.InsertAsync(new User + await dbFactory.RunAdmin(async db => { - Id = Guid.NewGuid(), - Email = request.Email, - PasswordHash = request.Password.ToHash(), - Role = request.Role, - CreatedAt = DateTime.UtcNow, - IsEnabled = true - }, token: ct); - }); + await db.InsertAsync(new User + { + Id = Guid.NewGuid(), + Email = request.Email, + PasswordHash = request.Password.ToHash(), + Role = request.Role, + CreatedAt = DateTime.UtcNow, + IsEnabled = true + }, token: ct); + }); + } + catch (PostgresException ex) when (ex.SqlState == PostgresErrorCodes.UniqueViolation) + { + throw new BusinessException(ExceptionEnum.EmailExists); + } } public async Task RegisterDevice(CancellationToken ct = default) { - return await dbFactory.RunAdmin(async db => + var (serial, email) = await NextDeviceIdentity(ct); + var password = Convert.ToHexString(RandomNumberGenerator.GetBytes(DevicePasswordBytes)).ToLowerInvariant(); + + await RegisterUser(new RegisterUserRequest + { + Email = email, + Password = password, + Role = RoleEnum.CompanionPC + }, ct); + + return new RegisterDeviceResponse + { + Serial = serial, + Email = email, + Password = password + }; + } + + private async Task<(string Serial, string Email)> NextDeviceIdentity(CancellationToken ct) => + await dbFactory.Run(async db => { var lastEmail = await db.Users .Where(u => u.Role == RoleEnum.CompanionPC) @@ -67,28 +90,10 @@ public class UserService(IDbFactory dbFactory, ICache cache) : IUserService nextNumber = current + 1; } - var serial = $"{DeviceEmailPrefix}{nextNumber.ToString($"D{SerialNumberLength}")}"; - var email = $"{serial}{DeviceEmailDomain}"; - var password = 
Convert.ToHexString(RandomNumberGenerator.GetBytes(DevicePasswordBytes)).ToLowerInvariant(); - - await db.InsertAsync(new User - { - Id = Guid.NewGuid(), - Email = email, - PasswordHash = password.ToHash(), - Role = RoleEnum.CompanionPC, - CreatedAt = DateTime.UtcNow, - IsEnabled = true - }, token: ct); - - return new RegisterDeviceResponse - { - Serial = serial, - Email = email, - Password = password - }; + var serial = $"{DeviceEmailPrefix}{nextNumber.ToString($"D{SerialNumberLength}")}"; + var email = $"{serial}{DeviceEmailDomain}"; + return (serial, email); }); - } public async Task GetByEmail(string? email, CancellationToken ct = default) { diff --git a/Dockerfile b/Dockerfile index 62d8c6c..1f2232e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,14 @@ FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS base +# curl is needed by the HEALTHCHECK below. CA certs and ICU are already in the +# aspnet:10.0 image. Trim the apt cache to keep the layer small. +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* +# Non-root user (security audit F-6 / AZ-518). The aspnet:10.0 image ships an +# `app` user; we only need to create + chown the dirs that get bind-mounted +# from the host so the runtime can write to them. +RUN mkdir -p /app/Content /app/logs \ + && chown -R app:app /app WORKDIR /app EXPOSE 8080 @@ -19,7 +29,15 @@ RUN arch=$([ "$TARGETARCH" = "amd64" ] && echo "x64" || echo "$TARGETARCH") && \ # Build runtime FROM base AS final ARG CI_COMMIT_SHA=unknown +ARG BUILD_DATE=unknown ENV AZAION_REVISION=$CI_COMMIT_SHA +LABEL org.opencontainers.image.title="azaion.admin-api" \ + org.opencontainers.image.revision="$CI_COMMIT_SHA" \ + org.opencontainers.image.created="$BUILD_DATE" \ + org.opencontainers.image.source="https://git.azaion.com/azaion/admin" WORKDIR /app -COPY --from=publish /app/publish . +COPY --from=publish --chown=app:app /app/publish . 
+USER app +HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \ + CMD curl --fail --silent --show-error http://localhost:8080/health/live || exit 1 ENTRYPOINT ["dotnet", "Azaion.AdminApi.dll"] diff --git a/_docs/02_document/architecture.md b/_docs/02_document/architecture.md index bf6320c..92d8ed6 100644 --- a/_docs/02_document/architecture.md +++ b/_docs/02_document/architecture.md @@ -62,10 +62,13 @@ | Entity | Description | Owned By Component | |--------|-------------|--------------------| -| User | System user with email, password hash, role, config (legacy `Hardware` column tombstoned per AZ-197) | 01 Data Layer | +| User | System user with email (UNIQUE-indexed via `users_email_uidx`), password hash, role, config (legacy `Hardware` column tombstoned per AZ-197). Subset of users have `Role = CompanionPC` and are auto-provisioned via `POST /devices` (AZ-196), which delegates the insert to `UserService.RegisterUser` (post-security-audit consolidation, finding F-3). | 01 Data Layer | | UserConfig | JSON-serialized per-user configuration (queue offsets) | 01 Data Layer | -| RoleEnum | Authorization role hierarchy (None → ApiAdmin) | 01 Data Layer | -| ExceptionEnum | Business error code catalog | Common Helpers | +| RoleEnum | Authorization role hierarchy (None → ApiAdmin); `ResourceUploader` retained as data only after the OTA endpoints were retired | 01 Data Layer | +| DetectionClass *(AZ-513, cycle 1)* | Operator-managed detection-class catalogue (Name, ShortName, Color, MaxSizeM, PhotoMode?) backing the UI Detection Classes table | 01 Data Layer | +| ExceptionEnum | Business error code catalog (HW-related codes 40/45 removed by AZ-197) | Common Helpers | + +> **Removed in cycle 1 / post-cycle-1**: the `Resource` entity, the `resources` table, and the OTA delivery flow (AZ-183 — F10) were reverted after the security audit (finding F-1). The data model no longer carries an OTA-artifact entity. 
**Key relationships**: - User → RoleEnum: each user has exactly one role @@ -112,9 +115,10 @@ No explicit availability, latency, throughput, or recovery targets found in the **Authorization**: Role-based (RBAC) via ASP.NET Core authorization policies: - `apiAdminPolicy` — requires `ApiAdmin` role -- `apiUploaderPolicy` — requires `ResourceUploader` or `ApiAdmin` (defined but never applied to any endpoint) - General `[Authorize]` — any authenticated user +> The `apiUploaderPolicy` was added by AZ-183 and removed in the post-cycle-1 revert along with the OTA endpoints it guarded. `RoleEnum.ResourceUploader` remains as data only. + **Data protection**: - At rest: Resources encrypted with AES-256-CBC using per-user derived key (email + password). The hardware-hash component was removed in AZ-197 (sealed-Jetson + SaaS architecture). - In transit: HTTPS (assumed, not enforced in code) diff --git a/_docs/02_document/components/01_data_layer/description.md b/_docs/02_document/components/01_data_layer/description.md index 2c72889..88353f1 100644 --- a/_docs/02_document/components/01_data_layer/description.md +++ b/_docs/02_document/components/01_data_layer/description.md @@ -29,12 +29,14 @@ ### Entities +> **Cycle 1 (2026-05-13) note** — `DetectionClass` (AZ-513) entity was added. `Resource` (AZ-183) was added then removed in the same cycle (post-cycle-1 revert; security audit F-1 + the OTA delivery model itself was deemed obsolete). The `User.Hardware` column is left in place as a tombstone (nullable, unused) per AZ-197. A UNIQUE INDEX `users_email_uidx` was added on `users.email` (security audit F-3, `env/db/06_users_email_unique.sql`). + ``` User: Id: Guid (PK) Email: string (required) PasswordHash: string (required) - Hardware: string? (optional) + Hardware: string? (optional — TOMBSTONED by AZ-197; nullable, unused; no application code reads or writes) Role: RoleEnum (required) CreatedAt: DateTime (required) LastLogin: DateTime? 
(optional) @@ -49,7 +51,19 @@ UserQueueOffsets: AnnotationsConfirmOffset: ulong AnnotationsCommandsOffset: ulong +DetectionClass (AZ-513): + Id: int (PK, DB-assigned identity) + Name, ShortName, Color: string + MaxSizeM: double + PhotoMode: string? + CreatedAt: DateTime + +// Resource entity — REMOVED post-cycle-1 (AZ-183 reverted). The `resources` +// table no longer exists; see env/db/ for the current migration set. + RoleEnum: None=0, Operator=10, Validator=20, CompanionPC=30, Admin=40, ResourceUploader=50, ApiAdmin=1000 +// ResourceUploader is now data-only — no endpoint policy references it +// after AZ-183 was reverted. ``` ### Configuration POCOs @@ -69,6 +83,7 @@ ResourcesConfig: ResourcesFolder: string SuiteInstallerFolder: string SuiteStageInstallerFolder: string + # EncryptionMasterKey was added by AZ-183 and removed in the post-cycle-1 revert. ``` ## 3. External API Specification @@ -81,23 +96,26 @@ N/A — internal component. | Query | Frequency | Hot Path | Index Needed | |-------|-----------|----------|--------------| -| `SELECT * FROM users WHERE email = ?` | High | Yes | Yes (email) | +| `SELECT * FROM users WHERE email = ?` | High | Yes | Yes — UNIQUE INDEX `users_email_uidx` on `email` (security audit F-3, `env/db/06_users_email_unique.sql`) | | `SELECT * FROM users` with optional filters | Medium | No | No | | `UPDATE users SET ... 
WHERE email = ?` | Medium | No | No | -| `INSERT INTO users` | Low | No | No | +| `INSERT INTO users` | Low | No | No (UNIQUE INDEX above also enforces single-row-per-email atomically) | | `DELETE FROM users WHERE email = ?` | Low | No | No | ### Caching Strategy | Data | Cache Type | TTL | Invalidation | |------|-----------|-----|-------------| -| User by email | In-memory (LazyCache) | 4 hours | On hardware update, queue offset update, hardware check | +| User by email | In-memory (LazyCache) | 4 hours | On `UpdateQueueOffsets` (post-AZ-197 — hardware paths gone) | + +> The `Resources.Latest.{arch}.{stage}` cache key (added by AZ-183) was removed in the post-cycle-1 revert. ### Storage Estimates | Table | Est. Row Count (1yr) | Row Size | Total Size | Growth Rate | |-------|---------------------|----------|------------|-------------| -| `users` | 100–1000 | ~500 bytes | ~500 KB | Low | +| `users` | 100–1000 web users + 2000–10000 CompanionPC device users (AZ-196 grows this) | ~500 bytes | ~5 MB | Medium (device fleet) | +| `detection_classes` (AZ-513) | 10–200 | ~250 bytes | ~50 KB | Low | ### Data Management @@ -116,7 +134,7 @@ N/A — internal component. | linq2db | 5.4.1 | ORM for PostgreSQL access | | Npgsql | 10.0.1 | PostgreSQL ADO.NET provider | | LazyCache | 2.4.0 | In-memory cache with async support | -| Newtonsoft.Json | 13.0.1 | JSON serialization for UserConfig | +| Newtonsoft.Json | 13.0.4 | JSON serialization for UserConfig (bumped from 13.0.1 by security audit D-1, GHSA-5crp-9r3c-p9vr) | **Error Handling Strategy**: - `DbFactory.LoadOptions` throws `ArgumentException` on empty connection strings (fail-fast at startup). @@ -167,7 +185,8 @@ N/A — internal component. 
- `Common/Configs/ResourcesConfig` - `Common/Entities/User` - `Common/Entities/RoleEnum` -- `Common/Database/AzaionDb` +- `Common/Entities/DetectionClass` *(added cycle 1, AZ-513)* +- `Common/Database/AzaionDb` (now also holds the `DetectionClasses` table; the `Resources` ITable added by AZ-183 was removed in the post-cycle-1 revert) - `Common/Database/AzaionDbSchemaHolder` - `Common/Database/DbFactory` - `Services/Cache` diff --git a/_docs/02_document/components/02_user_management/description.md b/_docs/02_document/components/02_user_management/description.md index d3c7799..f05fc62 100644 --- a/_docs/02_document/components/02_user_management/description.md +++ b/_docs/02_document/components/02_user_management/description.md @@ -1,12 +1,14 @@ # User Management +> **Cycle 1 (2026-05-13) note** — hardware-binding methods (`UpdateHardware`, `CheckHardwareHash`) and `SetHWRequest` were removed by AZ-197; the `ValidateUser` error set now includes `UserDisabled`; `RegisterDevice` was added by AZ-196 to back the new `POST /devices` endpoint. Post-cycle-1 (security audit F-3): `RegisterDevice` now reuses `RegisterUser` for the row insert; the duplicate-row race was closed by adding a UNIQUE INDEX on `users.email` (`env/db/06_users_email_unique.sql`) and translating `Npgsql.PostgresException(SqlState=23505)` to `BusinessException(EmailExists)` inside `RegisterUser`. + ## 1. High-Level Overview -**Purpose**: Full user lifecycle management — registration, credential validation, hardware binding, role changes, account enable/disable, and deletion. +**Purpose**: Full user lifecycle management — web-user registration, credential validation, role changes, account enable/disable, deletion, plus auto-provisioning of CompanionPC device users. **Architectural Pattern**: Service layer — stateless business logic operating on the Data Layer through `IDbFactory`. -**Upstream dependencies**: Data Layer (IDbFactory, ICache, User entity), Security & Cryptography (hashing). 
+**Upstream dependencies**: Data Layer (IDbFactory, ICache, User entity), Security & Cryptography (hashing), `System.Security.Cryptography.RandomNumberGenerator` (device password entropy). **Downstream consumers**: Admin API (endpoint handlers), Authentication (GetByEmail). @@ -16,18 +18,19 @@ | Method | Input | Output | Async | Error Types | |--------|-------|--------|-------|-------------| -| `RegisterUser` | `RegisterUserRequest, CancellationToken` | void | Yes | `BusinessException(EmailExists)` | -| `ValidateUser` | `LoginRequest, CancellationToken` | `User` | Yes | `BusinessException(NoEmailFound, WrongPassword)` | +| `RegisterUser` | `RegisterUserRequest, CancellationToken` | void | Yes | `BusinessException(EmailExists)` — translated from `PostgresException(23505)` after the F-3 hardening | +| `RegisterDevice` | `CancellationToken` | `RegisterDeviceResponse` | Yes | `BusinessException(EmailExists)` (propagated from `RegisterUser`) — added by AZ-196, refactored post-audit to call `RegisterUser` end-to-end | +| `ValidateUser` | `LoginRequest, CancellationToken` | `User` | Yes | `BusinessException(NoEmailFound, WrongPassword, UserDisabled)` | | `GetByEmail` | `string? email, CancellationToken` | `User?` | Yes | `ArgumentNullException` | -| `UpdateHardware` | `string email, string? hardware, CancellationToken` | void | Yes | None | | `UpdateQueueOffsets` | `string email, UserQueueOffsets, CancellationToken` | void | Yes | None | | `GetUsers` | `string? searchEmail, RoleEnum? 
searchRole, CancellationToken` | `IEnumerable` | Yes | None | -| `CheckHardwareHash` | `User, string hardware, CancellationToken` | `string` (hash) | Yes | `BusinessException(HardwareIdMismatch)` | | `ChangeRole` | `string email, RoleEnum, CancellationToken` | void | Yes | None | | `SetEnableStatus` | `string email, bool, CancellationToken` | void | Yes | None | | `RemoveUser` | `string email, CancellationToken` | void | Yes | None | -**Input DTOs**: +**Removed by AZ-197**: `UpdateHardware`, `CheckHardwareHash`, and the private `UpdateLastLoginDate` helper. + +**Input / Output DTOs**: ``` RegisterUserRequest: Email: string (required) — validated: min 8 chars, valid email format @@ -38,9 +41,10 @@ LoginRequest: Email: string (required) Password: string (required) -SetHWRequest: - Email: string (required, validated: not empty) - Hardware: string? (optional — null clears hardware) +RegisterDeviceResponse (AZ-196): + Serial: string ("azj-NNNN", zero-padded) + Email: string ("azj-NNNN@azaion.com") + Password: string (32-char hex, plaintext, exposed exactly once) SetUserQueueOffsetsRequest: Email: string (required) @@ -67,7 +71,7 @@ N/A — exposed through Admin API component. | Data | Cache Type | TTL | Invalidation | |------|-----------|-----|-------------| -| User by email | In-memory (via ICache) | 4 hours | After UpdateHardware, UpdateQueueOffsets, CheckHardwareHash (first login) | +| User by email | In-memory (via ICache) | 4 hours | After `UpdateQueueOffsets` (only — `UpdateHardware` / `CheckHardwareHash` invalidations are gone with AZ-197) | ## 5. Implementation Details @@ -89,20 +93,21 @@ N/A — exposed through Admin API component. 
| Helper | Purpose | Used By | |--------|---------|---------| -| `Security.ToHash` | Password hashing (SHA-384) | RegisterUser, ValidateUser | -| `Security.GetHWHash` | Hardware fingerprint hashing | CheckHardwareHash | +| `Security.ToHash` | Password hashing (SHA-384) | RegisterUser, RegisterDevice, ValidateUser | +| `RandomNumberGenerator.GetBytes(16)` + `Convert.ToHexString` | 32-char hex device password | RegisterDevice | | `QueryableExtensions.WhereIf` | Conditional LINQ filters | GetUsers | ## 7. Caveats & Edge Cases **Known limitations**: - No pagination on `GetUsers` — returns all matching users. -- `CheckHardwareHash` auto-stores hardware on first access (no explicit admin approval step). - `RemoveUser` is a hard delete, not soft delete. +- `RegisterDevice` returns the plaintext password to the caller exactly once; if the provisioning script loses it, the device must be re-registered. +- The `User.Hardware` column is left in place but unused (AZ-197 chose to leave the column nullable rather than ship a migration). **Potential race conditions**: - Concurrent `RegisterUser` calls with the same email: both could pass the existence check before insert. Mitigated by the UNIQUE INDEX `users_email_uidx` on `users.email` (`env/db/06_users_email_unique.sql`); the losing insert fails with `PostgresException(23505)`, which `RegisterUser` translates to `BusinessException(EmailExists)`. -- `CheckHardwareHash` first-login path: concurrent requests could trigger multiple hardware updates. +- Concurrent `RegisterDevice` calls: both could read the same "most recent CompanionPC" row and try to claim the same `azj-NNNN` serial. Mitigated by the `users.email` unique constraint — the loser will fail the insert. (Out of cycle-1 scope: a sequence-based serial allocator would eliminate the retry.) **Performance bottlenecks**: - `GetUsers` loads full user objects including `UserConfig` JSON; for large user bases, projection would be more efficient. @@ -123,5 +128,7 @@ No explicit logging in UserService.
- `Services/UserService` - `Common/Requests/LoginRequest` - `Common/Requests/RegisterUserRequest` -- `Common/Requests/SetHWRequest` +- `Common/Requests/RegisterDeviceResponse` *(added cycle 1, AZ-196)* - `Common/Requests/SetUserQueueOffsetsRequest` + +**Removed cycle 1 (AZ-197)**: `Common/Requests/SetHWRequest` diff --git a/_docs/02_document/components/03_auth_and_security/description.md b/_docs/02_document/components/03_auth_and_security/description.md index 1798485..7eb34a4 100644 --- a/_docs/02_document/components/03_auth_and_security/description.md +++ b/_docs/02_document/components/03_auth_and_security/description.md @@ -1,14 +1,16 @@ # Authentication & Security +> **Cycle 1 (2026-05-13) note** — AZ-197 simplified `GetApiEncryptionKey` to `(email, password)` and removed `GetHWHash` outright. The hardware-binding threat model that motivated those primitives is no longer in scope (fTPM-anchored Jetsons + browser SaaS). + ## 1. High-Level Overview -**Purpose**: JWT token creation/validation and cryptographic utilities (password hashing, hardware fingerprint hashing, AES file encryption/decryption). +**Purpose**: JWT token creation/validation and cryptographic utilities (password hashing, AES file encryption/decryption). **Architectural Pattern**: Service + static utility — `AuthService` is a DI-managed service for JWT operations; `Security` is a static class for cryptographic primitives. **Upstream dependencies**: Data Layer (JwtConfig, IUserService for GetByEmail), ASP.NET Core (IHttpContextAccessor). -**Downstream consumers**: Admin API (token creation on login, current user resolution), User Management (password hashing, hardware hashing), Resource Management (encryption key derivation, stream encryption). +**Downstream consumers**: Admin API (token creation on login, current user resolution), User Management (password hashing for both web users and provisioned devices), Resource Management (encryption key derivation, stream encryption). ## 2. 
Internal Interfaces @@ -24,11 +26,12 @@ | Method | Input | Output | Description | |--------|-------|--------|-------------| | `ToHash` | `string` | `string` (Base64) | SHA-384 hash | -| `GetHWHash` | `string hardware` | `string` (Base64) | Salted hardware hash | -| `GetApiEncryptionKey` | `string email, string password, string? hwHash` | `string` (Base64) | Derives AES encryption key | +| `GetApiEncryptionKey` | `string email, string password` | `string` (Base64) | Derives the per-user AES encryption key string. **Signature simplified by AZ-197** (`hardwareHash` parameter removed). | | `EncryptTo` | `Stream input, Stream output, string key, CancellationToken` | void | AES-256-CBC encrypt stream | | `DecryptTo` | `Stream encrypted, Stream output, string key, CancellationToken` | void | AES-256-CBC decrypt stream | +**Removed by AZ-197**: `GetHWHash(string hardware)` — no remaining callers in the post-cycle-1 codebase. + ## 3. External API Specification N/A — exposed through Admin API. @@ -62,11 +65,13 @@ None — `Security` itself is a utility consumed by other components. ## 7. Caveats & Edge Cases **Known limitations**: -- Password hashing uses SHA-384 without per-user salt or key stretching. Not resistant to rainbow table attacks. -- Hardware and encryption key salts are hardcoded constants. +- Password hashing uses SHA-384 without per-user salt or key stretching. Not resistant to rainbow table attacks. (Unchanged by cycle 1.) +- The encryption-key salt is a hardcoded constant. (`Security.GetApiEncryptionKey` body — see `services_security.md`.) - `GetCurrentUserEmail` assumes `ClaimTypes.Name` is always present; accessing a missing key would throw `KeyNotFoundException`. - AES encryption prepends IV as first 16 bytes — consumers must know this format. +**Removed in cycle 1**: hardware fingerprint hashing was a known weakness (static salt, no rotation); deleting it via AZ-197 also removed that attack surface. 
+ **Performance bottlenecks**: - Large file encryption loads encrypted output into `MemoryStream` before sending — high memory usage for large files. diff --git a/_docs/02_document/components/04_resource_management/description.md b/_docs/02_document/components/04_resource_management/description.md index acb2e89..aff0487 100644 --- a/_docs/02_document/components/04_resource_management/description.md +++ b/_docs/02_document/components/04_resource_management/description.md @@ -1,12 +1,14 @@ # Resource Management +> **Cycle 1 (2026-05-13) note** — AZ-197 removed the `Hardware` field from `GetResourceRequest` and removed `CheckResourceRequest` and `POST /resources/check` entirely. AZ-183 introduced an OTA update path (`POST /get-update`, `POST /resources/publish`, `IResourceUpdateService`, `Resource` entity, `resources` table, `ResourcesConfig.EncryptionMasterKey`) but it was reverted later the same day after the security audit (finding F-1) — the OTA delivery model itself was deemed obsolete. The component is now back to filesystem-backed storage only. + ## 1. High-Level Overview -**Purpose**: Server-side file storage management — upload, list, download (with per-user AES encryption), folder clearing, and installer distribution. +**Purpose**: filesystem-backed storage — upload, list, download (per-user AES-encrypted), folder clearing, installer distribution. Owned by `IResourcesService`. -**Architectural Pattern**: Service layer — filesystem operations with encryption applied at the service boundary. +**Architectural Pattern**: a single service over the local filesystem. No DB access, no cache. -**Upstream dependencies**: Data Layer (ResourcesConfig), Authentication & Security (encryption via Security.EncryptTo). +**Upstream dependencies**: Data Layer (`ResourcesConfig`), Authentication & Security (encryption via `Security.EncryptTo`). **Downstream consumers**: Admin API (resource endpoints). @@ -22,15 +24,15 @@ | `ListResources` | `string? dataFolder, string? 
search, CancellationToken` | `IEnumerable` | Yes | `DirectoryNotFoundException` | | `ClearFolder` | `string? dataFolder` | void | No | None | -**Input DTOs**: +**Input DTO**: ``` -GetResourceRequest: +GetResourceRequest (post-AZ-197): Password: string (required, min 8 chars) - Hardware: string (required, not empty) FileName: string (required, not empty) + // Hardware field removed by AZ-197. -CheckResourceRequest: - Hardware: string (required) +// CheckResourceRequest — REMOVED by AZ-197. +// GetUpdateRequest, PublishResourceRequest — added by AZ-183, removed in the post-cycle-1 revert. ``` ## 3. External API Specification @@ -39,17 +41,21 @@ N/A — exposed through Admin API. ## 4. Data Access Patterns -No database access. All operations are filesystem-based. +`ResourcesService` is filesystem-only — no DB access, no cache. + +| Source | Service | Pattern | +|--------|---------|---------| +| Filesystem (`ResourcesConfig.ResourcesFolder`) | `ResourcesService` | Direct read/write/delete | ### Storage Estimates -Resources are stored as flat files in configured directories. Size depends on uploaded content (AI models, DLLs, installers — potentially hundreds of MB per file). +- **Filesystem**: AI models, DLLs, installers — potentially hundreds of MB per file. ## 5. Implementation Details -**State Management**: Stateless — reads/writes directly to filesystem. +**State Management**: stateless — reads/writes directly to filesystem. -**Key Dependencies**: None beyond BCL (System.IO). +**Key Dependencies**: none beyond BCL (System.IO). **Error Handling Strategy**: - `SaveResource` throws `BusinessException(NoFileProvided)` for null uploads. @@ -61,13 +67,13 @@ Resources are stored as flat files in configured directories. 
Size depends on up | Helper | Purpose | Used By | |--------|---------|---------| -| `Security.EncryptTo` | AES stream encryption | GetEncryptedResource | -| `Security.GetApiEncryptionKey` | Key derivation | Admin API (before calling GetEncryptedResource) | +| `Security.EncryptTo` | AES stream encryption | `GetEncryptedResource` | +| `Security.GetApiEncryptionKey(email, password)` | Per-user key derivation (post-AZ-197 — no hardware component) | Admin API (before calling `GetEncryptedResource`) | ## 7. Caveats & Edge Cases -**Known limitations**: -- No path traversal protection: `dataFolder` parameter is concatenated directly with `ResourcesFolder`. A malicious `dataFolder` like `../../etc` could access arbitrary filesystem paths. +**Known limitations** (security-audit findings): +- **F-2 (High)** — no path traversal protection: `dataFolder` parameter is concatenated directly with `ResourcesFolder`. A malicious `dataFolder` like `../../etc` could access arbitrary filesystem paths. Filed as separate ticket. - `SaveResource` deletes existing file before writing — no versioning or backup. - `GetEncryptedResource` loads the entire encrypted file into a `MemoryStream` — memory-intensive for large files. - `ListResources` wraps a synchronous `DirectoryInfo.GetFiles` in `Task.FromResult` — not truly async. @@ -90,11 +96,11 @@ Resources are stored as flat files in configured directories. Size depends on up |-----------|------|---------| | INFO | Successful file save | `Resource {data.FileName} Saved Successfully` | -**Log format**: String interpolation via Serilog. +**Log format**: string interpolation via Serilog (security audit F-12 hygiene item: convert to structured form). -**Log storage**: Console + rolling file (via Serilog configured in Program.cs). +**Log storage**: console + rolling file (via Serilog configured in Program.cs). 
## Modules Covered - `Services/ResourcesService` -- `Common/Requests/GetResourceRequest` (includes CheckResourceRequest) -- `Common/Configs/ResourcesConfig` +- `Common/Requests/GetResourceRequest` (post-AZ-197 — no `CheckResourceRequest`, no `Hardware` field) +- `Common/Configs/ResourcesConfig` (the `EncryptionMasterKey` field added by AZ-183 was removed in the post-cycle-1 revert) diff --git a/_docs/02_document/components/05_admin_api/description.md b/_docs/02_document/components/05_admin_api/description.md index 76cf65c..3d95840 100644 --- a/_docs/02_document/components/05_admin_api/description.md +++ b/_docs/02_document/components/05_admin_api/description.md @@ -22,6 +22,8 @@ Converts `BusinessException` to HTTP 409 JSON response: `{ ErrorCode: int, Messa ## 3. External API Specification +> **Cycle 1 (2026-05-13) note** — endpoints below reflect the post-cycle-1 surface (AZ-513 Detection Classes CRUD, AZ-196 device auto-provisioning, AZ-197 hardware-binding removal). AZ-183 (OTA) shipped in cycle 1 but was reverted later the same day after the security audit (finding F-1) — the OTA delivery model itself was deemed obsolete. For per-endpoint cycle origins see `modules/admin_api_program.md`. 
+ ### Authentication | Endpoint | Method | Auth | Description | |----------|--------|------|-------------| @@ -31,29 +33,41 @@ Converts `BusinessException` to HTTP 409 JSON response: `{ ErrorCode: int, Messa | Endpoint | Method | Auth | Description | |----------|--------|------|-------------| | `/users` | POST | ApiAdmin | Creates a new user | +| `/devices` | POST | ApiAdmin | **AZ-196**: provisions a CompanionPC device user (returns serial + email + plaintext password once) | | `/users/current` | GET | Authenticated | Returns current user | | `/users` | GET | ApiAdmin | Lists users (optional email/role filters) | -| `/users/hardware/set` | PUT | ApiAdmin | Sets user hardware | | `/users/queue-offsets/set` | PUT | Authenticated | Updates queue offsets | | `/users/{email}/set-role/{role}` | PUT | ApiAdmin | Changes user role | | `/users/{email}/enable` | PUT | ApiAdmin | Enables user | | `/users/{email}/disable` | PUT | ApiAdmin | Disables user | | `/users/{email}` | DELETE | ApiAdmin | Removes user | +**Removed by AZ-197**: `PUT /users/hardware/set` (Hardware-binding feature deleted) + ### Resource Management | Endpoint | Method | Auth | Description | |----------|--------|------|-------------| | `/resources/{dataFolder?}` | POST | Authenticated | Uploads a file (up to 200 MB) | | `/resources/list/{dataFolder?}` | GET | Authenticated | Lists files | | `/resources/clear/{dataFolder?}` | POST | ApiAdmin | Clears folder | -| `/resources/get/{dataFolder?}` | POST | Authenticated | Downloads encrypted resource | +| `/resources/get/{dataFolder?}` | POST | Authenticated | Downloads encrypted resource (key derived from `email + password` only — no Hardware) | | `/resources/get-installer` | GET | Authenticated | Downloads production installer | | `/resources/get-installer/stage` | GET | Authenticated | Downloads staging installer | -| `/resources/check` | POST | Authenticated | Validates hardware | + +**Removed by AZ-197**: `POST /resources/check` (was the hardware-binding 
side-effect probe). +**Removed in post-cycle-1 revert**: `POST /get-update` and `POST /resources/publish` (AZ-183 reverted — security audit F-1; OTA delivery model itself obsolete). + +### Detection Classes +| Endpoint | Method | Auth | Description | +|----------|--------|------|-------------| +| `/classes` | POST | ApiAdmin | **AZ-513**: creates a detection class | +| `/classes/{id:int}` | PATCH | ApiAdmin | **AZ-513**: partial-merge update of a detection class | +| `/classes/{id:int}` | DELETE | ApiAdmin | **AZ-513**: deletes a detection class | ### Authorization Policies - **apiAdminPolicy**: requires `ApiAdmin` role (used on most admin endpoints) -- **apiUploaderPolicy**: requires `ResourceUploader` or `ApiAdmin` role (**defined but never applied to any endpoint — dead code**) + +> The `apiUploaderPolicy` was added by AZ-183 and removed in the post-cycle-1 revert along with the OTA endpoints it guarded. `RoleEnum.ResourceUploader` remains as data only. ### CORS - Allowed origins: `https://admin.azaion.com`, `http://admin.azaion.com` diff --git a/_docs/02_document/module-layout.md b/_docs/02_document/module-layout.md index a1fdd67..7098481 100644 --- a/_docs/02_document/module-layout.md +++ b/_docs/02_document/module-layout.md @@ -52,10 +52,11 @@ These come from `_docs/02_document/components/` and exist for reading the codeba | # | Sub-component | Primary file locations | |---|----------------------|------------------------| -| 1 | Data Layer | `Azaion.Common/Database/`, `Azaion.Common/Configs/`, `Azaion.Common/Entities/` | -| 2 | User Management | `Azaion.Services/UserService.cs`, `Azaion.Common/Requests/{Create,Update,SetPassword,…}UserRequest.cs` | -| 3 | Auth & Security | `Azaion.Services/AuthService.cs`, `Azaion.Services/Security.cs`, `Azaion.Services/Cache.cs` | -| 4 | Resource Management | `Azaion.Services/ResourcesService.cs`, `Azaion.Common/Requests/{GetResource,CheckResources,…}.cs` | +| 1 | Data Layer | `Azaion.Common/Database/`, 
`Azaion.Common/Configs/`, `Azaion.Common/Entities/` (incl. `DetectionClass.cs` added cycle 1; `Resource.cs` added then removed in same cycle — see post-cycle-1 revert) | +| 2 | User Management | `Azaion.Services/UserService.cs` (incl. `RegisterDevice` added cycle 1 / AZ-196 — calls `RegisterUser` end-to-end after security-audit consolidation, finding F-3), `Azaion.Common/Requests/{RegisterUserRequest,RegisterDeviceResponse}.cs`, `LoginRequest.cs`, `SetUserQueueOffsetsRequest.cs` | +| 3 | Auth & Security | `Azaion.Services/AuthService.cs`, `Azaion.Services/Security.cs` (post-AZ-197 — `GetHWHash` removed; `GetApiEncryptionKey` signature simplified), `Azaion.Services/Cache.cs` | +| 4 | Resource Management | `Azaion.Services/ResourcesService.cs`, `Azaion.Common/Requests/GetResourceRequest.cs` (`SetHWRequest.cs` removed by AZ-197; `ResourceUpdateService.cs` + `GetUpdateRequest.cs` + `PublishResourceRequest.cs` removed when AZ-183 was reverted) | +| 4b | Detection Classes | `Azaion.Services/DetectionClassService.cs` + `Azaion.Common/Requests/{Create,Update}DetectionClassRequest.cs` (added cycle 1 / AZ-513) | | 5 | Admin API (HTTP) | `Azaion.AdminApi/Program.cs`, `Azaion.AdminApi/BusinessExceptionHandler.cs`, `Azaion.AdminApi/appsettings*.json` | ## Allowed Dependencies (csproj layering) diff --git a/_docs/02_document/modules/admin_api_program.md b/_docs/02_document/modules/admin_api_program.md index e5a6f5e..edcc05e 100644 --- a/_docs/02_document/modules/admin_api_program.md +++ b/_docs/02_document/modules/admin_api_program.md @@ -5,25 +5,40 @@ Application entry point: configures DI, middleware, authentication, authorizatio ## Public Interface (HTTP Endpoints) -| Method | Path | Auth | Summary | -|--------|------|------|---------| -| POST | `/login` | Anonymous | Validates credentials, returns JWT token | -| POST | `/users` | ApiAdmin | Creates a new user | -| GET | `/users/current` | Any authenticated | Returns current user from JWT claims | -| GET | `/users` | ApiAdmin | Lists users with optional
email/role filters | -| PUT | `/users/hardware/set` | ApiAdmin | Sets a user's hardware fingerprint | -| PUT | `/users/queue-offsets/set` | Any authenticated | Updates user's queue offsets | -| PUT | `/users/{email}/set-role/{role}` | ApiAdmin | Changes a user's role | -| PUT | `/users/{email}/enable` | ApiAdmin | Enables a user account | -| PUT | `/users/{email}/disable` | ApiAdmin | Disables a user account | -| DELETE | `/users/{email}` | ApiAdmin | Removes a user | -| POST | `/resources/{dataFolder?}` | Any authenticated | Uploads a resource file | -| GET | `/resources/list/{dataFolder?}` | Any authenticated | Lists files in a resource folder | -| POST | `/resources/clear/{dataFolder?}` | ApiAdmin | Clears a resource folder | -| POST | `/resources/get/{dataFolder?}` | Any authenticated | Downloads an encrypted resource | -| GET | `/resources/get-installer` | Any authenticated | Downloads latest production installer | -| GET | `/resources/get-installer/stage` | Any authenticated | Downloads latest staging installer | -| POST | `/resources/check` | Any authenticated | Validates hardware fingerprint | +> **Cycle 1 (2026-05-13) note** — endpoint surface changed by AZ-513 (detection-class CRUD), AZ-196 (device auto-registration), AZ-197 (hardware-binding removal). AZ-183 (OTA update check + publish) was reverted later the same day after the security audit (finding F-1) — the OTA delivery model itself was deemed obsolete; see `_docs/05_security/security_report.md` for context. The table reflects the post-cycle-1 state including that revert. 
+ +| Method | Path | Auth | Summary | Cycle 1 origin | +|--------|------|------|---------|----------------| +| POST | `/login` | Anonymous | Validates credentials, returns JWT token | — | +| POST | `/users` | ApiAdmin | Creates a new user | — | +| POST | `/devices` | ApiAdmin | Creates a CompanionPC device user (auto serial / email / 32-hex password) | AZ-196 | +| GET | `/users/current` | Any authenticated | Returns current user from JWT claims | — | +| GET | `/users` | ApiAdmin | Lists users with optional email/role filters | — | +| PUT | `/users/queue-offsets/set` | Any authenticated | Updates user's queue offsets | — | +| PUT | `/users/{email}/set-role/{role}` | ApiAdmin | Changes a user's role | — | +| PUT | `/users/{email}/enable` | ApiAdmin | Enables a user account | — | +| PUT | `/users/{email}/disable` | ApiAdmin | Disables a user account | — | +| DELETE | `/users/{email}` | ApiAdmin | Removes a user | — | +| POST | `/resources/{dataFolder?}` | Any authenticated | Uploads a resource file | — | +| GET | `/resources/list/{dataFolder?}` | Any authenticated | Lists files in a resource folder | — | +| POST | `/resources/clear/{dataFolder?}` | ApiAdmin | Clears a resource folder | — | +| POST | `/resources/get/{dataFolder?}` | Any authenticated | Downloads an encrypted resource (key derived from `email + password` only) | AZ-197 wire change (no `Hardware` field) | +| GET | `/resources/get-installer` | Any authenticated | Downloads latest production installer | — | +| GET | `/resources/get-installer/stage` | Any authenticated | Downloads latest staging installer | — | +| POST | `/classes` | ApiAdmin | Creates a detection class | AZ-513 | +| PATCH | `/classes/{id:int}` | ApiAdmin | Updates a detection class (partial-merge) | AZ-513 | +| DELETE | `/classes/{id:int}` | ApiAdmin | Deletes a detection class | AZ-513 | + +### Removed in cycle 1 + +The following endpoints were removed during cycle 1 and now return `404`: + +| Method | Path | Reason removed | 
+|--------|------|----------------| +| PUT | `/users/hardware/set` | AZ-197 — hardware-binding feature deleted (no fielded clients in target architecture) | +| POST | `/resources/check` | AZ-197 — was the hardware-binding side-effect probe; no remaining purpose | +| POST | `/get-update` | OTA delivery model retired post-cycle-1 (security audit F-1: endpoint disclosed plaintext per-resource encryption keys to any authenticated caller; the underlying installer-distribution flow is itself obsolete) | +| POST | `/resources/publish` | Same revert as `/get-update` — the publish counterpart of the OTA flow | ## Internal Logic @@ -31,10 +46,11 @@ Application entry point: configures DI, middleware, authentication, authorizatio - `IUserService` → `UserService` (Scoped) - `IAuthService` → `AuthService` (Scoped) - `IResourcesService` → `ResourcesService` (Scoped) +- `IDetectionClassService` → `DetectionClassService` (Scoped) — added by AZ-513 - `IDbFactory` → `DbFactory` (Singleton) - `ICache` → `MemoryCache` (Scoped) - `LazyCache` via `AddLazyCache()` -- FluentValidation validators auto-discovered from `RegisterUserValidator` assembly +- FluentValidation validators auto-discovered from `RegisterUserValidator` assembly (also picks up `CreateDetectionClassRequest`, `UpdateDetectionClassRequest` validators introduced in cycle 1) - `BusinessExceptionHandler` registered as exception handler ### Middleware Pipeline @@ -47,7 +63,8 @@ Application entry point: configures DI, middleware, authentication, authorizatio ### Authorization Policies - `apiAdminPolicy`: requires `RoleEnum.ApiAdmin` role -- `apiUploaderPolicy`: requires `RoleEnum.ResourceUploader` OR `RoleEnum.ApiAdmin` role + +> The `apiUploaderPolicy` (`RoleEnum.ResourceUploader` OR `ApiAdmin`) was added by AZ-183 and removed in the same cycle when the OTA endpoints it guarded were retired (see "Removed in cycle 1" above). 
`RoleEnum.ResourceUploader` itself remains as a data value (the seed `uploader@azaion.com` still uses it) but is no longer wired to any endpoint policy. ### Configuration Sections - `JwtConfig` — JWT signing/validation diff --git a/_docs/02_document/modules/common_business_exception.md b/_docs/02_document/modules/common_business_exception.md index 848dfbb..45afd00 100644 --- a/_docs/02_document/modules/common_business_exception.md +++ b/_docs/02_document/modules/common_business_exception.md @@ -18,14 +18,15 @@ Custom exception type for domain-level errors, paired with an `ExceptionEnum` ca | `NoEmailFound` | 10 | No such email found | | `EmailExists` | 20 | Email already exists | | `WrongPassword` | 30 | Passwords do not match | -| `PasswordLengthIncorrect` | 32 | Password should be at least 8 characters | +| `PasswordLengthIncorrect` | 32 | Password should be at least 12 characters (description text — actual validator threshold is 8 chars per `RegisterUserValidator`) | | `EmailLengthIncorrect` | 35 | Email is empty or invalid | | `WrongEmail` | 37 | (no description attribute) | -| `HardwareIdMismatch` | 40 | Hardware mismatch — unauthorized hardware | -| `BadHardware` | 45 | Hardware should be not empty | +| `UserDisabled` | 38 | User account is disabled | | `WrongResourceName` | 50 | Wrong resource file name | | `NoFileProvided` | 60 | No file provided | +> **Cycle 1 (2026-05-13) note** — `HardwareIdMismatch = 40` and `BadHardware = 45` were removed by AZ-197 (admin-side hardware-binding cleanup). Code 40 should NOT be reused for a different meaning — older clients may still surface "Hardware mismatch" UX strings keyed on the integer. `UserDisabled = 38` was added earlier (still part of the baseline). See `_docs/03_implementation/batch_06_report.md`. + ## Internal Logic Static constructor eagerly loads all `ExceptionEnum` descriptions into a dictionary via `EnumExtensions.GetDescriptions()`. 
Messages are retrieved by dictionary lookup with fallback to `ToString()`. @@ -34,8 +35,8 @@ Static constructor eagerly loads all `ExceptionEnum` descriptions into a diction ## Consumers - `BusinessExceptionHandler` — catches and serializes to HTTP 409 response -- `UserService` — throws for email/password/hardware validation failures -- `ResourcesService` — throws for missing file uploads +- `UserService` — throws for email/password validation failures (`NoEmailFound`, `WrongPassword`, `EmailExists`, `UserDisabled`) +- `ResourcesService` — throws `NoFileProvided` for missing file uploads - FluentValidation validators — reference `ExceptionEnum` codes in `.WithErrorCode()` ## Data Models diff --git a/_docs/02_document/modules/common_entities_detection_class.md b/_docs/02_document/modules/common_entities_detection_class.md new file mode 100644 index 0000000..2fe56e6 --- /dev/null +++ b/_docs/02_document/modules/common_entities_detection_class.md @@ -0,0 +1,44 @@ +# Module: Azaion.Common.Entities.DetectionClass + +## Purpose +Domain entity for a single detection class shown to operators in the Detection Classes admin table. Persisted to the `detection_classes` table; managed via the `/classes` admin endpoints introduced by AZ-513. + +> **Cycle 1 (2026-05-13) origin** — added by AZ-513 to back the new admin `/classes` CRUD endpoints; previously the read path was served by another service (likely `annotations/`) and admin/ had no own model for it. + +## Public Interface + +| Property | Type | Description | +|----------|------|-------------| +| `Id` | `int` | Auto-assigned identity (DB-generated via `InsertWithInt32IdentityAsync`) | +| `Name` | `string` | Full display name (max 120 chars per validator) | +| `ShortName` | `string` | Short label used in tight UI (max 20 chars) | +| `Color` | `string` | UI color (e.g. 
`"#FF0000"`, max 20 chars — accepts hex strings or named-color tokens) | +| `MaxSizeM` | `double` | Maximum real-world object size in meters (must be > 0) | +| `PhotoMode` | `string?` | Optional capture-mode hint (max 20 chars when present) | +| `CreatedAt` | `DateTime` | UTC creation timestamp set by the service on insert | + +## Internal Logic +Plain POCO; no behaviour. Identity is assigned by the database on insert (`InsertWithInt32IdentityAsync`). + +## Dependencies +None (no `using` directives on `Azaion.Services` / external libs). + +## Consumers +- `Azaion.Services.DetectionClassService` — CRUD operations +- `AzaionDb.DetectionClasses` — linq2db table mapping (see `common_database_azaion_db.md`) +- `Azaion.AdminApi.Program` — `POST/PATCH/DELETE /classes` endpoints + +## Data Models +Maps 1:1 to the `detection_classes` PostgreSQL table. + +## Configuration +None. + +## External Integrations +None directly; persisted via `IDbFactory` → PostgreSQL. + +## Security +Data is operator-controlled metadata; no PII or secrets. + +## Tests +- `e2e/Azaion.E2E/Tests/DetectionClassesTests.cs` — covers AZ-513 ACs 1–9 diff --git a/_docs/02_document/modules/common_requests_create_detection_class.md b/_docs/02_document/modules/common_requests_create_detection_class.md new file mode 100644 index 0000000..41d79e0 --- /dev/null +++ b/_docs/02_document/modules/common_requests_create_detection_class.md @@ -0,0 +1,51 @@ +# Module: Azaion.Common.Requests.CreateDetectionClassRequest + +## Purpose +Request DTO + FluentValidation validator for `POST /classes` (AZ-513). + +> **Cycle 1 (2026-05-13) origin** — added by AZ-513. 
+ +## Public Interface + +### CreateDetectionClassRequest +| Property | Type | Description | +|----------|------|-------------| +| `Name` | `string` | Full display name | +| `ShortName` | `string` | Short label | +| `Color` | `string` | UI color string (hex or named) | +| `MaxSizeM` | `double` | Max real-world size in meters | +| `PhotoMode` | `string?` | Optional capture-mode hint | + +### CreateDetectionClassValidator +| Rule | Constraint | +|------|-----------| +| `Name` | NotEmpty, ≤ 120 chars | +| `ShortName` | NotEmpty, ≤ 20 chars | +| `Color` | NotEmpty, ≤ 20 chars | +| `MaxSizeM` | > 0 | +| `PhotoMode` | ≤ 20 chars when present | + +## Internal Logic +Plain DTO; validator runs in the `/classes` POST handler before the service call. Validation failures are surfaced via `Results.ValidationProblem(...)` (HTTP 400). + +## Dependencies +- FluentValidation + +## Consumers +- `Azaion.AdminApi.Program` `POST /classes` +- `Azaion.Services.DetectionClassService.Create` + +## Data Models +Maps to the writable subset of `DetectionClass` (see `common_entities_detection_class.md`). + +## Configuration +None. + +## External Integrations +None. + +## Security +ApiAdmin-only endpoint; FluentValidation enforces field bounds. No HTML/JS sanitisation — the UI is responsible for safe rendering of `Name`, `ShortName`, `Color`. + +## Tests +- e2e: `AC1_Post_classes_creates_class_with_assigned_id`, `AC2_Post_classes_*` diff --git a/_docs/02_document/modules/common_requests_get_resource.md b/_docs/02_document/modules/common_requests_get_resource.md index 3b1cefa..765be3d 100644 --- a/_docs/02_document/modules/common_requests_get_resource.md +++ b/_docs/02_document/modules/common_requests_get_resource.md @@ -1,27 +1,22 @@ # Module: Azaion.Common.Requests.GetResourceRequest ## Purpose -Request DTOs and validator for resource access endpoints. Contains both `GetResourceRequest` and `CheckResourceRequest`. +Request DTO and validator for the `POST /resources/get/{dataFolder?}` endpoint. 
The user's password is supplied per-request so the server can derive the per-user AES encryption key for the response stream. + +> **Cycle 1 (2026-05-13) note** — the `Hardware` property and its `BadHardware` validator rule were removed by AZ-197 (admin-side hardware-binding cleanup). The wire-compat policy was "drop entirely" — any client still sending `Hardware` will not see it deserialized. The companion `CheckResourceRequest` was removed along with the `POST /resources/check` endpoint. See `_docs/03_implementation/batch_06_report.md`. ## Public Interface -### CheckResourceRequest -| Property | Type | Description | -|----------|------|-------------| -| `Hardware` | `string` | Hardware fingerprint to validate | - ### GetResourceRequest | Property | Type | Description | |----------|------|-------------| -| `Password` | `string` | User's password (used to derive encryption key) | -| `Hardware` | `string` | Hardware fingerprint for authorization | +| `Password` | `string` | User's password (used to derive the encryption key) | | `FileName` | `string` | Resource file to retrieve | ### GetResourceRequestValidator | Rule | Constraint | Error Code | |------|-----------|------------| | `Password` min length | >= 8 chars | `PasswordLengthIncorrect` | -| `Hardware` not empty | Required | `BadHardware` | | `FileName` not empty | Required | `WrongResourceName` | ## Internal Logic @@ -32,7 +27,7 @@ Validator uses `BusinessException.GetMessage()` to derive user-facing error mess - FluentValidation ## Consumers -- `Program.cs` `/resources/get/{dataFolder?}` and `/resources/check` endpoints +- `Program.cs` `POST /resources/get/{dataFolder?}` endpoint ## Data Models None. @@ -44,7 +39,8 @@ None. None. ## Security -Password is sent in the POST body (not URL) to avoid logging in access logs. Hardware fingerprint validates device authorization. +- Password is sent in the POST body (not URL) to avoid logging in access logs. 
+- Per-user encryption key derivation now uses `email + password` only (see `services_security.md`). ## Tests -None. +- `e2e/Azaion.E2E/Tests/ResourceTests.cs` (encrypted download / round-trip) — updated by AZ-197 to stop sending `Hardware` diff --git a/_docs/02_document/modules/common_requests_register_device_response.md b/_docs/02_document/modules/common_requests_register_device_response.md new file mode 100644 index 0000000..377302c --- /dev/null +++ b/_docs/02_document/modules/common_requests_register_device_response.md @@ -0,0 +1,41 @@ +# Module: Azaion.Common.Requests.RegisterDeviceResponse + +## Purpose +Response DTO returned by `POST /devices` (AZ-196) — provides the provisioning script with the freshly-generated `Serial`, `Email`, and one-shot plaintext `Password` for a new CompanionPC device user. + +> **Cycle 1 (2026-05-13) origin** — added by AZ-196. + +## Public Interface + +| Property | Type | Description | +|----------|------|-------------| +| `Serial` | `string` | Server-assigned device serial in the form `azj-NNNN` (zero-padded to 4 digits) | +| `Email` | `string` | `{Serial}@azaion.com` — the persisted user's login email | +| `Password` | `string` | Plaintext 32-char hex password — exposed exactly once at provisioning; never re-derivable from the SHA-384 hash that is persisted | + +## Internal Logic +Plain POCO. All field values are produced inside `UserService.RegisterDevice` (see `services_user_service.md`). + +## Dependencies +None. + +## Consumers +- `Azaion.AdminApi.Program` `POST /devices` (returned via `Results.Ok(...)` implicit) +- `Azaion.Services.UserService.RegisterDevice` (constructs and returns the response) +- Provisioning script (out-of-tree) — embeds the values into `device.conf` on the Jetson + +## Data Models +Mirrors a subset of fields written into the `users` row (`Email`, `PasswordHash`). + +## Configuration +None. + +## External Integrations +None. 
+ +## Security +- The `Password` is the only chance to capture the plaintext — once the response is consumed by the provisioning pipeline, the value cannot be recovered from the database (only the SHA-384 hash is persisted). +- The endpoint is gated by `apiAdminPolicy`. Treat the response as a credential — log carefully. + +## Tests +- e2e: `AC1_Post_devices_returns_serial_email_and_password`, `AC3_Returned_credentials_can_login` diff --git a/_docs/02_document/modules/common_requests_set_hw.md b/_docs/02_document/modules/common_requests_set_hw.md deleted file mode 100644 index 1cfa772..0000000 --- a/_docs/02_document/modules/common_requests_set_hw.md +++ /dev/null @@ -1,39 +0,0 @@ -# Module: Azaion.Common.Requests.SetHWRequest - -## Purpose -Request DTO and validator for setting a user's hardware fingerprint (`PUT /users/hardware/set`). - -## Public Interface - -### SetHWRequest -| Property | Type | Description | -|----------|------|-------------| -| `Email` | `string` | Target user's email | -| `Hardware` | `string?` | Hardware fingerprint (null clears it) | - -### SetHWRequestValidator -| Rule | Constraint | Error Code | -|------|-----------|------------| -| `Email` not empty | Required | `EmailLengthIncorrect` | - -## Dependencies -- `BusinessException`, `ExceptionEnum` -- FluentValidation - -## Consumers -- `Program.cs` `/users/hardware/set` endpoint - -## Data Models -None. - -## Configuration -None. - -## External Integrations -None. - -## Security -None. - -## Tests -None. diff --git a/_docs/02_document/modules/common_requests_update_detection_class.md b/_docs/02_document/modules/common_requests_update_detection_class.md new file mode 100644 index 0000000..27660cd --- /dev/null +++ b/_docs/02_document/modules/common_requests_update_detection_class.md @@ -0,0 +1,51 @@ +# Module: Azaion.Common.Requests.UpdateDetectionClassRequest + +## Purpose +Request DTO + FluentValidation validator for `PATCH /classes/{id}` (AZ-513). 
All fields are nullable so callers may send the complete body OR only the changed fields — the service applies partial-merge semantics. + +> **Cycle 1 (2026-05-13) origin** — added by AZ-513. + +## Public Interface + +### UpdateDetectionClassRequest +| Property | Type | Description | +|----------|------|-------------| +| `Name` | `string?` | If non-null, replace existing | +| `ShortName` | `string?` | If non-null, replace existing | +| `Color` | `string?` | If non-null, replace existing | +| `MaxSizeM` | `double?` | If non-null, replace existing | +| `PhotoMode` | `string?` | If non-null, replace existing | + +### UpdateDetectionClassValidator +| Rule | Constraint (only checked when field is non-null) | +|------|--------------------------------------------------| +| `Name` | NotEmpty, ≤ 120 chars | +| `ShortName` | NotEmpty, ≤ 20 chars | +| `Color` | NotEmpty, ≤ 20 chars | +| `MaxSizeM` | > 0 | +| `PhotoMode` | ≤ 20 chars | + +## Internal Logic +Each rule is gated by `.When(r => r.Field != null)` — fields the caller did not send pass validation untouched. The service then applies the same null-check pattern when writing back. + +## Dependencies +- FluentValidation + +## Consumers +- `Azaion.AdminApi.Program` `PATCH /classes/{id:int}` +- `Azaion.Services.DetectionClassService.Update` + +## Data Models +Optional / partial view over `DetectionClass`. + +## Configuration +None. + +## External Integrations +None. + +## Security +ApiAdmin-only endpoint. Per the AZ-513 spec, the UI sends the complete body on edit even though partial-merge is supported on the server — that keeps the implementer free to choose either policy without breaking the client. 
+ +## Tests +- e2e: `AC3_Patch_classes_full_body_updates_class`, `AC4_Patch_classes_partial_body_only_updates_specified_field`, `AC5_Patch_classes_unknown_id_returns_404`, `AC6_Patch_classes_without_jwt_returns_401` diff --git a/_docs/02_document/modules/services_detection_class_service.md b/_docs/02_document/modules/services_detection_class_service.md new file mode 100644 index 0000000..330b1c7 --- /dev/null +++ b/_docs/02_document/modules/services_detection_class_service.md @@ -0,0 +1,47 @@ +# Module: Azaion.Services.DetectionClassService + +## Purpose +CRUD service for `DetectionClass` rows backing the admin Detection Classes table. Wraps `IDbFactory.RunAdmin` calls and translates request DTOs into entity writes. + +> **Cycle 1 (2026-05-13) origin** — added by AZ-513. + +## Public Interface + +### IDetectionClassService +| Method | Signature | Description | +|--------|-----------|-------------| +| `Create` | `Task<DetectionClass> Create(CreateDetectionClassRequest request, CancellationToken ct)` | Inserts a new class; returns the entity with the DB-assigned `Id` | +| `Update` | `Task<DetectionClass?> Update(int id, UpdateDetectionClassRequest request, CancellationToken ct)` | Partial-merge update; returns `null` when the id doesn't exist | +| `Delete` | `Task<bool> Delete(int id, CancellationToken ct)` | Returns `true` when at least one row was deleted; `false` when the id wasn't present | + +## Internal Logic +- **Create**: instantiates `DetectionClass`, sets `CreatedAt = DateTime.UtcNow`, calls `db.InsertWithInt32IdentityAsync`, assigns the returned id back to the entity, returns it. +- **Update**: loads the row by id under the admin connection, returns `null` if missing. Otherwise applies a null-aware merge: each non-null property on the request overwrites the entity, then `db.UpdateAsync(existing)` persists the row. The route returns 404 when the service returns null. +- **Delete**: `db.DetectionClasses.DeleteAsync(x => x.Id == id, ct)`; returns `deleted > 0`.
The route returns 404 when the service returns false. + +All writes go through `IDbFactory.RunAdmin` (admin DB connection / role). + +## Dependencies +- `IDbFactory` (`Azaion.Common.Database.IDbFactory`) +- `DetectionClass` entity +- `CreateDetectionClassRequest`, `UpdateDetectionClassRequest` +- `LinqToDB` extension methods (`FirstOrDefaultAsync`, `InsertWithInt32IdentityAsync`, `UpdateAsync`, `DeleteAsync`) + +## Consumers +- `Azaion.AdminApi.Program` — `POST /classes`, `PATCH /classes/{id:int}`, `DELETE /classes/{id:int}` handlers + +## Data Models +Operates on `DetectionClass` via `AzaionDb.DetectionClasses`. + +## Configuration +None. + +## External Integrations +PostgreSQL via `IDbFactory.RunAdmin`. + +## Security +- All endpoints that delegate to this service require `apiAdminPolicy` at the route level. +- Validators run before the service (no extra defensive validation inside the service). + +## Tests +- `e2e/Azaion.E2E/Tests/DetectionClassesTests.cs` — covers AZ-513 ACs 1–9 diff --git a/_docs/02_document/modules/services_resources_service.md b/_docs/02_document/modules/services_resources_service.md index 27743f4..0d98fc9 100644 --- a/_docs/02_document/modules/services_resources_service.md +++ b/_docs/02_document/modules/services_resources_service.md @@ -41,9 +41,9 @@ Uses `ResourcesConfig` (ResourcesFolder, SuiteInstallerFolder, SuiteStageInstall Local filesystem for resource storage. ## Security -- Resources are encrypted per-user using a key derived from email + password + hardware hash -- File deletion overwrites existing files before writing new ones -- No path traversal protection on `dataFolder` parameter +- Resources are encrypted per-user using a key derived from `email + password` (the hardware-hash component was removed by AZ-197 — see `services_security.md`). +- File deletion overwrites existing files before writing new ones. +- No path traversal protection on `dataFolder` parameter. ## Tests -None. +None at the module level. 
End-to-end coverage lives in `e2e/Azaion.E2E/Tests/ResourceTests.cs` (encrypted download / round-trip / 200 MB upload limit) — updated by AZ-197 to stop sending the `Hardware` field. diff --git a/_docs/02_document/modules/services_security.md b/_docs/02_document/modules/services_security.md index 2fed521..01c1ab8 100644 --- a/_docs/02_document/modules/services_security.md +++ b/_docs/02_document/modules/services_security.md @@ -1,22 +1,22 @@ # Module: Azaion.Services.Security ## Purpose -Static utility class providing cryptographic operations: password hashing, hardware fingerprint hashing, encryption key derivation, and AES-CBC stream encryption/decryption. +Static utility class providing cryptographic operations: password hashing, encryption key derivation, and AES-CBC stream encryption/decryption. + +> **Cycle 1 (2026-05-13) note** — `GetHWHash` was deleted and `GetApiEncryptionKey` was simplified from `(email, password, hardwareHash)` to `(email, password)` by AZ-197 (admin-side hardware-binding cleanup). The hardware-hash component of the derived key is gone; existing ciphertexts produced under the old derivation are no longer re-derivable from the new signature. See `_docs/03_implementation/batch_06_report.md`. ## Public Interface | Method | Signature | Description | |--------|-----------|-------------| | `ToHash` | `static string ToHash(this string str)` | Extension: SHA-384 hash of input, returned as Base64 | -| `GetHWHash` | `static string GetHWHash(string hardware)` | Derives a salted hash from hardware fingerprint string | -| `GetApiEncryptionKey` | `static string GetApiEncryptionKey(string email, string password, string? 
hardwareHash)` | Derives an AES encryption key from email + password + hardware hash | +| `GetApiEncryptionKey` | `static string GetApiEncryptionKey(string email, string password)` | Derives the per-user AES encryption key string from email + password (+ static salt) | | `EncryptTo` | `static async Task EncryptTo(this Stream inputStream, Stream toStream, string key, CancellationToken ct)` | AES-256-CBC encrypts a stream; prepends IV to output | | `DecryptTo` | `static async Task DecryptTo(this Stream encryptedStream, Stream toStream, string key, CancellationToken ct)` | Reads IV prefix, then AES-256-CBC decrypts stream | ## Internal Logic - **Password hashing**: `ToHash` uses SHA-384 with UTF-8 encoding, outputting Base64. -- **Hardware hashing**: `GetHWHash` salts the raw hardware string with `"Azaion_{hardware}_%$$$)0_"` before hashing. -- **Encryption key derivation**: `GetApiEncryptionKey` concatenates email, password, and hardware hash with a static salt, then hashes. +- **Encryption key derivation**: `GetApiEncryptionKey` concatenates email and password with the static salt `"-#%@AzaionKey@%#---"`, then hashes via `ToHash` (SHA-384, Base64). - **Encryption**: AES-256-CBC with PKCS7 padding. Key is SHA-256 of the derived key string. IV is randomly generated and prepended to the output stream. Uses 512 KB buffer for streaming. - **Decryption**: Reads the first 16 bytes as IV, then AES-256-CBC decrypts with PKCS7 padding. 
@@ -25,10 +25,9 @@ Static utility class providing cryptographic operations: password hashing, hardw - `System.Text.Encoding` ## Consumers -- `UserService.CheckHardwareHash` — calls `GetHWHash` to verify hardware fingerprint -- `Program.cs` `/resources/get` endpoint — calls `GetApiEncryptionKey` +- `Program.cs` `/resources/get/{dataFolder}` endpoint — calls `GetApiEncryptionKey(user.Email, request.Password)` - `ResourcesService.GetEncryptedResource` — uses `EncryptTo` extension -- `SecurityTest` — directly tests `GetApiEncryptionKey`, `EncryptTo`, `DecryptTo` +- `Azaion.Test/SecurityTest` — directly tests `EncryptTo` / `DecryptTo` round-trips (no longer tests hardware-hash derivation) ## Data Models None. @@ -41,11 +40,11 @@ None. ## Security Core cryptographic module. Key observations: -- Passwords are hashed with SHA-384 (no per-user salt, no key stretching — not bcrypt/scrypt/argon2) -- Hardware hash uses a static salt -- AES encryption uses SHA-256 of the derived key, with random IV per encryption -- All salts/prefixes are hardcoded constants +- Passwords are hashed with SHA-384 (no per-user salt, no key stretching — not bcrypt/scrypt/argon2). This is unchanged by AZ-197. +- AES encryption uses SHA-256 of the derived key, with random IV per encryption. +- All salts/prefixes are hardcoded constants. +- Per AZ-197: device hardware fingerprints no longer participate in key derivation. The threat that hardware binding mitigated (credential reuse via desktop installers) was eliminated by the architectural shift to fTPM-secured Jetsons + browser-only SaaS access. 
## Tests -- `SecurityTest.EncryptDecryptTest` — round-trip encrypt/decrypt of a string -- `SecurityTest.EncryptDecryptLargeFileTest` — round-trip encrypt/decrypt of a ~400 MB generated file +- `Azaion.Test/SecurityTest.EncryptDecryptTest` — round-trip encrypt/decrypt of a string +- `Azaion.Test/SecurityTest.EncryptDecryptLargeFileTest` — round-trip encrypt/decrypt of a ~400 MB generated file diff --git a/_docs/02_document/modules/services_user_service.md b/_docs/02_document/modules/services_user_service.md index 921feab..aa8b3d5 100644 --- a/_docs/02_document/modules/services_user_service.md +++ b/_docs/02_document/modules/services_user_service.md @@ -1,7 +1,9 @@ # Module: Azaion.Services.UserService ## Purpose -Core business logic for user management: registration, authentication, hardware binding, role management, and account lifecycle. +Core business logic for user management: registration (web users + provisioned devices), authentication, role management, and account lifecycle. + +> **Cycle 1 (2026-05-13) note** — hardware-binding methods (`UpdateHardware`, `CheckHardwareHash`, private `UpdateLastLoginDate`) and the bound `IUserService` declarations were removed by AZ-197 (admin-side hardware-binding cleanup). Device auto-provisioning (`RegisterDevice`) was added by AZ-196. **Post-cycle-1 (security audit F-3)**: `RegisterDevice` was refactored to delegate the row insert to `RegisterUser`, and `RegisterUser` itself now relies on the new `users_email_uidx` UNIQUE INDEX (`env/db/06_users_email_unique.sql`) — the check-then-insert race is gone; `Npgsql.PostgresException(SqlState=23505)` is translated to `BusinessException(EmailExists)`. See `_docs/03_implementation/batch_05_report.md` and `batch_06_report.md`. 
## Public Interface @@ -9,43 +11,45 @@ Core business logic for user management: registration, authentication, hardware | Method | Signature | Description | |--------|-----------|-------------| | `RegisterUser` | `Task RegisterUser(RegisterUserRequest request, CancellationToken ct)` | Creates a new user with hashed password | -| `ValidateUser` | `Task ValidateUser(LoginRequest request, CancellationToken ct)` | Validates email + password, returns user | +| `RegisterDevice` | `Task RegisterDevice(CancellationToken ct)` | Creates a new `CompanionPC` user with auto-assigned `azj-NNNN` serial / email and a 32-char hex password (returned plaintext exactly once) | +| `ValidateUser` | `Task ValidateUser(LoginRequest request, CancellationToken ct)` | Validates email + password, returns user. Throws `NoEmailFound`, `WrongPassword`, or `UserDisabled` | | `GetByEmail` | `Task GetByEmail(string? email, CancellationToken ct)` | Cached user lookup by email | -| `UpdateHardware` | `Task UpdateHardware(string email, string? hardware, CancellationToken ct)` | Sets/clears user's hardware fingerprint | | `UpdateQueueOffsets` | `Task UpdateQueueOffsets(string email, UserQueueOffsets offsets, CancellationToken ct)` | Updates user's annotation queue offsets | | `GetUsers` | `Task> GetUsers(string? searchEmail, RoleEnum? 
searchRole, CancellationToken ct)` | Lists users with optional email/role filters | -| `CheckHardwareHash` | `Task CheckHardwareHash(User user, string hardware, CancellationToken ct)` | Validates or initializes hardware binding | | `ChangeRole` | `Task ChangeRole(string email, RoleEnum newRole, CancellationToken ct)` | Changes a user's role | | `SetEnableStatus` | `Task SetEnableStatus(string email, bool isEnabled, CancellationToken ct)` | Enables or disables a user account | | `RemoveUser` | `Task RemoveUser(string email, CancellationToken ct)` | Permanently deletes a user | ## Internal Logic -- **RegisterUser**: checks for duplicate email, hashes password via `Security.ToHash`, inserts via `RunAdmin`. -- **ValidateUser**: finds user by email, compares password hash. Throws `NoEmailFound` or `WrongPassword`. +- **RegisterUser**: hashes password via `Security.ToHash`, inserts via `RunAdmin`. Catches `Npgsql.PostgresException` with `SqlState == PostgresErrorCodes.UniqueViolation` (23505) on the `users_email_uidx` UNIQUE INDEX and rethrows as `BusinessException(EmailExists)`. The previous check-then-insert pattern was removed (race-prone before the index existed; redundant after). +- **RegisterDevice**: calls private `NextDeviceIdentity` (read-only) to compute the next `azj-NNNN` serial + matching email, generates a 32-char hex password from `RandomNumberGenerator.GetBytes(16)`, then delegates the row insert to `RegisterUser` (so any future change to user-creation policy applies here too). Returns `{Serial, Email, Password}` (plaintext password exposed exactly once at provisioning time). On a serial-allocation race, the second caller's insert hits the UNIQUE INDEX and surfaces `BusinessException(EmailExists)`; the caller can retry. 
+- **NextDeviceIdentity** (private): queries the most recent `RoleEnum.CompanionPC` user via `dbFactory.Run` (read connection), parses the numeric `NNNN` suffix of the `azj-NNNN` serial (chars `[SerialNumberStart, SerialNumberStart + SerialNumberLength)` of the email; both are constants on the class), increments by 1, returns `(serial, email)`. +- **ValidateUser**: finds user by email, compares password hash. Throws `NoEmailFound`, `WrongPassword`, or `UserDisabled`. - **GetByEmail**: uses `ICache.GetFromCacheAsync` with key `User.{email}`. -- **CheckHardwareHash**: on first access (null hardware), stores the raw hardware string and returns the hash. On subsequent access, compares hashes. Throws `HardwareIdMismatch` on mismatch. Also updates `LastLogin` timestamp. -- **UpdateHardware/UpdateQueueOffsets**: use `RunAdmin` for writes, then invalidate cache. +- **UpdateQueueOffsets**: writes via `RunAdmin`, then invalidates the user cache. - **GetUsers**: uses `WhereIf` for optional filter predicates. -Private method: -- `UpdateLastLoginDate` — updates `LastLogin` to `DateTime.UtcNow`. +Private constants (device provisioning): +- `DeviceEmailPrefix = "azj-"`, `DeviceEmailDomain = "@azaion.com"`, `SerialNumberStart = 4`, `SerialNumberLength = 4`, `DevicePasswordBytes = 16`.
## Dependencies - `IDbFactory` (database access) - `ICache` (user caching) -- `Security` (hashing) +- `Security` (hashing — `ToHash`) +- `System.Security.Cryptography.RandomNumberGenerator` (device password entropy) +- `Npgsql` (`PostgresException`, `PostgresErrorCodes.UniqueViolation` — used to translate UNIQUE-INDEX violations to `BusinessException(EmailExists)`) - `BusinessException` (domain errors) - `QueryableExtensions.WhereIf` - `User`, `UserConfig`, `UserQueueOffsets`, `RoleEnum` -- `RegisterUserRequest`, `LoginRequest` +- `RegisterUserRequest`, `LoginRequest`, `RegisterDeviceResponse` ## Consumers -- `Program.cs` — all `/users/*` endpoints delegate to `IUserService` +- `Program.cs` — `/users/*` endpoints delegate to `IUserService` +- `Program.cs` — `POST /devices` calls `RegisterDevice` (added by AZ-196) - `AuthService.GetCurrentUser` — calls `GetByEmail` -- `Program.cs` `/resources/get` — calls `CheckHardwareHash` ## Data Models -Operates on `User` entity via `AzaionDb.Users` table. +Operates on `User` entity via `AzaionDb.Users` table. The `User.Hardware` column is left in place (nullable, unused) per AZ-197 — see the entity doc. ## Configuration None. @@ -54,9 +58,10 @@ None. PostgreSQL via `IDbFactory`. ## Security -- Passwords hashed with SHA-384 (via `Security.ToHash`) before storage -- Hardware binding prevents resource access from unauthorized devices -- Read operations use read-only DB connection; writes use admin connection +- Passwords hashed with SHA-384 (via `Security.ToHash`) before storage. +- Device passwords are returned plaintext to the caller exactly once at provisioning; the persisted form is the SHA-384 hash. The plaintext is never re-derivable. +- Read operations use the read-only DB connection; writes use the admin connection. 
## Tests -- `UserServiceTest.CheckHardwareHashTest` — integration test against live database +- `Azaion.Test/UserServiceTest.cs` — unit/integration tests against the live test database (hardware-binding tests removed by AZ-197) +- `e2e/Azaion.E2E/Tests/DeviceTests.cs` — e2e for AZ-196 device-provisioning ACs diff --git a/_docs/02_document/ripple_log_cycle1.md b/_docs/02_document/ripple_log_cycle1.md new file mode 100644 index 0000000..58b3086 --- /dev/null +++ b/_docs/02_document/ripple_log_cycle1.md @@ -0,0 +1,139 @@ +# Ripple Log — Cycle 1 (2026-05-13) + +Documentation refresh triggered by the cycle 1 task set: AZ-513, AZ-196, AZ-183, AZ-197. + +> **Post-cycle-1 update (same day, 2026-05-13)** — after the security audit (autodev Step 14) AZ-183 (OTA update check & publish) was reverted in full and an F-3 hardening pass was applied to `RegisterUser`/`RegisterDevice`. See "Post-cycle-1 revert (security audit follow-up)" at the bottom of this log for the doc deltas. + +This log records every doc that was refreshed (directly or via the import-graph ripple from another changed file) during autodev Step 13 (Update Docs) — `document` skill in **Task mode**. + +## Method + +Per `.cursor/skills/document/workflows/task.md` Step 0.5, for each changed source file the consuming files were located via `using` references inside `Azaion.AdminApi/`, `Azaion.Services/`, `Azaion.Common/`, `Azaion.Test/`, and `e2e/Azaion.E2E/`. Each consumer that lives in an already-documented module triggered a doc refresh. + +For C#, the import surface walked was `using Azaion.{Common,Services}*;` plus `ProjectReference` declarations in the five csprojs (`Azaion.AdminApi`, `Azaion.Services`, `Azaion.Common`, `Azaion.Test`, `e2e/Azaion.E2E`).
+ +## Direct refreshes (changed source file → existing module doc) + +| Module Doc | Trigger | +|------------|---------| +| `modules/services_user_service.md` | `Azaion.Services/UserService.cs` — AZ-196 added `RegisterDevice`; AZ-197 removed `UpdateHardware`, `CheckHardwareHash`, `UpdateLastLoginDate`. | +| `modules/services_security.md` | `Azaion.Services/Security.cs` — AZ-197 removed `GetHWHash`; `GetApiEncryptionKey` signature simplified. | +| `modules/services_resources_service.md` | `Azaion.Services/ResourcesService.cs` — caller-side hardware path gone (security note rewrite). | +| `modules/common_requests_get_resource.md` | `Azaion.Common/Requests/GetResourceRequest.cs` — AZ-197 removed `Hardware` field; `CheckResourceRequest` removed. | +| `modules/common_business_exception.md` | `Azaion.Common/BusinessException.cs` — AZ-197 removed `HardwareIdMismatch` (40) and `BadHardware` (45). | +| `modules/admin_api_program.md` | `Azaion.AdminApi/Program.cs` — AZ-513 added `/classes` CRUD; AZ-196 added `/devices`; AZ-183 added `/get-update` + `/resources/publish`; AZ-197 removed `/users/hardware/set` and `/resources/check`. | +| `modules/common_requests_set_hw.md` | **Deleted** — `Azaion.Common/Requests/SetHWRequest.cs` no longer exists. 
| + +## New module docs (added cycle 1) + +| Module Doc | New Source File | +|------------|-----------------| +| `modules/common_entities_detection_class.md` | `Azaion.Common/Entities/DetectionClass.cs` (AZ-513) | +| `modules/common_entities_resource.md` | `Azaion.Common/Entities/Resource.cs` (AZ-183) | +| `modules/common_requests_create_detection_class.md` | `Azaion.Common/Requests/CreateDetectionClassRequest.cs` (AZ-513) | +| `modules/common_requests_update_detection_class.md` | `Azaion.Common/Requests/UpdateDetectionClassRequest.cs` (AZ-513) | +| `modules/services_detection_class_service.md` | `Azaion.Services/DetectionClassService.cs` (AZ-513) | +| `modules/services_resource_update_service.md` | `Azaion.Services/ResourceUpdateService.cs` (AZ-183) | +| `modules/common_requests_get_update.md` | `Azaion.Common/Requests/GetUpdateRequest.cs` (AZ-183 — also defines `ResourceUpdateItem`) | +| `modules/common_requests_publish_resource.md` | `Azaion.Common/Requests/PublishResourceRequest.cs` (AZ-183) | +| `modules/common_requests_register_device_response.md` | `Azaion.Common/Requests/RegisterDeviceResponse.cs` (AZ-196) | + +## Component-level refreshes (parents of refreshed modules) + +| Component Doc | Reason | +|---------------|--------| +| `components/01_data_layer/description.md` | New entities (`DetectionClass`, `Resource`); new cache key `Resources.Latest.{arch}.{stage}`; storage estimates updated; `User.Hardware` marked tombstoned. | +| `components/02_user_management/description.md` | `RegisterDevice` added to interface table; `CheckHardwareHash` / `UpdateHardware` removed from interface table; `SetHWRequest` removed; cache invalidation table simplified. | +| `components/03_auth_and_security/description.md` | `Security.GetApiEncryptionKey` signature simplified; `GetHWHash` removed. 
| +| `components/04_resource_management/description.md` | `IResourceUpdateService` added (AZ-183) with separate DB + cache + at-rest column encryption; `GetResourceRequest` no longer carries `Hardware`; `CheckResourceRequest` removed. | +| `components/05_admin_api/description.md` | New endpoints (POST `/classes`, PATCH `/classes/{id}`, DELETE `/classes/{id}`, POST `/devices`, POST `/get-update`, POST `/resources/publish`); removed endpoints (PUT `/users/hardware/set`, POST `/resources/check`); `apiUploaderPolicy` is now in use. | + +## System-level refreshes + +| System Doc | Reason | +|------------|--------| +| `system-flows.md` | F4 (Hardware Check) marked REMOVED; F3 sequence diagram regenerated without hardware step; F8 (Detection Classes CRUD), F9 (Device Auto-Provisioning), F10 (OTA Update Check & Publish) added with full sequence diagrams + error tables. | +| `architecture.md` | Data Model Overview lists the new `DetectionClass` and `Resource` entities; the `User` entity caption notes the CompanionPC subset auto-provisioned via AZ-196; ExceptionEnum caption notes HW-related codes are gone. The `Note (AZ-197)` block at the top was already in place pre-Step-13. | +| `module-layout.md` | Conceptual Sub-Components table updated: cycle-1-added files annotated; `SetHWRequest` removal noted; new sub-component `4b Detection Classes` added. | +| `diagrams/flows/flow_hardware_check.md` | Already converted to a tombstone during AZ-197 implementation; no further action this cycle. | + +## Tooling notes + +- C# import resolution was performed by `Grep` on `using Azaion.*` patterns plus by reading the `.csproj` `ProjectReference` set, since the workspace has no `madge`/`depcruise`-equivalent statically available. Any consumer in `Azaion.AdminApi/Program.cs` was treated as a "system entry point" consumer (Program.cs is the composition root + endpoint table — a single file that legitimately consumes everything). 
+- Tests under `Azaion.Test/` and `e2e/Azaion.E2E/` were considered downstream consumers of `Azaion.Services` and `Azaion.Common`. Their files were NOT promoted into the doc tree (per `module-layout.md` Layout Rules — tests are not public API surface), but their AC coverage was reflected in module-doc "Tests" sections and in `tests/blackbox-tests.md` / `tests/traceability-matrix.md` (autodev Step 12). + +## No-op observations + +- Other module docs in `_docs/02_document/modules/` (e.g., `common_entities_user.md`, `common_database_*.md`, `common_extensions_*.md`, `services_auth_service.md`, `services_cache.md`, `admin_api_business_exception_handler.md`, `common_requests_login_request.md`, `common_requests_register_user.md`, `common_requests_set_queue_offsets.md`, `common_configs_*.md`) were inspected and found to be unaffected by cycle 1 changes — no refresh needed. +- `_docs/00_problem/acceptance_criteria.md` and `_docs/00_problem/restrictions.md` were intentionally NOT modified — Task-mode Step 4 only updates problem-level docs when the task changed input parameters or the AC catalogue. Cycle 1 added new behaviours but the baseline AC numbering (AC-1..AC-28) is preserved per `cycle-update` rules; new AC sets live under their tracker IDs in `tests/traceability-matrix.md`. + +--- + +## Post-cycle-1 revert (security audit follow-up, 2026-05-13) + +After autodev Step 14 (Security Audit) finished with verdict **FAIL** (3 open Highs: F-1, F-2, F-3), the user instructed: + +> "fix findings right now F-1 get-update is again leftover from the shipping resources era, when we delivered software as an installer. We don't need now IResourceUpdateService. F-3 (AMPLIFIED, AZ-196) — duplicate-email race now reachable on /devices because users.email has no UNIQUE index. 
first of all, reuse the code in the implementation RegisterDevice -> should call RegisterUser then add index to email" + +### Code changes + +| File | Action | Reason | +|------|--------|--------| +| `Azaion.Services/ResourceUpdateService.cs` | Deleted | F-1 — entire OTA feature reverted | +| `Azaion.Common/Requests/GetUpdateRequest.cs` | Deleted | F-1 — request DTO unused after endpoint deletion | +| `Azaion.Common/Requests/PublishResourceRequest.cs` | Deleted | F-1 — request DTO unused after endpoint deletion | +| `Azaion.Common/Entities/Resource.cs` | Deleted | F-1 — entity unused after service deletion | +| `env/db/05_resources.sql` | Deleted | F-1 — `resources` table no longer needed | +| `e2e/Azaion.E2E/Tests/ResourceUpdateTests.cs` | Deleted | F-1 — covers deleted endpoints | +| `Azaion.AdminApi/Program.cs` | Edited | F-1 — removed `/get-update`, `/resources/publish`, `IResourceUpdateService` DI registration, `apiUploaderPolicy` | +| `Azaion.Common/Database/AzaionDb.cs` | Edited | F-1 — removed the `ITable<Resource>` table property | +| `Azaion.Common/Database/AzaionDbShemaHolder.cs` | Edited | F-1 — removed `Resource` entity mapping | +| `Azaion.Common/Configs/ResourcesConfig.cs` | Edited | F-1 — removed `EncryptionMasterKey` field (also closes F-5) | +| `Azaion.AdminApi/appsettings.json` | Edited | F-1 — removed `EncryptionMasterKey` config value | +| `docker-compose.test.yml` | Edited | F-1 — removed `ResourcesConfig__EncryptionMasterKey` env var | +| `env/db/06_users_email_unique.sql` | **Created** | F-3 — `CREATE UNIQUE INDEX users_email_uidx ON public.users (email);` | +| `e2e/db-init/00_run_all.sh` | Edited | F-1 / F-3 — dropped the `05_resources.sql` line (F-1); added the `06_users_email_unique.sql` line (F-3) | +| `Azaion.Services/UserService.cs` | Edited | F-3 — `RegisterUser` drops check-then-insert, catches `Npgsql.PostgresException(SqlState=23505)` → `EmailExists`; `RegisterDevice` now delegates the row insert to `RegisterUser` (per user direction) | + +### Doc deltas + +| Doc | What changed | 
+|-----|--------------| +| `system-flows.md` | F10 row in flow inventory marked REMOVED; F9 dependency note updated; full F10 section replaced with a tombstone explaining the revert | +| `architecture.md` | `Resource` entity removed from data model table; `User` row notes UNIQUE INDEX on email and the `RegisterDevice` → `RegisterUser` consolidation | +| `module-layout.md` | `4 Resource Management` row updated to drop OTA files; `2 User Management` row notes the F-3 consolidation | +| `components/01_data_layer/description.md` | `Resource` entity removed; UNIQUE INDEX on email noted; `Resources.Latest.*` cache key removed; storage-estimates row removed; Newtonsoft.Json version bumped to 13.0.4 | +| `components/02_user_management/description.md` | `RegisterUser` and `RegisterDevice` rows updated to reflect the F-3 fix | +| `components/04_resource_management/description.md` | Rewritten — collapsed back to filesystem-storage scope; OTA references removed; F-2 callout retained as known limitation | +| `components/05_admin_api/description.md` | `/get-update`, `/resources/publish`, `apiUploaderPolicy` removed from endpoint and policy tables | +| `modules/admin_api_program.md` | Endpoint table no longer lists OTA endpoints; "Removed in cycle 1" section absorbs them; DI list and policies updated | +| `modules/services_user_service.md` | F-3 fix detailed in Internal Logic; Npgsql added to Dependencies | +| `modules/services_resource_update_service.md` | **Deleted** | +| `modules/common_entities_resource.md` | **Deleted** | +| `modules/common_requests_get_update.md` | **Deleted** | +| `modules/common_requests_publish_resource.md` | **Deleted** | +| `tests/traceability-matrix.md` | AZ-183 section marked REVERTED; FT-P-21..23 strikethroughs | +| `tests/blackbox-tests.md` | OTA section collapsed to ID-placeholder table; bodies removed | +| `_docs/05_security/security_report.md` | Verdict flipped from FAIL → PASS_WITH_WARNINGS; F-1, F-3, D-1 marked CLOSED; F-2 deferred | +| 
`_docs/05_security/static_analysis.md` | F-1, F-3, F-5 marked CLOSED with resolution notes | +| `_docs/05_security/owasp_review.md` | A01 / A02 / A04 / A07 categories upgraded to PASS_WITH_WARNINGS or PASS where the only failing finding was a now-closed cycle-1 entry | +| `_docs/05_security/dependency_scan.md` | (already updated during the audit) D-1 marked RESOLVED | + +### Verification + +- `dotnet build Azaion.AdminApi/Azaion.AdminApi.csproj` — green, 0 warnings. +- `dotnet test Azaion.Test/Azaion.Test.csproj` — 2/2 passed. +- `./scripts/run-tests.sh` (e2e) — 44/44 passed (down from 48/48; the 4 deleted `ResourceUpdateTests` are accounted for). + +### Follow-up tickets filed in Jira + +| Ticket | Title | Points | +|--------|-------|--------| +| [AZ-516](https://denyspopov.atlassian.net/browse/AZ-516) | F-2: Sanitize `dataFolder` route segment to prevent path traversal | 3 | +| [AZ-517](https://denyspopov.atlassian.net/browse/AZ-517) | F-4: Harden `/devices` response (Cache-Control, runbook) | 2 | +| [AZ-518](https://denyspopov.atlassian.net/browse/AZ-518) | F-6: Run admin API container as non-root | 2 | +| [AZ-519](https://denyspopov.atlassian.net/browse/AZ-519) | F-7: Migrate password hashing to Argon2id with per-user salt | 5 | +| [AZ-520](https://denyspopov.atlassian.net/browse/AZ-520) | F-8: Add rate limiting to `/login` endpoint | 2 | +| [AZ-521](https://denyspopov.atlassian.net/browse/AZ-521) | Low-severity security hygiene bundle (F-9, F-11, F-12, F-13) | 3 | + +A revert comment was added to AZ-183 (the OTA task that was deleted as part of the F-1 fix). diff --git a/_docs/02_document/system-flows.md b/_docs/02_document/system-flows.md index 0a5025f..f39b016 100644 --- a/_docs/02_document/system-flows.md +++ b/_docs/02_document/system-flows.md @@ -1,5 +1,7 @@ # Azaion Admin API — System Flows +> **Cycle 1 (2026-05-13) note** — F4 (Hardware Check) was deleted by AZ-197; F3 no longer depends on hardware. 
Two new flows were added: F8 Detection Classes CRUD (AZ-513), F9 Device Auto-Provisioning (AZ-196). F10 OTA Update Check & Publish (AZ-183) was reverted later the same day after the security audit (finding F-1) — the OTA delivery model itself was deemed obsolete; see `_docs/05_security/security_report.md` for context. F3's narrative was updated to drop the hardware-check step. + ## Flow Inventory | # | Flow Name | Trigger | Primary Components | Criticality | @@ -7,22 +9,26 @@ | F1 | User Login | POST /login | Admin API, User Mgmt, Auth & Security | High | | F2 | User Registration | POST /users | Admin API, User Mgmt | High | | F3 | Encrypted Resource Download | POST /resources/get | Admin API, Auth, User Mgmt, Resource Mgmt | High | -| F4 | Hardware Check | POST /resources/check | Admin API, Auth, User Mgmt | High | +| ~~F4~~ | ~~Hardware Check~~ | ~~POST /resources/check~~ | — | **REMOVED — AZ-197** | | F5 | Resource Upload | POST /resources | Admin API, Resource Mgmt | Medium | | F6 | Installer Download | GET /resources/get-installer | Admin API, Auth, Resource Mgmt | Medium | | F7 | User Management (CRUD) | Various /users/* | Admin API, User Mgmt | Medium | +| F8 | Detection Classes CRUD *(AZ-513)* | POST/PATCH/DELETE /classes | Admin API, DetectionClassService | High | +| F9 | Device Auto-Provisioning *(AZ-196)* | POST /devices | Admin API, User Mgmt | High | +| ~~F10~~ | ~~OTA Update Check & Publish~~ | ~~POST /get-update + POST /resources/publish~~ | — | **REMOVED — post-cycle-1 (AZ-183 reverted, see security audit F-1)** | ## Flow Dependencies | Flow | Depends On | Shares Data With | |------|-----------|-----------------| | F1 | — | All other flows (produces JWT token) | -| F2 | — | F1, F3, F4 (creates user records) | -| F3 | F1 (requires JWT), F4 (hardware must be bound) | F4 (via hardware hash) | -| F4 | F1 (requires JWT) | F3 (hardware binding) | +| F2 | — | F1, F9 (creates user records — including device users via F9) | +| F3 | F1 (requires JWT) | — 
(post-AZ-197: no hardware-binding dependency) | | F5 | F1 (requires JWT) | F3 (uploaded resources are later downloaded) | | F6 | F1 (requires JWT) | — | -| F7 | F1 (requires JWT, ApiAdmin role) | F3, F4 (user data) | +| F7 | F1 (requires JWT, ApiAdmin role) | F3 (user data) | +| F8 | F1 (requires JWT, ApiAdmin role) | UI Detection Classes table | +| F9 | F1 (requires JWT, ApiAdmin role) | F2 (writes a user row, but reuses `RegisterUser` end-to-end), F1 (provisioned devices later log in) | --- @@ -108,12 +114,13 @@ sequenceDiagram ## Flow F3: Encrypted Resource Download +> **Updated by AZ-197 (2026-05-13)** — the hardware-binding precondition and the `CheckHardwareHash` / `GetHWHash` steps were removed; the encryption key is now derived from `email + password` only. The diagram below reflects the post-cycle-1 path. + ### Description -An authenticated user requests a resource file. The system validates hardware binding, derives a per-user encryption key, encrypts the file with AES-256-CBC, and streams the encrypted content. +An authenticated user requests a resource file. The system derives a per-user encryption key from email + password, encrypts the file with AES-256-CBC, and streams the encrypted content. 
### Preconditions - User is authenticated (JWT) -- User's hardware is bound (via prior F4 call) - Resource file exists on server ### Sequence Diagram @@ -123,20 +130,15 @@ sequenceDiagram participant Client participant API as Admin API participant Auth as AuthService - participant US as UserService participant Sec as Security participant RS as ResourcesService participant FS as Filesystem - Client->>API: POST /resources/get {password, hardware, fileName} + Client->>API: POST /resources/get {password, fileName} API->>Auth: GetCurrentUser() Auth-->>API: User - API->>US: CheckHardwareHash(user, hardware) - US->>Sec: GetHWHash(hardware) - Sec-->>US: hash - US-->>API: hwHash - API->>Sec: GetApiEncryptionKey(email, password, hwHash) - Sec-->>API: AES key + API->>Sec: GetApiEncryptionKey(email, password) + Sec-->>API: AES key string API->>RS: GetEncryptedResource(folder, fileName, key) RS->>FS: Read file FS-->>RS: FileStream @@ -151,48 +153,15 @@ sequenceDiagram | Error | Where | Detection | Recovery | |-------|-------|-----------|----------| | Not authenticated | API | No/invalid JWT | 401 Unauthorized | -| Hardware mismatch | UserService.CheckHardwareHash | Hash comparison fails | 409: HardwareIdMismatch (code 40) | | File not found | ResourcesService | FileStream throws | 500 Internal Server Error | --- -## Flow F4: Hardware Check (First Login / Validation) +## Flow F4: Hardware Check (REMOVED by AZ-197) -### Description -Client submits its hardware fingerprint. On first call, the hardware is stored for the user. On subsequent calls, the stored hash is compared against the provided hardware. +The hardware-fingerprint binding flow (`POST /resources/check`, `UserService.CheckHardwareHash`, `Security.GetHWHash`, error code 40 `HardwareIdMismatch`, error code 45 `BadHardware`) was removed entirely in cycle 1. 
-### Preconditions -- User is authenticated (JWT) - -### Sequence Diagram - -```mermaid -sequenceDiagram - participant Client - participant API as Admin API - participant Auth as AuthService - participant US as UserService - participant DB as PostgreSQL - - Client->>API: POST /resources/check {hardware} - API->>Auth: GetCurrentUser() - Auth-->>API: User - API->>US: CheckHardwareHash(user, hardware) - alt First time (no stored hardware) - US->>DB: UPDATE user SET hardware = ? (admin conn) - US->>DB: UPDATE user SET last_login = now() - US-->>API: hwHash - else Hardware already bound - US->>US: Compare hashes - alt Match - US->>DB: UPDATE user SET last_login = now() - US-->>API: hwHash - else Mismatch - US-->>API: throw HardwareIdMismatch - end - end - API-->>Client: 200 OK (true) / 409 -``` +Reason: the threat the binding mitigated (credential reuse via desktop installers) was eliminated by the architectural shift to fTPM-secured Jetsons + browser-only SaaS access. See `_docs/03_implementation/batch_06_report.md` and the obsolete diagram `diagrams/flows/flow_hardware_check.md`. --- @@ -260,9 +229,126 @@ sequenceDiagram ## Flow F7: User Management (CRUD) ### Description -Admin operations: list users, change role, enable/disable, set hardware, update queue offsets, delete user. +Admin operations: list users, change role, enable/disable, update queue offsets, delete user. (The "set hardware" operation was removed by AZ-197 — see F4.) ### Preconditions - Caller has ApiAdmin role (for most operations) -All operations follow the same pattern: API endpoint → UserService method → DbFactory.RunAdmin → PostgreSQL UPDATE/DELETE. Cache is invalidated for affected user keys after writes. +All operations follow the same pattern: API endpoint → UserService method → DbFactory.RunAdmin → PostgreSQL UPDATE/DELETE. Cache is invalidated for affected user keys after writes (the `UpdateQueueOffsets` path is the only remaining cache-invalidation site post-AZ-197). 
+ +--- + +## Flow F8: Detection Classes CRUD *(AZ-513, 2026-05-13)* + +### Description +ApiAdmin manages the detection-class catalogue exposed to operators in the UI: create new entries, partial-merge edits, delete entries. The UI's existing add/delete affordances start working end-to-end once this flow exists; the in-place edit affordance arrives via UI cycle AZ-512. + +### Preconditions +- Caller has ApiAdmin role (`apiAdminPolicy`) +- `detection_classes` table exists in the admin DB + +### Sequence Diagram + +```mermaid +sequenceDiagram + participant Client + participant API as Admin API + participant VAL as FluentValidation + participant DCS as DetectionClassService + participant DB as PostgreSQL + + Client->>API: POST /classes {name, shortName, color, maxSizeM, photoMode?} + API->>VAL: Validate CreateDetectionClassRequest + VAL-->>API: OK / 400 + API->>DCS: Create(request) + DCS->>DB: InsertWithInt32IdentityAsync (admin conn) + DB-->>DCS: new id + DCS-->>API: DetectionClass {id, …} + API-->>Client: 200 OK {DetectionClass} + + Client->>API: PATCH /classes/{id} {…partial fields} + API->>VAL: Validate UpdateDetectionClassRequest + VAL-->>API: OK / 400 + API->>DCS: Update(id, request) + alt id exists + DCS->>DB: UPDATE row applying non-null fields (admin conn) + DCS-->>API: DetectionClass + API-->>Client: 200 OK {DetectionClass} + else id missing + DCS-->>API: null + API-->>Client: 404 Not Found + end + + Client->>API: DELETE /classes/{id} + API->>DCS: Delete(id) + DCS->>DB: DELETE WHERE id = ? 
(admin conn) + alt deleted > 0 + DCS-->>API: true + API-->>Client: 204 No Content + else + DCS-->>API: false + API-->>Client: 404 Not Found + end +``` + +### Error Scenarios + +| Error | Where | Detection | Recovery | +|-------|-------|-----------|----------| +| Not authenticated | API | No JWT | 401 Unauthorized | +| Wrong role | API | Non-ApiAdmin JWT | 403 Forbidden | +| Validation failure | FluentValidation | Field bounds violated | 400 Bad Request | +| Missing id (PATCH/DELETE) | DetectionClassService | Row not found | 404 Not Found | + +--- + +## Flow F9: Device Auto-Provisioning *(AZ-196, 2026-05-13)* + +### Description +ApiAdmin requests a fresh CompanionPC device user. The server allocates the next sequential serial (`azj-NNNN`), generates a 32-char hex password, persists the user with the SHA-384 hash, and returns the plaintext credentials exactly once. The provisioning script (out-of-tree) embeds the values into the device's `device.conf`. + +### Preconditions +- Caller has ApiAdmin role (`apiAdminPolicy`) + +### Sequence Diagram + +```mermaid +sequenceDiagram + participant Admin + participant API as Admin API + participant US as UserService + participant DB as PostgreSQL + + Admin->>API: POST /devices (no body) + API->>US: RegisterDevice() + US->>DB: SELECT TOP 1 email FROM users WHERE role = 'CompanionPC' ORDER BY created_at DESC + DB-->>US: lastEmail (or null) + US->>US: nextNumber = parse(lastEmail.suffix) + 1 (or 0) + US->>US: serial = "azj-" + nextNumber.PadLeft(4) + US->>US: password = ToHex(RandomBytes(16)) // 32 hex chars + US->>DB: INSERT user {Email=serial@domain, PasswordHash=SHA384(password), Role=CompanionPC, IsEnabled=true} (admin conn) + DB-->>US: OK + US-->>API: RegisterDeviceResponse {Serial, Email, Password} + API-->>Admin: 200 OK {Serial, Email, Password} +``` + +### Error Scenarios + +| Error | Where | Detection | Recovery | +|-------|-------|-----------|----------| +| Not authenticated / wrong role | API | JWT missing or 
non-ApiAdmin | 401 / 403 | +| Email already exists | UserService.RegisterUser (called by RegisterDevice) | DB UNIQUE INDEX `users_email_uidx` violation translated to `EmailExists` (5) | 409 — caller retries (the next call recomputes a fresh `azj-NNNN`) | + +> **Implementation note** — `RegisterDevice` reuses `UserService.RegisterUser` for the row insert (post-security-audit consolidation, finding F-3). The `users.email` column has a UNIQUE INDEX (`env/db/06_users_email_unique.sql`); concurrent provisioning calls that race on the same serial surface the violation atomically. + +--- + +## Flow F10: OTA Update Check & Publish *(REMOVED — post-cycle-1 revert)* + +The `POST /get-update` and `POST /resources/publish` endpoints, the `IResourceUpdateService` / `ResourceUpdateService` / `ResourceColumnEncryption` types, the `Resource` entity, the `resources` table, the `apiUploaderPolicy`, and the `ResourcesConfig.EncryptionMasterKey` field were all removed shortly after AZ-183 shipped. + +Reasons: +1. Security audit finding F-1 — `/get-update` was registered with `.RequireAuthorization()` (any authenticated caller) and returned the per-resource decrypted `EncryptionKey` in the response body, defeating the at-rest column encryption. +2. The OTA delivery model is itself a leftover from the installer-shipping era; the target architecture (browser-only SaaS + fTPM-secured Jetsons) does not need it. + +The `apiUploaderPolicy` definition was removed from `Program.cs`; the `RoleEnum.ResourceUploader` enum value remains as data (the seed `uploader@azaion.com` user still uses it for negative-auth tests) but is no longer wired to any endpoint. 
diff --git a/_docs/02_document/tests/blackbox-tests.md b/_docs/02_document/tests/blackbox-tests.md index 56dfc78..1249538 100644 --- a/_docs/02_document/tests/blackbox-tests.md +++ b/_docs/02_document/tests/blackbox-tests.md @@ -473,3 +473,349 @@ **Expected outcome**: HTTP 400 with password length validation error **Max execution time**: 5s + +--- + +## Cycle 1 Additions (2026-05-13) + +The scenarios below were appended during the existing-code cycle 1 Test-Spec Sync (autodev Step 12) for tasks AZ-513, AZ-196, AZ-183, AZ-197. Numbering continues from the legacy IDs above; existing IDs are preserved. + +### Cycle 1 Obsoletion Note + +The following legacy entries describe behaviour removed by AZ-197 (admin-side hardware-binding cleanup). Their bodies are intentionally left intact to preserve traceability IDs per the cycle-update rule "preserve existing traceability IDs"; they should be treated as obsolete and superseded by FT-N-15 below: + +- FT-P-04 (First Hardware Check Stores Fingerprint) — superseded; the `POST /resources/check` endpoint and the hardware-store side-effect were removed. +- FT-P-05 (Subsequent Hardware Check Matches) — superseded; same endpoint removed. +- FT-N-06 (Hardware Mismatch) — superseded; the `HardwareIdMismatch` / error code 40 path no longer exists in `ExceptionEnum`. +- FT-P-09 / FT-P-10 wire shape — the `hardware` field on `POST /resources/get/{dataFolder}` is no longer required; the encryption key is now derived from `email + password` only. The tests still pass without the field; do not regenerate spec bodies until a full `/test-spec` rerun. + +See `_docs/03_implementation/batch_06_report.md` for the full AZ-197 implementation rationale and the wire-compat policy decision (drop entirely). + +--- + +### Detection Classes CRUD (AZ-513) + +#### FT-P-14: POST /classes Creates Detection Class + +**Summary**: ApiAdmin creates a new detection class and the response includes the assigned id. 
+**Traces to**: AZ-513 AC-1 +**Category**: Detection Classes CRUD + +**Preconditions**: +- Caller authenticated as ApiAdmin +- `detection_classes` table exists + +**Input data**: `{"name":"Tank","shortName":"T","color":"#FF0000","maxSizeM":5.0}` + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | POST /classes with valid body and ApiAdmin JWT | HTTP 200/201 with body containing assigned `id` and the submitted fields | + +**Expected outcome**: HTTP 200 or 201, response body has integer `id` and matches input fields +**Max execution time**: 5s + +--- + +#### FT-P-15: PATCH /classes/{id} Full Body Update + +**Summary**: Updating a detection class with a full body replaces the changed fields. +**Traces to**: AZ-513 AC-3 +**Category**: Detection Classes CRUD + +**Preconditions**: +- A detection class with id `7` exists with `name: "Tank"` + +**Input data**: `{"name":"Heavy Tank","shortName":"T","color":"#FF0000","maxSizeM":5.0}` to PATCH /classes/7 + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | PATCH /classes/7 with full body and ApiAdmin JWT | HTTP 200, response body shows `name: "Heavy Tank"` | + +**Expected outcome**: HTTP 200, updated entity reflects the changed field +**Max execution time**: 5s + +--- + +#### FT-P-16: PATCH /classes/{id} Partial Body Update + +**Summary**: PATCH with only the changed field updates that field and leaves others intact. 
+**Traces to**: AZ-513 AC-4 +**Category**: Detection Classes CRUD + +**Preconditions**: +- A detection class with id `7` exists with `name: "Tank", color: "#FF0000", maxSizeM: 5.0` + +**Input data**: `{"color":"#00FF00"}` to PATCH /classes/7 + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | PATCH /classes/7 with partial body and ApiAdmin JWT | HTTP 200, response body shows `color: "#00FF00"`; other fields unchanged | + +**Expected outcome**: HTTP 200, partial-merge semantics confirmed +**Max execution time**: 5s + +--- + +#### FT-P-17: DELETE /classes/{id} Removes Class + +**Summary**: ApiAdmin deletes a detection class and it disappears from the DB. +**Traces to**: AZ-513 AC-7 +**Category**: Detection Classes CRUD + +**Preconditions**: +- A detection class with id `7` exists + +**Input data**: DELETE /classes/7 + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | DELETE /classes/7 with ApiAdmin JWT | HTTP 200 or 204 | +| 2 | GET the class list (or PATCH the same id) | id 7 no longer present | + +**Expected outcome**: HTTP 200/204; class removed from DB +**Max execution time**: 5s + +--- + +#### FT-N-09: POST /classes Without ApiAdmin JWT + +**Summary**: POST /classes requires the same `apiAdminPolicy` as `/users`; non-admin / unauthenticated calls are rejected. 
+**Traces to**: AZ-513 AC-2 +**Category**: Detection Classes CRUD + +**Preconditions**: None (negative path) + +**Input data**: Valid body, but caller has no JWT or a non-ApiAdmin JWT + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | POST /classes without JWT | HTTP 401 | +| 2 | POST /classes with non-ApiAdmin JWT | HTTP 403 | + +**Expected outcome**: HTTP 401 (no JWT) or 403 (non-admin) +**Max execution time**: 5s + +--- + +#### FT-N-10: PATCH /classes/{id} Unknown id Returns 404 + +**Summary**: PATCH against a non-existent id returns 404. +**Traces to**: AZ-513 AC-5 +**Category**: Detection Classes CRUD + +**Preconditions**: No detection class with id `9999` + +**Input data**: PATCH /classes/9999 with any valid body + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | PATCH /classes/9999 with ApiAdmin JWT | HTTP 404 | + +**Expected outcome**: HTTP 404 +**Max execution time**: 5s + +--- + +#### FT-N-11: PATCH /classes/{id} Without ApiAdmin JWT + +**Summary**: PATCH /classes/{id} requires `apiAdminPolicy`. +**Traces to**: AZ-513 AC-6 +**Category**: Detection Classes CRUD + +**Input data**: Any valid body to PATCH /classes/{id} + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | PATCH /classes/{id} without JWT | HTTP 401 | +| 2 | PATCH /classes/{id} with non-ApiAdmin JWT | HTTP 403 | + +**Expected outcome**: HTTP 401 or 403 +**Max execution time**: 5s + +--- + +#### FT-N-12: DELETE /classes/{id} Unknown id Returns 404 + +**Summary**: DELETE against a non-existent id returns 404 (matching `/users` semantics — non-idempotent). 
+**Traces to**: AZ-513 AC-8 +**Category**: Detection Classes CRUD + +**Preconditions**: No detection class with id `9999` + +**Input data**: DELETE /classes/9999 + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | DELETE /classes/9999 with ApiAdmin JWT | HTTP 404 | + +**Expected outcome**: HTTP 404 +**Max execution time**: 5s + +--- + +#### FT-N-13: DELETE /classes/{id} Without ApiAdmin JWT + +**Summary**: DELETE /classes/{id} requires `apiAdminPolicy`. +**Traces to**: AZ-513 AC-9 +**Category**: Detection Classes CRUD + +**Input data**: DELETE /classes/{id} + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | DELETE /classes/{id} without JWT | HTTP 401 | +| 2 | DELETE /classes/{id} with non-ApiAdmin JWT | HTTP 403 | + +**Expected outcome**: HTTP 401 or 403 +**Max execution time**: 5s + +--- + +### Device Auto-Registration (AZ-196) + +#### FT-P-18: POST /devices Returns Serial / Email / Password + +**Summary**: First call to POST /devices returns the next serial in the `azj-NNNN` sequence with a generated email and 32-char hex password. +**Traces to**: AZ-196 AC-1 +**Category**: Device Provisioning + +**Preconditions**: +- Caller authenticated as ApiAdmin +- No (or known-prior) CompanionPC users in DB + +**Input data**: POST /devices with no body, ApiAdmin JWT + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | POST /devices with ApiAdmin JWT | HTTP 200 with `serial` matching `^azj-\d{4}$`, `email` = `{serial}@azaion.com`, `password` = 32 lowercase hex chars | + +**Expected outcome**: HTTP 200, all three fields shaped per spec +**Max execution time**: 5s + +--- + +#### FT-P-19: Sequential Device Serials + +**Summary**: Repeated calls to POST /devices yield strictly increasing serial numbers. 
+**Traces to**: AZ-196 AC-2 +**Category**: Device Provisioning + +**Preconditions**: +- Most recent CompanionPC user has a known serial `azj-NNNN` + +**Input data**: POST /devices twice in succession + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | POST /devices → record serial `S1` | HTTP 200 | +| 2 | POST /devices → record serial `S2` | HTTP 200 | +| 3 | Parse the numeric suffix of both | numeric(S2) == numeric(S1) + 1 | + +**Expected outcome**: HTTP 200, suffix increments by exactly 1 +**Max execution time**: 5s + +--- + +#### FT-P-20: Returned Device Credentials Can Login + +**Summary**: The plaintext password returned by POST /devices succeeds against POST /login (and the persisted hash is therefore correct). +**Traces to**: AZ-196 AC-3, AZ-196 AC-4 +**Category**: Device Provisioning + +**Preconditions**: +- Caller authenticated as ApiAdmin + +**Input data**: Use the response from POST /devices as `{Email, Password}` to POST /login + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | POST /devices with ApiAdmin JWT | HTTP 200, `{Serial, Email, Password}` returned | +| 2 | POST /login with the returned `Email` and `Password` | HTTP 200 with non-empty JWT | + +**Expected outcome**: HTTP 200 on login; persisted user has Role=CompanionPC, IsEnabled=true (verified by AdminApi behaviour rather than direct DB inspection) +**Max execution time**: 5s + +--- + +#### FT-N-14: POST /devices Without ApiAdmin JWT + +**Summary**: POST /devices requires `apiAdminPolicy`. 
+**Traces to**: AZ-196 AC-5 +**Category**: Device Provisioning + +**Input data**: POST /devices + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | POST /devices without JWT | HTTP 401 | +| 2 | POST /devices with non-ApiAdmin JWT | HTTP 403 | + +**Expected outcome**: HTTP 401 or 403 +**Max execution time**: 5s + +--- + +### Resources OTA Update Check (AZ-183) — REVERTED post-cycle-1 + +The OTA update check & publish feature shipped in cycle 1 was reverted later the same day after the security audit (finding F-1: `/get-update` disclosed plaintext per-resource encryption keys to any authenticated caller). The OTA delivery model itself was deemed obsolete in the target architecture. + +The scenarios `FT-P-21`, `FT-P-22`, `FT-P-23` are retained here as ID placeholders so previously-cited references resolve. Their bodies are intentionally collapsed because the underlying endpoints, service, entity, table, and the e2e test class `ResourceUpdateTests.cs` were all removed. See `_docs/02_document/system-flows.md` (Flow F10) and `_docs/05_security/security_report.md` (finding F-1) for context. + +| Removed Test ID | Was tracing | Disposition | +|-----------------|-------------|-------------| +| FT-P-21 | AZ-183 AC-2 | Removed — endpoint and test deleted | +| FT-P-22 | AZ-183 AC-3 | Removed — endpoint and test deleted | +| FT-P-23 | AZ-183 AC-5 | Removed — endpoint and test deleted | + +--- + +### Hardware-Binding Removal (AZ-197) + +#### FT-N-15: Hardware Endpoints Removed + +**Summary**: The legacy `PUT /users/hardware/set` endpoint and the `POST /resources/check` endpoint have been removed and now return 404. 
+**Traces to**: AZ-197 AC-2 +**Category**: Authorization & Routing + +**Preconditions**: +- Updated admin API build (post-AZ-197) + +**Input data**: PUT /users/hardware/set and POST /resources/check + +**Steps**: + +| Step | Consumer Action | Expected System Response | +|------|----------------|------------------------| +| 1 | PUT /users/hardware/set with ApiAdmin JWT | HTTP 404 | +| 2 | POST /resources/check with ApiAdmin JWT | HTTP 404 | + +**Expected outcome**: HTTP 404 on both routes +**Max execution time**: 5s + +Note: AZ-197 AC-1 (resource download works without `Hardware`) is implicitly covered by the existing FT-P-09 / FT-P-10 scenarios once their request bodies are aligned with the new wire shape. AZ-197 AC-3..AC-8 are internal-signature / build-system invariants and are verified at build/CI time, not via a blackbox HTTP scenario. diff --git a/_docs/02_document/tests/performance-tests.md b/_docs/02_document/tests/performance-tests.md index 34144cb..5a1f79d 100644 --- a/_docs/02_document/tests/performance-tests.md +++ b/_docs/02_document/tests/performance-tests.md @@ -1,5 +1,7 @@ # Performance Tests +> **Cycle 1 update (2026-05-13)**: NFT-PERF-02 and NFT-PERF-03 (encrypted resource download, small/large file) were removed because the OTA / encrypted-resource-download endpoints (`POST /resources/get/...`) and the hardware-binding flow they depended on were reverted in cycle 1 (AZ-183 OTA revert, AZ-197 hardware removal). When OTA returns under the new architecture, perf scenarios for it must be re-derived from the new endpoints. + ### NFT-PERF-01: Login Endpoint Latency **Summary**: Login endpoint responds within acceptable latency under normal load. 
@@ -7,77 +9,46 @@ **Metric**: Response time (p95) **Preconditions**: -- System running with seed data -- 10 concurrent users +- System running with seed data (admin user from `e2e/db-init/99_test_seed.sql`) +- 10 concurrent virtual users **Steps**: | Step | Consumer Action | Measurement | |------|----------------|-------------| -| 1 | Send 100 login requests (10 concurrent) | Measure p50, p95, p99 response times | +| 1 | 10 VUs send POST /login for 30s | Measure p50, p95, p99 response times | **Pass criteria**: p95 latency < 500ms **Duration**: 30 seconds --- -### NFT-PERF-02: Resource Download Latency (Small File) - -**Summary**: Encrypted resource download for a small file (1 KB) completes quickly. -**Traces to**: AC-14 -**Metric**: Response time including encryption - -**Preconditions**: -- 1 KB test file uploaded -- User authenticated with bound hardware - -**Steps**: - -| Step | Consumer Action | Measurement | -|------|----------------|-------------| -| 1 | Send 50 encrypted download requests (5 concurrent) | Measure p50, p95 response times | - -**Pass criteria**: p95 latency < 1000ms -**Duration**: 30 seconds - ---- - -### NFT-PERF-03: Resource Download Latency (Large File) - -**Summary**: Encrypted resource download for a larger file (50 MB) completes within limits. -**Traces to**: AC-13, AC-14 -**Metric**: Response time including encryption + transfer - -**Preconditions**: -- 50 MB test file uploaded -- User authenticated with bound hardware - -**Steps**: - -| Step | Consumer Action | Measurement | -|------|----------------|-------------| -| 1 | Send 5 sequential encrypted download requests | Measure p50, p95 response times | - -**Pass criteria**: p95 latency < 30000ms (30s) -**Duration**: 3 minutes - ---- - ### NFT-PERF-04: User List Endpoint Under Load -**Summary**: User list endpoint responds within limits when DB has many users. -**Traces to**: AC-9 -**Metric**: Response time +**Summary**: `GET /users` responds within limits when DB has many users. 
+**Traces to**: AC-11 + +> **Note**: this scenario originally referenced AC-9. Post-cycle-1, AC-9 is "Registration rejects duplicate email". The user-listing criterion is AC-11 (filter support). The thresholds below verify the listing path under volume; the filter semantics are covered by functional tests. + +**Metric**: Response time (p95) **Preconditions**: -- 500 users in database -- Caller is ApiAdmin +- Database seeded with 500 users (perf seed inserts dummy rows alongside the functional seed; see `scripts/run-performance-tests.sh`) +- Caller is `admin@azaion.com` (ApiAdmin) **Steps**: | Step | Consumer Action | Measurement | |------|----------------|-------------| -| 1 | Send 50 GET /users requests (10 concurrent) | Measure p50, p95 response times | +| 1 | 10 VUs send GET /users for 30s, sharing one cached JWT | Measure p50, p95, p99 response times | **Pass criteria**: p95 latency < 1000ms **Duration**: 30 seconds + +--- + +## Runner + +Both scenarios are implemented in `scripts/perf-scenarios.js` (k6, JS) and orchestrated by `scripts/run-performance-tests.sh`. The runner spins up the SUT via `docker-compose.test.yml`, seeds 500 perf users into `test-db`, executes k6, captures the JSON summary to `e2e/test-results/perf-summary.json`, and tears down. + +To run locally: `./scripts/run-performance-tests.sh`. Requires `k6` (`brew install k6`) and Docker. 
diff --git a/_docs/02_document/tests/traceability-matrix.md b/_docs/02_document/tests/traceability-matrix.md index d6100a8..8f9ef79 100644 --- a/_docs/02_document/tests/traceability-matrix.md +++ b/_docs/02_document/tests/traceability-matrix.md @@ -41,9 +41,10 @@ | Category | Total Items | Covered | Not Covered | Coverage % | |----------|-----------|---------|-------------|-----------| -| Acceptance Criteria | 19 | 19 | 0 | 100% | +| Acceptance Criteria (baseline) | 19 | 19 | 0 | 100% | +| Acceptance Criteria (cycle 1) | 24 | 24 | 0 | 100% | | Restrictions | 8 | 5 | 3 | 63% | -| **Total** | **27** | **24** | **3** | **89%** | +| **Total** | **51** | **48** | **3** | **94%** | ## Uncovered Items Analysis @@ -52,3 +53,68 @@ | RESTRICT-HW-01 (ARM64) | Tests run on x64 dev/CI host; cross-architecture testing requires ARM hardware | Low — .NET runtime handles arch differences; no arch-specific code in application | CI builds ARM64 image; manual smoke test on target device | | RESTRICT-ENV-02 (CORS) | CORS is enforced by browsers, not by server-to-server HTTP calls | Low — CORS policy is declarative in Program.cs | Visual inspection of CORS configuration in code | | RESTRICT-OP-01 (Logging) | Log output format/content verification adds complexity without proportional value | Low — Serilog configuration is declarative | Code review of Serilog setup | + +## Cycle 1 Additions (2026-05-13) — AZ-513, AZ-196, AZ-183, AZ-197 + +Appended during the existing-code cycle 1 Test-Spec Sync (autodev Step 12). Cycle 1 ACs are namespaced by their tracker ID to avoid colliding with the baseline AC-1..AC-19 numbering above. 
+ +### AZ-513 — Detection Classes CRUD + +| AC ID | Acceptance Criterion | Test IDs | Coverage | +|-------|---------------------|----------|----------| +| AZ-513 AC-1 | POST /classes creates a class | FT-P-14 | Covered | +| AZ-513 AC-2 | POST /classes requires ApiAdmin authorization | FT-N-09 | Covered | +| AZ-513 AC-3 | PATCH /classes/{id} updates an existing class (full body) | FT-P-15 | Covered | +| AZ-513 AC-4 | PATCH /classes/{id} accepts partial body (partial-merge) | FT-P-16 | Covered | +| AZ-513 AC-5 | PATCH /classes/{id} returns 404 for unknown id | FT-N-10 | Covered | +| AZ-513 AC-6 | PATCH /classes/{id} requires ApiAdmin authorization | FT-N-11 | Covered | +| AZ-513 AC-7 | DELETE /classes/{id} removes a class | FT-P-17 | Covered | +| AZ-513 AC-8 | DELETE /classes/{id} returns 404 for unknown id | FT-N-12 | Covered | +| AZ-513 AC-9 | DELETE /classes/{id} requires ApiAdmin authorization | FT-N-13 | Covered | +| AZ-513 AC-10 | UI add/delete/edit affordances work end-to-end | — | Cross-workspace (ui/ e2e harness) — out of scope for this workspace | + +### AZ-196 — Device Auto-Registration + +| AC ID | Acceptance Criterion | Test IDs | Coverage | +|-------|---------------------|----------|----------| +| AZ-196 AC-1 | First device gets serial azj-0000 (shape: serial / email / 32-hex password) | FT-P-18 | Covered | +| AZ-196 AC-2 | Sequential numbering on subsequent calls | FT-P-19 | Covered | +| AZ-196 AC-3 | Persisted user has Role=CompanionPC, IsEnabled=true | FT-P-20 | Covered (verified via successful login → role-gated behaviour) | +| AZ-196 AC-4 | Returned plaintext password is hashed (SHA-384) in DB, not stored plaintext | FT-P-20 | Covered (verified via successful login round-trip) | +| AZ-196 AC-5 | Requires ApiAdmin authorization | FT-N-14 | Covered | + +### AZ-183 — Resources OTA Update Check (REVERTED post-cycle-1) + +The OTA Update Check & Publish feature shipped in cycle 1 was reverted later the same day after the security audit (finding F-1: 
`/get-update` disclosed plaintext per-resource encryption keys to any authenticated caller; the OTA delivery model itself was deemed obsolete in the target architecture). The endpoints, service, entity, table, request DTOs, response DTO, cache key, master-key config field, and the e2e test class `ResourceUpdateTests` were all removed. + +| AC ID | Acceptance Criterion | Test IDs | Status | +|-------|---------------------|----------|--------| +| AZ-183 AC-1 | Resources table created with required columns | — | **Reverted** — table dropped from migration set (`env/db/05_resources.sql` deleted) | +| AZ-183 AC-2 | POST /get-update returns newer resources | ~~FT-P-21~~ | **Reverted** — endpoint and test deleted | +| AZ-183 AC-3 | POST /get-update returns empty when device already current | ~~FT-P-22~~ | **Reverted** — endpoint and test deleted | +| AZ-183 AC-4 | Memory cache avoids DB pressure under 2000-device polling | — | **Reverted** — cache key removed | +| AZ-183 AC-5 | Cache invalidated on CI/CD publish | ~~FT-P-23~~ | **Reverted** — endpoint and test deleted | + +### AZ-197 — Hardware-Binding Removal + +| AC ID | Acceptance Criterion | Test IDs | Coverage | +|-------|---------------------|----------|----------| +| AZ-197 AC-1 | Resource download works without `Hardware` field | FT-P-09 / FT-P-10 (legacy bodies retained; wire shape now omits the field) | Covered (e2e `ResourceTests` updated by AZ-197 batch 6) | +| AZ-197 AC-2 | `PUT /users/hardware/set` and `POST /resources/check` return 404 | FT-N-15 | Covered | +| AZ-197 AC-3 | `Security.GetApiEncryptionKey` signature simplified to (email, password) | — | Internal signature — covered by `Azaion.Test/SecurityTest` unit tests, not blackbox | +| AZ-197 AC-4 | `HardwareBindingTests` removed; no remaining test asserts code 40 / hardware-hash binding | — | Build/CI invariant — verified by test-suite enumeration | +| AZ-197 AC-5 | Resource calls in remaining tests do not send `Hardware` | — | Build/CI invariant — 
verified by source review during AZ-197 batch 6 | +| AZ-197 AC-6 | `ExceptionEnum` no longer carries `HardwareIdMismatch` / `BadHardware` | — | Build/CI invariant — verified by enum read | +| AZ-197 AC-7 | `dotnet build` is clean (no new warnings) | — | Build invariant | +| AZ-197 AC-8 | Test suite passes (excluding deleted `HardwareBindingTests`) | All e2e tests + `Azaion.Test` | Covered by Step 11 Run Tests (48/48 e2e + 2/2 unit, 2026-05-13) | + +### Obsoleted Baseline Entries (superseded by AZ-197) + +The matrix rows below are kept for ID stability but no longer reflect production behaviour. They are superseded by the AZ-197 entries above and by FT-N-15 in `blackbox-tests.md`. Do NOT regenerate or delete these in cycle-update mode — wait for a full `/test-spec` rerun. + +| Legacy Matrix Row | Status | +|-------------------|--------| +| AC-10 (First hardware check stores) | Obsoleted by AZ-197 — endpoint removed | +| AC-11 (Subsequent hardware check validates) | Obsoleted by AZ-197 — endpoint removed | +| AC-12 (Hardware mismatch returns code 40) | Obsoleted by AZ-197 — `ExceptionEnum` value removed | +| AC-19 (Encryption key derived from email+password+hw) | Partially obsoleted — derivation is now `email + password` only | diff --git a/_docs/04_deploy/ci_cd_pipeline.md b/_docs/04_deploy/ci_cd_pipeline.md new file mode 100644 index 0000000..4558596 --- /dev/null +++ b/_docs/04_deploy/ci_cd_pipeline.md @@ -0,0 +1,158 @@ +# Azaion Admin API — CI/CD Pipeline + +**Date**: 2026-05-13 · **Cycle**: 1 · **Status**: planning artifact (current Woodpecker files audited; proposed changes land as concrete YAML in Step 7). + +## 1. 
Platform & Constraints + +| Constraint | Value | Source | +|------------|-------|--------| +| CI platform | **Woodpecker CI** | restrictions.md §Operational | +| Default agent label | `arm64` | `.woodpecker/01-test.yml`, `.woodpecker/02-build-push.yml` | +| Future agent label | `amd64` (matrix entry, currently commented out) | `.woodpecker/02-build-push.yml` | +| Two-workflow contract | `01-test.yml` → tests; `02-build-push.yml` (`depends_on: 01-test`) → image | Already in repo | +| Registry | `$REGISTRY_HOST/azaion/admin` | Woodpecker secret `registry_host` | +| Branches with full pipeline | `dev`, `stage`, `main` | both files' `when.branch` | + +The reference contract from `.cursor/skills/deploy/templates/ci_cd_pipeline.md` is already partially adopted. This step closes the remaining gaps. + +## 2. Current Pipeline (audited) + +### `.woodpecker/01-test.yml` — what it does today + +| Step | Image | Action | Quality gate | +|------|-------|--------|--------------| +| `unit-tests` | `mcr.microsoft.com/dotnet/sdk:10.0` | `dotnet restore` + `dotnet test Azaion.AdminApi.sln` (release, TRX logger) | All unit tests pass | +| `e2e-tests` | `mcr.microsoft.com/dotnet/sdk:10.0` | `dotnet restore` + `dotnet test e2e/Azaion.E2E/Azaion.E2E.csproj` | All E2E tests pass | + +**Audit findings**: + +1. ✅ Tests are gated before build (matches contract). +2. ❌ E2E test step runs `dotnet test` directly — but the project uses **Docker-orchestrated black-box tests** via `docker-compose.test.yml`. The pure `dotnet test` invocation cannot start `system-under-test` + `test-db` containers, so `e2e-tests` as written either skips integration scenarios or relies on undocumented agent state. The reference contract uses `docker compose … --abort-on-container-exit --exit-code-from e2e-runner` instead. +3. ❌ No coverage report. +4. ❌ No SAST / dependency scan / image scan stage. Security audit recommendation 13 explicitly asked for `dotnet list package --vulnerable` in CI (Drift F). +5. 
❌ No artifact upload of TRX results — failures are visible only in console logs. + +### `.woodpecker/02-build-push.yml` — what it does today + +| Step | Image | Action | Quality gate | +|------|-------|--------|--------------| +| `build-push` | `docker` | `docker login` → `docker build` (with three OCI labels + `CI_COMMIT_SHA` build-arg) → `docker push $REGISTRY_HOST/azaion/admin:${CI_COMMIT_BRANCH}-${TAG_SUFFIX}` | Push succeeds | + +**Audit findings**: + +1. ✅ Multi-arch matrix scaffolding present (`PLATFORM` / `TAG_SUFFIX`) with amd64 commented for future use. +2. ✅ `depends_on: [01-test]` — gating is correct. +3. ✅ OCI labels (`revision`, `created`, `source`) injected as build-time labels. +4. ❌ Only branch-based mutable tag pushed. No immutable `${CI_COMMIT_SHA:0:12}-${TAG_SUFFIX}` tag → host scripts cannot pin (Drift A). +5. ❌ No image scan (Trivy) before push. +6. ❌ Old documentation referenced `.woodpecker/build-arm.yml` which no longer exists (Drift D — fix in this doc, see §10). + +## 3. Proposed Stage Map (target state for cycle 1) + +| Stage | Trigger | Workflow file | Quality gate | +|-------|---------|---------------|--------------| +| Lint / format | every push & PR | `01-test.yml` (new step) | `dotnet format --verify-no-changes` returns 0 | +| Unit tests | every push & PR | `01-test.yml` | All `Azaion.*Tests` pass; TRX uploaded | +| Black-box E2E (Docker compose) | every push & PR | `01-test.yml` | `docker compose -f docker-compose.test.yml up --abort-on-container-exit --exit-code-from e2e-consumer` returns 0; results uploaded | +| Security: dependency audit | every push & PR | `01-test.yml` (new step) | `dotnet list package --vulnerable --include-transitive` reports zero High/Critical CVEs | +| Security: image scan | post-build, pre-push | `02-build-push.yml` (new step) | `trivy image --severity HIGH,CRITICAL --exit-code 1` returns 0 | +| Build | push to `dev` / `stage` / `main` | `02-build-push.yml` | `docker build` succeeds | +| Push (branch tag + SHA tag) | push to `dev` / 
`stage` / `main` | `02-build-push.yml` | both `docker push` calls succeed | +| Performance smoke (optional) | manual on `stage` / `main` | `03-perf.yml` (new) | k6 thresholds in `scripts/perf-scenarios.js` all `ok: true` | +| Deploy staging | tag push or `stage` branch | `04-deploy.yml` (new) | health check returns 200 within timeout | +| Deploy production | manual approval | `04-deploy.yml` (new) | health check returns 200 within timeout | + +> Note on coverage: the test infrastructure (cycle 1) does not yet collect or report coverage. The skill's 75% gate cannot be enforced this cycle. Recorded as **Drift I** (carried forward to a future cycle); does NOT block this deploy. + +## 4. Caching Strategy + +| Cache | Key | Notes | +|-------|-----|-------| +| `nuget` packages | hash of `**/*.csproj` | Mounted on `/root/.nuget/packages`; restored before `dotnet restore`. Cache invalidates on any csproj change. | +| Docker layer cache | hash of `Dockerfile` + `**/*.csproj` | Use Woodpecker `--cache-from` against the previous push of the same branch (e.g. `--cache-from $REGISTRY_HOST/azaion/admin:dev-arm`). Cheapest cache available without buildx. | +| E2E DB init scripts | none — re-init each run | Schema differences would mask test failures. `down -v` between runs is intentional (mirrors `scripts/run-tests.sh`). | + +## 5. 
Parallelization + +``` +01-test.yml (matrix: arm64 [+ amd64 future]) +├── lint-format ─┐ +├── unit-tests ─┼── all run in parallel on the same agent; +├── e2e-tests ─┤ the slowest (e2e) gates the workflow +└── deps-audit ─┘ + +02-build-push.yml (matrix: arm64 [+ amd64 future]) +├── build ─→ image-scan ─→ push (branch tag) ─→ push (sha tag) + │ + └─→ artifact: image digest stored as Woodpecker artifact + +03-perf.yml (manual; arm64 only) +└── k6-perf (uses the docker-compose.test.yml SUT) + +04-deploy.yml (manual; per-environment) +└── pull → stop → start → health-check → smoke +``` + +Cross-workflow gates: `02 depends_on 01`; `04 depends_on 02` for the same SHA. + +## 6. Quality Gates (summary) + +| Gate | Threshold | Action on breach | +|------|-----------|------------------| +| Lint | 0 violations | fail workflow | +| Unit tests | 100% pass | fail workflow | +| E2E tests | 100% pass | fail workflow | +| Dependency audit (High / Critical) | 0 CVEs | fail workflow (Drift F) | +| Image scan (High / Critical) | 0 CVEs | fail workflow | +| Coverage | not enforced this cycle (Drift I) | inform-only | +| Performance (k6) | thresholds in `perf-scenarios.js` | fail workflow when run | + +## 7. Notifications + +| Event | Channel | Recipients | +|-------|---------|------------| +| `01-test` failure | Woodpecker UI + Slack `#azaion-ci` | Backend team | +| `02-build-push` failure | Woodpecker UI + Slack `#azaion-ci` | Backend team | +| Image-scan High/Critical finding | Slack `#azaion-security` | Security + on-call | +| `04-deploy` failure | Slack `#azaion-ops` + email on-call | Ops on-call | +| Manual production deploy approval requested | Slack `#azaion-ops` | Approvers | + +> Slack channel names are placeholders — swap to actual channel IDs in Step 7 when wiring `from_secret: slack_webhook_*`. Email/Pager wiring is deferred until those secrets exist. + +## 8. 
Image Tags + +Resolves Drift A: + +| Push order | Tag | Stability | Used by | +|-----------|-----|-----------|---------| +| 1 | `${CI_COMMIT_BRANCH}-${TAG_SUFFIX}` | mutable (overwritten each push to the branch) | quick dev pulls (`docker pull …:dev-arm`) | +| 2 | `${CI_COMMIT_SHA:0:12}-${TAG_SUFFIX}` | immutable | host deploy scripts; rollback target | + +Production deploys MUST reference the SHA tag, never the branch tag (Step 6 procedures will enforce this). + +## 9. Reproducibility & Audit + +- Every pushed image carries `org.opencontainers.image.revision` = full `CI_COMMIT_SHA`. The 12-char prefix in the tag is for human reading; the label is the source of truth. +- `org.opencontainers.image.created` = ISO-8601 build start time (UTC). +- `org.opencontainers.image.source` = `$CI_REPO_URL`. +- Both image scan and dependency audit reports are uploaded as Woodpecker artifacts on every run (success and failure). + +## 10. Drifts Resolved Here / Carried Forward + +| ID | Severity | Description | Status | +|----|----------|-------------|--------| +| A | Medium | Branch-tag-only push; host pulls `:latest` that CI never produces | **Resolved in spec** — add SHA-tag push (§8); script change in Step 7 | +| D | Low | Old docs referenced `.woodpecker/build-arm.yml` | **Resolved here** — corrected to `01-test.yml` + `02-build-push.yml` everywhere | +| E | Low | `scripts/run-performance-tests.sh` is run-on-demand only | **Spec** — `03-perf.yml` planned; manual trigger in cycle 1, automatic gate in a future cycle when threshold fluctuation is understood | +| F | Low | No vulnerable-dep gate in CI | **Resolved in spec** — `deps-audit` step in `01-test.yml`; concrete YAML in Step 7 | +| I | Low (NEW) | No coverage threshold enforced (no coverage collection wired) | **Carried forward** to a future cycle; recorded in the deploy plan, not blocking | + +## 11. Self-verification + +- [x] All pipeline stages defined with triggers and gates. 
+- [ ] Coverage threshold enforced — **deferred (Drift I)** with explicit justification. +- [x] Security scanning included (deps + image; SAST deferred to a future cycle when a SAST tool is selected). +- [x] Caching configured (NuGet + Docker layer). +- [x] Multi-environment deployment scaffold (staging → production manual). +- [x] Rollback referenced (SHA-tagged images make `docker run …:<sha12>-arm` a one-line rollback; details in Step 6). +- [x] Notification matrix defined. diff --git a/_docs/04_deploy/containerization.md b/_docs/04_deploy/containerization.md new file mode 100644 index 0000000..90004d7 --- /dev/null +++ b/_docs/04_deploy/containerization.md @@ -0,0 +1,228 @@ +# Azaion Admin API — Containerization + +**Date**: 2026-05-13 · **Cycle**: 1 · **Status**: planning artifact (no code changes; Dockerfile updates land in Step 7). + +## 1. Container Inventory + +The system has only **one runtime container** (`admin-api`); the other two containers listed below are test-only. The four library components are linked into the API at build time, not shipped separately. + +| # | Container | Built from | Purpose | Lifetime | +|---|-----------|------------|---------|----------| +| 1 | `admin-api` | `Dockerfile` (root) | Single ASP.NET Core 10 service exposing all 17 endpoints | Long-running | +| 2 | `e2e-runner` | `e2e/Dockerfile` | Black-box test consumer used by CI and local `docker-compose.test.yml` | One-shot (run-and-exit) | +| 3 | `test-db` | `postgres:16-alpine` (no custom Dockerfile) | Isolated Postgres for tests | One-shot (per CI run) | + +> `docker.test/Dockerfile` is a leftover placeholder (`FROM alpine:latest; CMD echo hello`) and is unused. **Drift G** — recommend deletion in Step 7 (scripts) cleanup. + +## 2. Component → Container Mapping + +| Component | Ships in container? 
| Notes | +|-----------|--------------------:|-------| +| 01 Data Layer | no | Class library `Azaion.Common`, linked into `admin-api` | +| 02 User Management | no | Class library `Azaion.Services` | +| 03 Auth & Security | no | Class library `Azaion.Services` | +| 04 Resource Management | no | Class library `Azaion.Services` | +| 05 Admin API | **yes** | Hosts the Minimal API process (`Azaion.AdminApi`) | + +## 3. `admin-api` — Dockerfile Specification + +| Property | Current value | Planned value (Step 7) | Rationale | +|----------|---------------|------------------------|-----------| +| Build base image | `mcr.microsoft.com/dotnet/sdk:10.0` (`--platform=$BUILDPLATFORM`) | unchanged | Matches restriction (.NET 10.0); cross-platform build supported | +| Runtime base image | `mcr.microsoft.com/dotnet/aspnet:10.0` | **pin by digest** in production (`@sha256:…`) | Restrictions forbid moving off `aspnet:10.0`; digest pin protects against silent base-image churn | +| Stages | `base` → `build` → `publish` → `final` | unchanged structure; non-root user added in `final` | Existing layout already follows multi-stage best practice | +| Working dir | `/app` | unchanged | Matches `start-container.sh` mounts | +| Exposed port | `8080` | unchanged | Bound by Kestrel via `ASPNETCORE_URLS=http://+:8080` | +| Container user | **root** (current) | `USER app` (UID 1654, GID 1654) | Closes security audit F-6 / AZ-518 (Drift C). 
Non-existing UID; matches the convention in `mcr.microsoft.com/dotnet/aspnet:8.0+` images | +| Mount points needing write | `/app/Content`, `/app/logs` | `chown app:app` both directories in the `final` stage | The new non-root user must own the dirs that are bind-mounted from the host | +| Build arg | `CI_COMMIT_SHA=unknown` | unchanged; populated by Woodpecker | Already wired; surfaces as `AZAION_REVISION` env var inside the container | +| OCI labels | none on the Dockerfile (CI adds three: `revision`, `created`, `source`) | move the three labels into the Dockerfile so local builds also carry them | Single source of truth; consistent labeling regardless of build origin | +| Health check | none | `HEALTHCHECK CMD curl -fsS http://localhost:8080/health \|\| exit 1` | Wires into the `/health` endpoint planned in Step 5 (Observability). Until that endpoint exists, fall back to the TCP probe already used in `docker-compose.test.yml`. | +| Entrypoint | `["dotnet", "Azaion.AdminApi.dll"]` | unchanged | Smallest-possible entrypoint; PID 1 is the .NET process | + +### Sketch (planning artifact — actual edits land in Step 7) + +``` +FROM mcr.microsoft.com/dotnet/aspnet:10.0@sha256:<digest> AS base +WORKDIR /app +EXPOSE 8080 +RUN groupadd -g 1654 app && useradd -u 1654 -g 1654 -m -d /home/app -s /sbin/nologin app \ + && mkdir -p /app/Content /app/logs && chown -R app:app /app + +FROM --platform=$BUILDPLATFORM mcr.microsoft.com/dotnet/sdk:10.0 AS build +ARG TARGETARCH +WORKDIR /app +COPY . . 
+RUN dotnet restore +WORKDIR /app/Azaion.AdminApi +RUN dotnet build "Azaion.AdminApi.csproj" -c Release -o /app/build + +FROM build AS publish +RUN arch=$([ "$TARGETARCH" = "amd64" ] && echo "x64" || echo "$TARGETARCH") && \ + dotnet publish "Azaion.AdminApi.csproj" -c Release -o /app/publish /p:UseAppHost=false --os linux --arch $arch + +FROM base AS final +ARG CI_COMMIT_SHA=unknown +ARG BUILD_DATE=unknown +ENV AZAION_REVISION=$CI_COMMIT_SHA +LABEL org.opencontainers.image.revision="$CI_COMMIT_SHA" +LABEL org.opencontainers.image.created="$BUILD_DATE" +LABEL org.opencontainers.image.source="https://git.azaion.com/azaion/admin" +COPY --from=publish --chown=app:app /app/publish /app/ +USER app +HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \ + CMD curl -fsS http://localhost:8080/health || exit 1 +ENTRYPOINT ["dotnet", "Azaion.AdminApi.dll"] +``` + +## 4. `e2e-runner` — Dockerfile Specification + +Existing `e2e/Dockerfile` is sufficient for cycle 1; no changes proposed. + +| Property | Value | Notes | +|----------|-------|-------| +| Base image | `mcr.microsoft.com/dotnet/sdk:10.0` (build + run) | SDK is required because the runner invokes `dotnet test` | +| Stages | `build` → run | Multi-stage to discard sources from the final image | +| Working dir | `/test` | Matches `docker-compose.test.yml` | +| Output dir | `/test-results` | Bind-mounted to `./e2e/test-results` on the host | +| User | root (acceptable — short-lived, no network exposure, no persistence beyond `/test-results`) | Non-root not required for one-shot CI containers | +| Loggers | `console`, `trx`, `xunit` | Last one feeds Woodpecker's parser | +| Entrypoint | `dotnet test Azaion.E2E.dll …` | Already present | + +## 5. Local Development — `docker-compose.yml` + +> Currently the project does **not** ship a local-dev compose file. Local devs run the API via `dotnet run` against a host Postgres on port 4312. 
We add `docker-compose.yml` in Step 7 (scripts) so newcomers get a one-command bring-up. + +```yaml +# docker-compose.yml — planning artifact for Step 7 +services: + api: + build: + context: . + dockerfile: Dockerfile + args: + CI_COMMIT_SHA: dev + image: azaion/admin:dev-local + env_file: .env + depends_on: + db: + condition: service_healthy + ports: + - "8080:8080" + volumes: + - ./.dev/content:/app/Content + - ./.dev/logs:/app/logs + healthcheck: + test: ["CMD", "curl", "-fsS", "http://localhost:8080/health"] + interval: 15s + timeout: 5s + retries: 5 + start_period: 30s + networks: [azaion-net] + + db: + image: postgres:16-alpine + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + volumes: + - ./e2e/db-init/00_run_all.sh:/docker-entrypoint-initdb.d/00_run_all.sh:ro + - ./env/db:/docker-entrypoint-initdb.d/sql:ro + - dev-db:/var/lib/postgresql/data + ports: + - "4312:5432" # match local-dev convention; non-standard host port + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres -d postgres"] + interval: 5s + timeout: 5s + retries: 10 + start_period: 10s + networks: [azaion-net] + +volumes: + dev-db: + +networks: + azaion-net: + driver: bridge +``` + +Notes: + +- The DB schema and roles are bootstrapped from the same SQL files that the test-compose uses (`env/db/*.sql`), so `docker-compose.yml` and `docker-compose.test.yml` produce DB images with identical structure. +- `.dev/` is added to `.gitignore` and `.dockerignore` in Step 7. +- `db.ports` exposes `4312:5432` so a developer running the API outside Docker can still hit the same connection string defined in `.env`. + +## 6. Blackbox Test — `docker-compose.test.yml` (existing) + +The current file is already aligned with the Step 2 contract (`docker compose -f docker-compose.test.yml up --abort-on-container-exit --exit-code-from e2e-consumer`). 
Only one drift to log: + +| Drift | Description | Resolved In | +|-------|-------------|-------------| +| Drift H | `system-under-test.healthcheck` uses a raw bash TCP probe (`exec 3<>/dev/tcp/127.0.0.1/8080`). Once `/health` exists (Step 5), switch to the curl-based probe to actually test the application layer. | Step 5 + Step 7 | + +No structural change in cycle 1 — the file already brings up Postgres + SUT + e2e-runner on a private network and tears down on test exit. + +## 7. Image Tagging Strategy + +| Context | Tag format | Example | Notes | +|---------|------------|---------|-------| +| CI build (per push) | `$REGISTRY_HOST/$REGISTRY_IMAGE:${CI_COMMIT_BRANCH}-${TAG_SUFFIX}` | `docker.azaion.com/azaion/admin:dev-arm` | Existing convention from `.woodpecker/02-build-push.yml` | +| CI build (per push) — additional immutable tag | `$REGISTRY_HOST/$REGISTRY_IMAGE:${CI_COMMIT_SHA:0:12}-${TAG_SUFFIX}` | `docker.azaion.com/azaion/admin:a1b2c3d4e5f6-arm` | **NEW (Drift A resolution)** — gives every CI build an immutable tag the host scripts can pin | +| Production deploy | the SHA tag from above; never `latest` | `docker.azaion.com/azaion/admin:a1b2c3d4e5f6-arm` | Eliminates the host-pulls-`:latest` / CI-never-pushes-`:latest` mismatch | +| Local dev | `azaion/admin:dev-local` | — | Built by `docker-compose.yml`; never pushed | +| Multi-arch (future) | `:-amd` and `:-arm` (already matrix-prepared) | — | The Woodpecker matrix is wired; uncomment the `amd64` row when an amd agent is online | + +> Drift A resolution depends on a CI change (Step 3) and a script change (Step 7). The tag format itself is decided here. + +## 8. `.dockerignore` + +Existing `.dockerignore` is sufficient; no changes proposed in cycle 1. It already excludes `bin/`, `obj/`, `.env`, `.git`, IDE folders, `Dockerfile*`, and compose files. The only addition required by the new local-dev compose is `.dev/` — added in Step 7. 
+ +``` +.dev +**/.dockerignore +**/.env +**/.git +**/.gitignore +**/.project +**/.settings +**/.toolstarget +**/.vs +**/.vscode +**/.idea +**/*.*proj.user +**/*.dbmdl +**/*.jfm +**/azds.yaml +**/bin +**/charts +**/docker-compose* +**/Dockerfile* +**/node_modules +**/npm-debug.log +**/obj +**/secrets.dev.yaml +**/values.dev.yaml +LICENSE +README.md +``` + +## 9. Self-verification + +- [x] Every component has a Dockerfile specification (only Admin API ships; libraries explicitly excluded with rationale). +- [x] Multi-stage builds specified for every production image. +- [x] Non-root user planned for `admin-api` (Drift C closed in spec; code change in Step 7). +- [x] Health check defined for every long-running service (real `/health` planned in Step 5; TCP fallback documented for the interim). +- [x] `docker-compose.yml` covers all components + Postgres dependency. +- [x] `docker-compose.test.yml` already enables black-box testing; one observation logged (Drift H). +- [x] `.dockerignore` defined and reviewed (one addition planned: `.dev/`). + +## 10. Drifts Logged Here (carried forward) + +| ID | Severity | Description | Resolved In | +|----|----------|-------------|-------------| +| C | Medium | `Dockerfile` final stage runs as root → add `USER app` (UID 1654) | Step 7 | +| G | Low | Unused `docker.test/Dockerfile` placeholder | Step 7 (delete) | +| H | Low | `docker-compose.test.yml` health check is TCP-only; upgrade to `/health` once available | Step 5 + Step 7 | diff --git a/_docs/04_deploy/deploy_scripts.md b/_docs/04_deploy/deploy_scripts.md new file mode 100644 index 0000000..6c7db03 --- /dev/null +++ b/_docs/04_deploy/deploy_scripts.md @@ -0,0 +1,196 @@ +# Azaion Admin API — Deployment Scripts + +**Date**: 2026-05-13 · **Cycle**: 1 · **Status**: shipped (this is the only doc that matches concrete files in `scripts/` and `secrets/`). + +## 1. 
Overview + +| Script | Purpose | Location | +|--------|---------|----------| +| `deploy.sh` | Main orchestrator (pull → stop → start → health) | `scripts/deploy.sh` | +| `pull-images.sh` | `docker login` + `docker pull` the target image | `scripts/pull-images.sh` | +| `stop-services.sh` | Graceful stop + record rollback target | `scripts/stop-services.sh` | +| `start-services.sh` | `docker run` with the materialized env file and bind mounts | `scripts/start-services.sh` | +| `health-check.sh` | Poll `/health/ready` until 200 or timeout | `scripts/health-check.sh` | +| `smoke.sh` | 6 critical-path checks against the **public** URL | `scripts/smoke.sh` | +| `_lib.sh` | Shared logging + env-overlay helpers | `scripts/_lib.sh` (sourced, not executed) | +| `run-tests.sh` | Existing — runs the docker-compose test suite locally | `scripts/run-tests.sh` | +| `run-performance-tests.sh` | Existing — runs k6 against the test compose stack | `scripts/run-performance-tests.sh` | + +## 2. Prerequisites + +On the **deploy host**: + +| Requirement | Why | +|-------------|-----| +| Docker 24+ | `docker pull`, `docker run`, `--restart unless-stopped` | +| `sops` (≥ 3.8) | Decrypt `secrets/<env>.env` | +| `age` (≥ 1.1) | Backing crypto for sops | +| `curl` | Used by `health-check.sh` and `smoke.sh` | +| `jq` | Used by `smoke.sh` for JSON parsing | +| `/etc/azaion/age.key` (mode 0400) | Per-host age private key (see `secrets/README.md`) | + +On the **operator's machine** (only for `smoke.sh`): + +| Requirement | Why | +|-------------|-----| +| `curl`, `jq` | Same as host | +| Network access to the public URL | `BASE_URL` is the production / staging hostname | + +## 3. Environment Variables + +`scripts/_lib.sh` `load_env_overlay <env>` resolves variables in this order (later sources override earlier): + +1. `<repo-root>/.env` (if present — local-dev convenience; harmless on a prod host that has no `.env`) +2. `secrets/<env>.public.env` (committed plain text; loaded with `set -a`) +3.
`secrets/<env>.env` (sops-decrypted to a tempfile, sourced, tempfile deleted on exit) +4. The shell environment that invoked `deploy.sh` (operator overrides) + +The complete variable inventory is `.env.example` at the repo root. Variables specifically consumed by these scripts: + +| Variable | Required by | Source | Notes | +|----------|-------------|--------|-------| +| `ENV` | `deploy.sh` | operator shell | `staging` or `production` | +| `REGISTRY_HOST`, `REGISTRY_IMAGE`, `REGISTRY_TAG` | pull / start | public env / operator | tag is the `<sha12>-<suffix>` immutable tag from `.woodpecker/02-build-push.yml` | +| `REGISTRY_USER`, `REGISTRY_TOKEN` | pull | encrypted env | optional; if both missing, assumes `docker login` was done out-of-band | +| `DEPLOY_CONTAINER_NAME`, `DEPLOY_HOST_PORT`, `DEPLOY_HOST_CONTENT_DIR`, `DEPLOY_HOST_LOGS_DIR` | stop / start | public env | identical for staging and prod by default | +| `ASPNETCORE_ConnectionStrings__AzaionDb`, `__AzaionDbAdmin`, `JwtConfig__Secret` | start | encrypted env | the API fail-fast checks these on boot | +| `ASPNETCORE_ResourcesConfig__*`, `JwtConfig__{Issuer,Audience,Lifetime}` | start | public env (defaults from `appsettings.json`) | only override if the env value differs from the appsettings default | +| `SOPS_AGE_KEY_FILE` | `_lib.sh` | host | defaults to `/etc/azaion/age.key` if unset | +| `SMOKE_ADMIN_EMAIL`, `SMOKE_ADMIN_PASSWORD` | `smoke.sh` | operator shell | dedicated smoke-test admin user; rotate as a regular admin password | + +## 4. Script details + +### `deploy.sh` + +**Usage**: + +```bash +ENV=staging ./scripts/deploy.sh <sha-tag> +ENV=production ./scripts/deploy.sh <sha-tag> +ENV=staging ./scripts/deploy.sh --rollback # uses scripts/.previous_tags.env +./scripts/deploy.sh --help +``` + +**Flow** (matches `_docs/04_deploy/deployment_procedures.md` §3 / §4): + +1. Validate `ENV` and required commands. +2. Load env overlay (public + sops-decrypted). +3.
If `--rollback`: read `scripts/.previous_tags.env` → set `SHA_TAG` to `PREVIOUS_SHA_TAG`. +4. `pull-images.sh` (login + pull). +5. `stop-services.sh` (records the SHA of whatever was running; graceful stop with `docker stop -t 40`; remove). +6. `start-services.sh` (`docker run --restart unless-stopped --env-file --publish $DEPLOY_HOST_PORT:8080`). +7. `health-check.sh` (poll `/health/ready` with timeout). +8. Print success line with the running revision. + +**Failure handling**: any non-zero exit from a sub-script aborts `deploy.sh` (because `set -euo pipefail` propagates). The previously-recorded SHA in `.previous_tags.env` is unchanged, so `--rollback` after a failed deploy targets the version that was running BEFORE the failed attempt. + +### `pull-images.sh` + +- `docker login` only when both `REGISTRY_USER` and `REGISTRY_TOKEN` are set; otherwise warns and continues (assumes pre-auth). +- `docker pull $REGISTRY_HOST/$REGISTRY_IMAGE:$REGISTRY_TAG`. +- Logs the resolved `RepoDigests[0]` to give the operator an immutable identifier in the deploy log. + +### `stop-services.sh` + +- Reads `org.opencontainers.image.revision` from the running container (label set by the Dockerfile). +- Writes `scripts/.previous_tags.env`: + ``` + PREVIOUS_SHA_TAG=- + PREVIOUS_REVISION= + RECORDED_AT= + ``` +- `docker stop -t 40` then `docker rm -f`. +- If the container does not exist, logs and exits 0 (idempotent — first deploy on a new host should succeed). + +### `start-services.sh` + +- Materializes a runtime env file by filtering the current shell environment with `grep '^(ASPNETCORE_|AZAION_)'`. Registry credentials and deploy-host plumbing variables stay on the host and never enter the container. +- `mkdir -p` for the bind-mounted `Content/` and `logs/` dirs (idempotent). +- `docker run --detach --name --restart unless-stopped --env-file --publish --volume`. +- Logs the container ID and the running revision. 
+ +### `health-check.sh` + +- One-shot check on `/health/live` first (3 s timeout). If this fails the container is wedged — fail fast. +- Polls `/health/ready` every `HEALTH_INTERVAL` (default 2 s) until 200 or `HEALTH_TIMEOUT` (default 60 s). +- Returns 0 on first 200; non-zero on timeout. + +### `smoke.sh` + +Six checks, each ≤ 10 s, against the public `BASE_URL`: + +1. `GET /health/live` (200) +2. `GET /health/ready` (200, best-effort — public URL may legitimately not expose this) +3. `POST /login` — extract JWT +4. `GET /users/current` (Bearer auth) +5. `GET /users` — count rows +6. `GET /resources/list` — sanity that filesystem-backed paths are reachable + +Smoke is intentionally lightweight; it does NOT exercise CRUD or detection-class endpoints (those are covered by E2E in CI). + +### `_lib.sh` + +Shared sourced library. Sourced via `. "$SCRIPT_DIR/_lib.sh"` from every script. NOT executable (lives at `scripts/_lib.sh` mode 0644). Contains: + +- `log_info` / `log_warn` / `log_error` / `die` +- `require_env ` / `require_cmd ` +- `load_env_overlay ` (the sops + age decryption pipeline) +- `container_exists`, `container_running`, `current_image_revision` + +## 5. Examples + +### First-ever staging deploy + +```bash +# On the staging host, as deploy operator: +cd /opt/azaion/admin # or wherever the repo is checked out +ENV=staging ./scripts/deploy.sh a1b2c3d4e5f6-arm +``` + +### Rolling back production after a bad deploy + +```bash +# Same host, immediately after the failed deploy: +ENV=production ./scripts/deploy.sh --rollback +``` + +### Running smoke from the operator workstation + +```bash +export BASE_URL=https://stage.admin.azaion.com +export SMOKE_ADMIN_EMAIL=ops-smoke@azaion.com +export SMOKE_ADMIN_PASSWORD=... # from the operator's password manager +./scripts/smoke.sh +``` + +### Local development against the dockerized stack + +The dev-time compose was deferred (Drift K-adjacent). 
Until it lands, run the API directly: + +```bash +# Postgres on host port 4312 (per env/db/00_install.sh) +dotnet run --project Azaion.AdminApi +``` + +## 6. Common script properties + +All scripts: + +- Use `#!/usr/bin/env bash` with `set -euo pipefail`. +- Support `--help` / `-h` for usage. +- Source `_lib.sh` for logging and env-overlay helpers. +- Are idempotent where possible (running `deploy.sh` twice with the same SHA tag is a no-op for `pull-images.sh`, recreates the container in `stop`/`start`, and re-checks health). +- Echo to stderr for log lines (so stdout from a sub-process can still be piped). + +## 7. What is NOT shipped in cycle 1 + +- Remote SSH wrapper. The deploy procedure assumes the operator runs the script on the target host. A `--remote $DEPLOY_HOST` mode is recorded as **Drift O** (carried forward). +- Slack notifications from inside the scripts. Notifications happen out-of-band per `_docs/04_deploy/observability.md` §5. +- Database migration step. Migrations are applied manually with `psql` per `_docs/04_deploy/environment_strategy.md` §4 (Drift J). + +## 8. Related artifacts + +- Postmortem template: `_docs/06_metrics/postmortem_template.md` +- Procedures: `_docs/04_deploy/deployment_procedures.md` +- Environment strategy: `_docs/04_deploy/environment_strategy.md` +- secrets/ folder onboarding: `secrets/README.md` diff --git a/_docs/04_deploy/deployment_procedures.md b/_docs/04_deploy/deployment_procedures.md new file mode 100644 index 0000000..fb7cd61 --- /dev/null +++ b/_docs/04_deploy/deployment_procedures.md @@ -0,0 +1,195 @@ +# Azaion Admin API — Deployment Procedures + +**Date**: 2026-05-13 · **Cycle**: 1 · **Status**: planning artifact (the executable scripts referenced here land in Step 7). + +## 1. Deployment Strategy + +**Pattern**: **stop-and-start with pre-pulled image** (single-container, single-host). + +**Rationale**: + +- Topology is one Docker host per environment running one `azaion.api` container behind Nginx. 
There is no orchestrator, no replica set, no load balancer beyond Nginx itself. +- Blue-green requires either two listening ports + Nginx switch, or two hosts. Cycle-1 budget does not include either. Recorded as **Drift N** for a future cycle. +- Rolling/canary is meaningless with one replica. +- The realistic SLO for cycle 1 is **brief (< 30 s) downtime per deploy**, mitigated by deploying in low-traffic windows. The procedure pre-pulls the image so the actual stop-start gap is the time it takes for the new container to clear `/health/ready`, not image-download time. + +**Zero-downtime in production**: not achieved in cycle 1. Documented and acknowledged. + +### Graceful Shutdown + +| Signal | Behavior | +|--------|----------| +| `SIGTERM` (`docker stop`) | ASP.NET Core stops accepting new requests, waits up to `HostOptions.ShutdownTimeout` for in-flight requests, then exits. | +| `ShutdownTimeout` | Set to **30 seconds** in `Program.cs` (`services.Configure<HostOptions>(o => o.ShutdownTimeout = TimeSpan.FromSeconds(30))`). | +| `docker stop` grace | Use `docker stop -t 40` so Docker waits 40 s before sending `SIGKILL`, leaving 10 s of headroom over the app's 30 s. | + +This wiring lands in Step 7 (Dockerfile + small `Program.cs` change). + +### Database Migration Ordering + +Conventions inherited from the Environment Strategy (§4 of `environment_strategy.md`): + +1. Apply the new `env/db/NN_*.sql` file **before** deploying the matching code. Because every migration is backward-compatible, the old container keeps working against the new schema. +2. After the deploy is healthy, optionally apply a follow-up `NN+1_*.sql` for cleanup (e.g., dropping a tombstone column once no code reads it). +3. Production migrations run on staging first and soak ≥ 24 h before promotion. +4. Migration is performed by the operator with `psql -h <host> -p 4312 -U azaion_superadmin -d azaion -f env/db/NN_xxx.sql`. Logged in the deploy ticket. + +## 2.
Health Checks + +These endpoints are introduced in Step 7 (anonymous, internal-only — see Observability §3.1 / §7). + +| Check | Type | Endpoint | Interval | Failure threshold | Action | +|-------|------|----------|----------|-------------------|--------| +| Docker liveness | HTTP GET (in-container, via `Dockerfile` `HEALTHCHECK`) | `/health/live` | 30 s | 3 consecutive | Docker marks container `unhealthy`; **does NOT auto-restart** in cycle 1 (the `--restart unless-stopped` policy in `start-services.sh` reacts to container exit, not to health status) | +| Nginx readiness | HTTP GET (upstream `health_check`) | `/health/ready` | 5 s | 3 consecutive | Nginx pulls upstream → 503 to clients (no silent traffic loss) | +| Deploy-script startup | HTTP GET (polling) | `/health/ready` | 2 s | up to 30 attempts (~60 s) | `scripts/deploy.sh` aborts and triggers rollback | + +### Health Check Response Contract + +| Endpoint | 200 condition | 5xx condition | Headers | +|----------|---------------|---------------|---------| +| `/health/live` | Process is responsive (always — short-circuits before any dependency call) | Never returns 5xx unless the process is wedged | `Cache-Control: no-store` | +| `/health/ready` | `SELECT 1` succeeds against both `AzaionDb` (reader) and `AzaionDbAdmin` (writer) within a 2 s timeout | Either DB query fails or times out → 503 | `Cache-Control: no-store` | + +`/health/ready` does NOT exercise the filesystem (`Content/`, `logs/`) — a transient `EACCES` there should not yank the upstream. It surfaces in metrics (`resource_upload_failures_total`) and alerts (Observability §5) instead. + +## 3. Staging Deployment + +Triggered manually by the operator from the staging host or from a Woodpecker manual workflow. + +``` +1. Pre-flight — operator on local machine + a. Confirm CI green for the target SHA on the `stage` branch. + b. Run `dotnet list package --vulnerable` against the target commit (CI does this too — local is a sanity check). + c.
Confirm any DB migration in env/db/ for this SHA has been reviewed. + +2. DB migration (if any) — operator SSH to staging host + psql -h localhost -p 4312 -U azaion_superadmin -d azaion -f env/db/NN_.sql + +3. Deploy — operator runs scripts/deploy.sh on staging host + ENV=staging ./scripts/deploy.sh + # script: docker pull → stop -t 40 → rm → run --env-file .env → poll /health/ready + +4. Verify — automatic in scripts/deploy.sh + - /health/ready returns 200 within 60 s + - Container `docker inspect` healthcheck status is `healthy` + - `docker logs --tail=80` contains no `Error` lines from the last 60 s + +5. Smoke tests — operator runs from local machine + BASE_URL=https://stage.admin.azaion.com ./scripts/smoke.sh + # 6 critical-path checks: /login (admin), GET /users (paginates), GET /classes, + # GET /resources/list, /health/ready, JWT lifecycle. + +6. Soak — observe dashboard for ≥ 24 h before promoting +``` + +If any step fails → §5 Rollback. + +## 4. Production Deployment + +``` +1. Approval — required: ops lead OR backend lead + - Reference the staging soak completion timestamp. + - Reference the cycle's deploy ticket (AZ-NNN) and CI run URL. + +2. Pre-deploy checks (operator on local machine) + [ ] Staging smoke tests passed (§3 step 5). + [ ] Staging soaked ≥ 24 h with no Critical/High alerts. + [ ] CI green for the same SHA on the `main` branch. + [ ] Image-scan report for the SHA shows zero High/Critical (Woodpecker artifact). + [ ] DB migration plan recorded in the deploy ticket. + [ ] Rollback target SHA is recorded (the SHA currently running in prod — `docker inspect azaion.api | jq -r '.[0].Config.Labels."org.opencontainers.image.revision"'`). + [ ] On-call engineer is reachable for the next 30 min. + +3. DB migration (if any) — operator SSH to prod host + psql -h localhost -p 4312 -U azaion_superadmin -d azaion -f env/db/NN_.sql + +4. Deploy — operator runs scripts/deploy.sh on prod host + ENV=production ./scripts/deploy.sh + +5. 
Verify — automatic + operator + - /health/ready returns 200 within 60 s. + - Container `docker inspect` healthcheck status `healthy`. + - Operator hits `/login` with admin creds and a known user list query. + +6. Monitor — operator observes dashboards for ≥ 15 minutes + - Error rate (5xx) stays < 1%. + - P95 latency stays within 2× cycle-1 baseline (66 ms /login, 305 ms /users). + - No Critical or High alerts fire. + +7. Finalize + - Update deploy ticket with start/stop timestamps and image SHA. + - Post `:white_check_mark: prod deploy: ` to Slack #azaion-ops. +``` + +## 5. Rollback Procedures + +### Trigger Criteria (any one) + +- `/health/ready` fails for ≥ 60 s after deploy. +- Error rate (5xx) > 5 % for 5 minutes within the 15-minute observation window. +- Any Critical alert fires within 15 minutes of deploy. +- Operator's manual call (e.g. business-impacting bug surfaced by smoke tests). + +### Rollback Steps (≤ 5 minutes) + +``` +1. Capture state — operator on the affected host + docker logs azaion.api --tail=500 > /var/log/azaion/rollback-$(date -u +%Y%m%dT%H%M%SZ).log + docker inspect azaion.api > /var/log/azaion/rollback-$(date -u +%Y%m%dT%H%M%SZ).inspect.json + +2. Re-deploy previous SHA — operator + ENV=production ./scripts/deploy.sh + # The SHA tag was recorded in step 2 of the deploy procedure. + +3. DB rollback (if a migration was applied this deploy) + - If reversible (drop column, drop index): run the agreed reverse SQL recorded in the deploy ticket. + - If irreversible (added column, table): leave the schema as-is — the previous code is backward-compatible (rule §1.3) so the extra schema is inert. + - If data was migrated destructively: STOP, escalate to backend lead. Restore from backup if necessary. + +4. Verify — same checks as deploy §5 +5. Notify — operator posts ":rotating_light: prod rollback: " to Slack #azaion-ops with the deploy ticket link. +6. Post-mortem — schedule within 24 hours; required artifact: timeline + root cause + prevention. 
+``` + +### Post-Mortem (required) + +Template lives in `_docs/06_metrics/postmortem_template.md` (added in Step 7). Required sections: + +- Timeline (UTC), with deploy SHA and rollback SHA. +- Root cause (one sentence + evidence link). +- Detection — how was it caught? Which alert? Which probe? Which user report? +- Repair — what fixed it? +- Prevention — concrete change (test, alert, procedure step) with an owner and a target date. + +## 6. Deployment Checklist (per release) + +Copy this into the deploy ticket; tick before flipping `prod`: + +``` +[ ] CI green on target SHA (01-test + 02-build-push, all matrix entries) +[ ] Image scan report: zero High/Critical CVEs (Woodpecker artifact) +[ ] Dependency audit (`dotnet list package --vulnerable`): zero High/Critical +[ ] Image SHA tag exists in registry: docker manifest inspect $REGISTRY_HOST/azaion/admin:-arm +[ ] DB migration (if any) reviewed by backend lead; rollback SQL recorded if reversible +[ ] secrets/staging.env / secrets/production.env decrypts cleanly on the target host +[ ] Health endpoints respond 200 in current production (sanity baseline) +[ ] Monitoring alerts armed (no silenced alerts that would mask the deploy) +[ ] Rollback target SHA recorded +[ ] Stakeholders notified (Slack #azaion-ops, expected window) +[ ] On-call engineer reachable for the next 30 min +``` + +## 7. Drifts Logged Here + +| ID | Severity | Description | Carried Forward | +|----|----------|-------------|-----------------| +| N (NEW) | Medium | No zero-downtime deploy strategy — single-container topology produces ~30 s gap per deploy | Future cycle: blue-green via dual ports + Nginx upstream switch | + +## 8. Self-verification + +- [x] Deployment strategy chosen (stop-and-start) with explicit rationale and acknowledgement that zero-downtime is deferred (Drift N). +- [x] Graceful-shutdown contract specified (`HostOptions.ShutdownTimeout` 30 s, `docker stop -t 40`). 
+- [x] Health checks defined (liveness, readiness, startup) with exact response contract and Cache-Control header. +- [x] Rollback trigger criteria + 6-step rollback procedure + post-mortem template requirement. +- [x] Deployment checklist complete (11 items) and explicitly references the SHA tag (Drift A resolution from Step 3). diff --git a/_docs/04_deploy/environment_strategy.md b/_docs/04_deploy/environment_strategy.md new file mode 100644 index 0000000..3c66c99 --- /dev/null +++ b/_docs/04_deploy/environment_strategy.md @@ -0,0 +1,127 @@ +# Azaion Admin API — Environment Strategy + +**Date**: 2026-05-13 · **Cycle**: 1 · **Status**: planning artifact (no scripts; concrete wiring lands in Step 7). + +## 1. Environments + +| Environment | Purpose | Infrastructure | Data Source | +|-------------|---------|----------------|-------------| +| **Development** | Local developer workflow on macOS / Linux. | Either bare `dotnet run` against host Postgres (port 4312) **or** the new `docker-compose.yml` planned in Step 2 (API + Postgres on a private Docker network). | Empty database; SQL files under `env/db/` create roles + schema; no fixtures. | +| **Test (CI)** | Black-box tests in CI and locally via `scripts/run-tests.sh`. | `docker-compose.test.yml` — API + Postgres + e2e-runner on a Docker network. | Functional fixtures from `e2e/db-init/00_run_all.sh` + `99_test_seed.sql`. | +| **Staging** | Pre-production validation. | Self-hosted Linux server, single Docker host, behind Nginx reverse proxy on `stage.admin.azaion.com`. Mirrors prod topology and Postgres major version. | Anonymized snapshot of production (PII scrubbed by an offline script before import). | +| **Production** | Live system. | Self-hosted Linux server, single Docker host, behind Nginx reverse proxy on `admin.azaion.com`. | Live data; daily off-host backups. | + +> Test is added as a first-class environment because cycle 1 already exercises it (`docker-compose.test.yml`).
The deploy template lists three; we list four to match reality. + +## 2. Environment Variables + +### Source of Truth + +The complete variable inventory lives in `.env.example` at the repo root (Step 1, 24 entries). This document does NOT duplicate that table — it only specifies, per environment, **where each variable is sourced**. + +### Per-environment sourcing + +| Variable group | Development | Test (CI) | Staging | Production | +|----------------|-------------|-----------|---------|------------| +| `ASPNETCORE_ENVIRONMENT` | `.env` (`Development`) | docker-compose `environment:` (`Development`) | docker-compose / `--env-file` (`Staging`) | docker-compose / `--env-file` (`Production`) | +| `ASPNETCORE_URLS` | `.env` | compose | host `.env` (rendered from sops) | host `.env` (rendered from sops) | +| `ConnectionStrings__*` | `.env` (real local creds) | compose (literal — accepted F-10) | **sops-encrypted file in git** → decrypted on host at deploy time | same as staging | +| `JwtConfig__Secret` | `.env` (dev-only literal) | compose (literal — accepted F-10) | **sops-encrypted** | **sops-encrypted** | +| `JwtConfig__{Issuer,Audience,Lifetime}` | appsettings defaults | appsettings defaults | host `.env` if non-default | host `.env` if non-default | +| `ResourcesConfig__*` | appsettings defaults | compose | host `.env` if non-default | host `.env` if non-default | +| `DEPLOY_*`, `REGISTRY_TAG` | `.env` (developer machine) | n/a | passed to `scripts/deploy.sh` from operator's shell or CI manual trigger | same | +| `REGISTRY_USER`, `REGISTRY_TOKEN` | empty in dev `.env` | Woodpecker secrets `registry_user` / `registry_token` | Woodpecker secrets (CI deploy) or operator's shell (manual deploy) | same | +| `CI_COMMIT_SHA` | unset → image label `unknown` | Woodpecker built-in | Woodpecker built-in | Woodpecker built-in | + +### Variable Validation (fail-fast) + +The Admin API already does this for the most security-critical variable: + +```csharp +var jwtConfig = 
builder.Configuration.GetSection(nameof(JwtConfig)).Get<JwtConfig>(); +if (jwtConfig == null || string.IsNullOrEmpty(jwtConfig.Secret)) + throw new Exception("Missing configuration section: JwtConfig"); +``` + +The deploy plan **adds** the same fail-fast check for connection strings during Step 7 wiring (a one-time `_ = configuration.GetConnectionString("AzaionDb") ?? throw …` plus the same for `AzaionDbAdmin`, executed during `WebApplication` build). Without the check, a missing variable currently surfaces only on the first DB call, which is too late. + +> Static / lookup-style variables (`ResourcesConfig__*`, `JwtConfig__{Issuer,Audience,Lifetime}`) keep their `appsettings.json` defaults in every environment unless an override is required. We do NOT add fail-fast checks for them. + +## 3. Secrets Management + +### Decision + +| Environment | Method | Tool | +|-------------|--------|------| +| Development | `.env` file | committed `.env.example` + per-developer `.env` (git-ignored) | +| Test (CI) | docker-compose `environment:` literals | accepted as test-only (security audit F-10) | +| Staging | git-tracked encrypted file | **sops + age** | +| Production | git-tracked encrypted file | **sops + age** | + +### Why sops + age (not Vault, not Woodpecker secrets, not hand-edited `.env`) + +Constraints: self-hosted, no cloud account, single ops engineer, currently hand-editing `.env` on the host. + +| Option | Pros | Cons | Verdict | +|--------|------|------|---------| +| sops + age (chosen) | Secrets versioned in git, encrypted at rest, decrypted on the host with a single age key. No new infra. Works offline. | Requires per-environment age keypair stored on the host outside git. Manual key rotation. | ✅ pragmatic for this team size and topology | +| HashiCorp Vault (self-hosted) | Dynamic DB creds, audit log, fine-grained ACL, KV v2. | Adds a service to operate, monitor, back up. Single-engineer ops budget cannot absorb it now.
| ⏳ revisit in a future cycle when ops capacity grows | +| Woodpecker secrets exported into runtime container | Reuses existing secret store. | Couples runtime config to CI; secrets are not visible/auditable outside Woodpecker UI; cannot run the container outside CI without manually exporting them. | ❌ leaks the CI/runtime boundary | +| Hand-edited host `.env` (status quo) | Zero new tooling. | No version history, no encryption, no review trail. Single point of failure if the file is lost; security audit can't track changes. | ❌ status quo we are leaving behind (Drift B) | + +### sops + age conventions for this repo + +``` +secrets/ +├── .sops.yaml # routes secrets/staging.env / production.env to the right age recipients +├── staging.env # SOPS-encrypted; safe to commit +└── production.env # SOPS-encrypted; safe to commit +``` + +- `.sops.yaml` declares two age recipients: `recipient_staging` and `recipient_production` (public keys). +- The matching age **private** keys live on each host at `/etc/azaion/age.key`, mode `0400`, owned by root. They are NEVER committed. +- `scripts/deploy.sh` (Step 7) runs `SOPS_AGE_KEY_FILE=/etc/azaion/age.key sops -d secrets/${env}.env > /tmp/azaion.env` and feeds it to `docker run --env-file`. +- All staging/production env values that are NOT secret (e.g. `DEPLOY_HOST_PORT`, `REGISTRY_TAG`) live in plain-text `secrets/staging.public.env` / `secrets/production.public.env` next to the encrypted file, also git-tracked. Loaded before the decrypted overlay. + +### Rotation policy + +| Secret | Rotation cadence | Procedure | +|--------|------------------|-----------| +| Postgres `azaion_admin` / `azaion_reader` passwords | every 90 days, on operator schedule | `ALTER ROLE … WITH PASSWORD …` → re-encrypt `production.env` → `scripts/deploy.sh` | +| JWT `JwtConfig__Secret` | every 180 days, AND on any suspected leak | re-encrypt → deploy. **All issued tokens become invalid** — communicate maintenance window. 
| +| `azaion_superadmin` password | every 365 days, AND on owner change | manual; not used by the running app, only by DB migrations | +| Registry `REGISTRY_TOKEN` | every 90 days OR on CI compromise | rotate registry credential → update Woodpecker secret `registry_token` → re-encrypt `production.env` if also referenced there | +| age private key (`/etc/azaion/age.key`) | every 365 days OR on host compromise | generate new key → add public recipient to `.sops.yaml` → `sops updatekeys secrets/*.env` → distribute new private key out-of-band → remove old recipient | + +## 4. Database Management + +| Environment | Type | Migrations | Data | Backup | +|-------------|------|------------|------|--------| +| Development | Local Postgres on host (port 4312) **or** dockerized Postgres from `docker-compose.yml` | `env/db/*.sql` applied manually by developer the first time, then `*_users_email_unique.sql`-style additive scripts run with `psql` on demand | empty | none | +| Test (CI) | Postgres 16-alpine from `docker-compose.test.yml` | `env/db/*.sql` mounted into `/docker-entrypoint-initdb.d/sql/`, ordered by `00_run_all.sh` | `99_test_seed.sql` (functional) + 500 perf users injected by `scripts/run-performance-tests.sh` when needed | none — `down -v` between runs | +| Staging | Same Postgres major (16) on the staging server, port 4312, `azaion` database | `env/db/*.sql` applied **manually under change control** via `psql -U azaion_superadmin`. 
New migrations land in the same numeric-prefix sequence (`07_*.sql`, `08_*.sql`, …) | anonymized prod snapshot, refreshed on demand | nightly `pg_dump` snapshot retained 14 days | +| Production | Same Postgres 16 on prod server | Same as staging; **migration must be applied to staging first**, observed for ≥ 24 h, then promoted to prod with operator approval | live | nightly `pg_dump` retained 30 days; weekly snapshot retained 12 weeks; off-host copy via `rsync` | + +### Migration rules (cycle 1) + +The project does NOT use an ORM migration framework (linq2db; restrictions.md). The conventions below replace it: + +1. **Numeric-prefix ordering** — every new migration is added as `env/db/NN_<description>.sql` where `NN` continues the existing sequence. The current sequence is `01..06`; the next is `07_*.sql`. +2. **Forward-only by default**. Reversibility is provided by the off-host backup, NOT by hand-written DOWN scripts. The existing files (`02_structure.sql`, `03_add_timestamp_columns.sql`, `04_detection_classes.sql`, `06_users_email_unique.sql`) follow this pattern; we keep it. +3. **Backward-compatible deploys** — every schema change must be safe to apply BEFORE the matching code is deployed (additive change → deploy code → cleanup change in a later release). The cycle 1 example: `06_users_email_unique.sql` was applied first; the `RegisterUser` change to translate SQLSTATE `23505` (unique violation) came after. AZ-197's `User.Hardware` column was kept as a tombstone instead of dropped, for the same reason. +4. **Production migrations need approval** — operator manually runs the SQL on prod after staging soak. No automatic CI execution against prod in cycle 1 (Drift J — automation is a future cycle's work). 
+ +### Drifts logged here + +| ID | Severity | Description | Resolved In | +|----|----------|-------------|-------------| +| B | Medium | No secret manager (status quo: hand-edited host `.env`) | **Resolved in spec** — sops + age (§3); concrete files + script in Step 7 | +| J | Low (NEW) | DB migrations applied manually on staging/prod; no automation | **Carried forward** to a future cycle | + +## 5. Self-verification + +- [x] Four environments (Dev, Test/CI, Staging, Production) defined with purpose, infrastructure, and data source. +- [x] Environment variable sourcing matrix references `.env.example` (Step 1) without duplicating it. +- [x] No literal secrets in this document — only variable names and tool names. +- [x] Secret manager chosen for staging/production (sops + age) with rotation policy. +- [x] Database strategy per environment, including the explicit no-ORM-migrations convention. diff --git a/_docs/04_deploy/observability.md b/_docs/04_deploy/observability.md new file mode 100644 index 0000000..942cf8b --- /dev/null +++ b/_docs/04_deploy/observability.md @@ -0,0 +1,204 @@ +# Azaion Admin API — Observability + +**Date**: 2026-05-13 · **Cycle**: 1 · **Status**: planning artifact (no code changes; concrete wiring lands in Step 7). + +## 1. 
Current State (audit) + +| Pillar | Today | Gap | +|--------|-------|-----| +| Logging | Serilog 4.1.0 → Console + rolling file `logs/log.txt` (daily); MinimumLevel `Information`; FromLogContext enrichment | No structured fields beyond defaults; one unstructured `LogInformation($"…")` in `ResourcesService.SaveResource` (security audit F-12); SQL trace bypasses Serilog (`Console.WriteLine`); no correlation IDs | +| Metrics | none | No `/metrics` endpoint; no system, app, or business metrics | +| Tracing | none | No OpenTelemetry, no W3C trace context | +| Health checks | none in code; `docker-compose.test.yml` uses raw TCP probe | No `/health` endpoint (Drift H from Step 2 + skill self-verification) | +| Alerting | none | No alerts wired to any channel | + +This step closes the planning gap; implementation lands incrementally — `/health` and structured logging in cycle 1 (Step 7), metrics + tracing in a later cycle (Drift K). + +## 2. Logging + +### 2.1 Format + +Structured JSON to **stdout/stderr only** in containers. The current rolling-file sink is **dropped from the production runtime** (and the `/app/logs` bind mount becomes optional) because: + +- Container logs should be collected by the platform, not the app. +- A bind-mounted file silently fills the host disk when log rotation lags. +- We currently have no log shipper, so logs already live only in `docker logs` for ops triage. + +The existing console sink stays. The file sink is kept ONLY in `Development` (gated by `ASPNETCORE_ENVIRONMENT`). 
+ +```json +{ + "timestamp": "2026-05-13T06:48:01.123Z", + "level": "Information", + "service": "azaion.admin-api", + "revision": "a1b2c3d4e5f6", + "correlation_id": "0HMU7…", + "user_id": null, + "message": "User registered", + "context": { + "endpoint": "POST /users", + "duration_ms": 47 + } +} +``` + +Achieved by adding `Serilog.Formatting.Compact.RenderedCompactJsonFormatter` to the console sink and three enrichers (note: the compact formatter emits `@t` / `@l` / `@m` keys, so the field names shown above additionally require a custom `ITextFormatter` or a rename at collection time): + +| Enricher | Source | Purpose | +|----------|--------|---------| +| `FromLogContext` | already present | scoped properties | +| `Serilog.Enrichers.Environment` (new) | `ENV` vars | `service`, `revision` (`AZAION_REVISION`) | +| `Serilog.AspNetCore.RequestLoggingOptions` (new) | ASP.NET pipeline | request `correlation_id` from `Activity.Current.TraceId` (or generated UUID v7 if no Activity) | + +### 2.2 Log Levels + +| Level | Usage | Examples in this codebase | +|-------|-------|---------------------------| +| `Error` | Unhandled exceptions, infra failures | DB connection failure, sops decrypt failure on host | +| `Warning` | Business exception caught | Existing `BusinessExceptionHandler` already does this — keep as-is | +| `Information` | Significant business events | Login, RegisterUser, RegisterDevice, role change, resource upload, detection-class CRUD | +| `Debug` | Diagnostic detail | Request/response payloads (dev only — never in production); query parameters | + +### 2.3 Retention + +| Environment | Destination | Retention | +|-------------|-------------|-----------| +| Development | console + `logs/log.txt` (rolling daily) | 7 daily files (set `retainedFileCountLimit: 7` explicitly; the Serilog file-sink default is 31) | +| Test (CI) | console (captured by Woodpecker UI) | 14 days (Woodpecker artifact retention) | +| Staging | container stdout → `journald` on the host | 7 days; `journalctl --vacuum-time=7d` cron | +| Production | container stdout → `journald` on the host | 30 days; `journalctl --vacuum-time=30d` cron | + +> A central log aggregator (Loki / OpenSearch) is **out of scope for cycle 1** — 
host `journald` is the entire pipeline. Recorded as **Drift L**. + +### 2.4 PII Rules + +| Rule | Implementation | +|------|----------------| +| Never log passwords | `LoginRequest.Password`, `RegisterUserRequest.Password`, `GetResourceRequest.Password`, the response body of `POST /devices` (plaintext one-shot password). Add a `[Serilog.Sensitive]`-style helper or a `Destructure.ByTransforming<T>(t => …)` per DTO. | +| Never log JWT tokens | Today the only logging on the `/login` path is done by `BusinessExceptionHandler` on failure, and that log line does not include the response body. Verify in Step 7 that no request-logger middleware logs response bodies. | +| Mask emails | Use last-4 + `@domain` form for INFO-level logs (`***1234@example.com`); full email allowed at DEBUG only. The `BusinessExceptionHandler` log line `"Caught BusinessException: {Message}"` may include emails embedded in messages — tightened in Step 7. | +| User IDs | `User.Id` is an opaque GUID — safe to log; use it instead of email in correlation. | + +## 3. Metrics + +### 3.1 Endpoint + +`GET /metrics` exposing Prometheus exposition format. Add via `prometheus-net.AspNetCore` 8.x (latest stable for .NET 10 baseline; verify version against the released NuGet package before wiring). + +> Exposure boundary: `/metrics` MUST NOT be reachable from the public CORS allow-list. The Nginx reverse proxy on `admin.azaion.com` will expose only `/login`, `/users*`, `/devices`, `/resources*`, `/classes*`, `/health`. `/metrics` and `/swagger` stay on the internal interface (separate Nginx server block bound to the management VLAN, OR `localhost`-only listener). 
+ +### 3.2 Metrics + +| Metric | Type | Source | Labels | +|--------|------|--------|--------| +| `http_requests_total` | Counter | ASP.NET request pipeline | `method`, `endpoint`, `status_code` | +| `http_request_duration_seconds` | Histogram | ASP.NET request pipeline | `method`, `endpoint` | +| `http_requests_in_progress` | Gauge | ASP.NET request pipeline | `method` | +| `db_command_duration_seconds` | Histogram | linq2db trace hook | `operation` (`select`/`insert`/`update`/`delete`) | +| `db_command_failures_total` | Counter | linq2db trace hook | `operation`, `sqlstate` | +| `auth_login_failures_total` | Counter | `AuthService.ValidateUser` exception path | `reason` (`unknown_user`, `bad_password`, `disabled`) | +| `business_exceptions_total` | Counter | `BusinessExceptionHandler` | `error_code` (the existing `ExceptionEnum`) | +| `resource_upload_bytes_total` | Counter | `ResourcesService.SaveResource` | `data_folder` | +| `resource_upload_failures_total` | Counter | same | `reason` | +| `resource_download_bytes_total` | Counter | `ResourcesService.GetEncryptedResource` | `data_folder` | +| `detection_classes_total` | Gauge | refresh on CRUD | none | +| `users_active_total` | Gauge | refresh on CRUD + on a 5-min timer | `role` | +| Process / runtime | (auto) | `prometheus-net.DotNetRuntime` | gen0/1/2 GC, JIT, threadpool, etc. | + +### 3.3 System Metrics + +CPU, RSS, file descriptors, network I/O — collected by **node-exporter** running on the host as a sibling container. The Admin API itself does NOT export host-level metrics. + +### 3.4 Business Metrics + +Mapped to the verified ACs in `_docs/02_document/tests/blackbox-tests.md`. Cycle-1 cut: `users_active_total` (AC-01..AC-12 user lifecycle) and `detection_classes_total` (AZ-513). Resource-related business metrics deferred until the resource flow is exercised by real users post-AZ-197. 
+ +### 3.5 Collection + +| Setting | Value | +|---------|-------| +| Scrape interval | 15s (set explicitly in `prometheus.yml`; Prometheus' built-in default is 1m) | +| Scrape source | `node-exporter` for host; the Admin API container for app metrics | +| Storage | local Prometheus on the host, retention 14 days (cycle 1 budget) | +| Visualization | local Grafana, operations + business dashboards (§6) | + +## 4. Distributed Tracing + +**Cycle 1**: scaffold only — produce a trace ID per request, propagate via `traceparent` (W3C), and emit it as the `correlation_id` field in JSON logs. **Do NOT yet** ship spans to a collector — there is no Jaeger / Tempo running, and the Admin API has no downstream services to trace into. Tracing pays back its cost when there's a chain to follow; cycle 1 has none. + +| Setting | Value | +|---------|-------| +| SDK | `OpenTelemetry.Extensions.Hosting` + `OpenTelemetry.Instrumentation.AspNetCore` | +| Propagation | W3C Trace Context (`traceparent`) — auto when `OpenTelemetry.Instrumentation.AspNetCore` is registered | +| Sampling | 100% in dev/staging, 10% in production (deferred — no exporter yet) | +| Span naming | `<service>.<operation>` — service `azaion.admin-api`, operation `<HTTP method> <route>` | +| Exporter | none in cycle 1 (logs only) | + +> Recorded as **Drift M** — wire a Tempo / Jaeger exporter once a downstream service exists. + +## 5. 
Alerting + +| Severity | Response time | Conditions for this service | Channel | +|----------|---------------|------------------------------|---------| +| Critical | 5 min | `up{job="admin-api"} == 0` for 1 min · `/health` fails for 2 min · `business_exceptions_total{error_code="DbFailure"}` rate > 1/s for 1 min | Slack `#azaion-ops` + on-call email (cycle 1 — PagerDuty deferred until on-call rotation exists) | +| High | 30 min | Error rate > 5% for 5 min (`http_requests_total{status_code=~"5.."}/total`) · P95 latency > 2× baseline for 10 min · `auth_login_failures_total` rate > 10/s for 1 min (possible brute force) | Slack `#azaion-ops` + email | +| Medium | 4 h | Host disk > 80% · `db_command_failures_total` rate > 0.1/s for 10 min · process RSS > 80% of container limit | Slack `#azaion-ops` | +| Low | Next business day | Deprecated package usage from `dotnet list package --deprecated` | Slack `#azaion-eng` | + +Baseline values (P95) come from the cycle-1 perf report: +- `/login` p95 ≈ 33 ms → high-latency alert at p95 > 66 ms for 10 min +- `/users` (500 users) p95 ≈ 152 ms → high-latency alert at p95 > 305 ms for 10 min + +Alert routing in cycle 1 is **inform-only** — no PagerDuty escalation, no auto-rollback. The deploy procedure (Step 6) documents the manual rollback path. + +## 6. 
Dashboards + +**Operations dashboard** (Grafana, single panel set; cycle 1): + +- Service `up` (admin-api, postgres, nginx) — stat panel +- HTTP request rate (req/s) by endpoint — time series +- HTTP error rate (% of 5xx) — time series with the High threshold band overlaid +- Latency P50 / P95 / P99 by endpoint — time series, P95 baseline reference line +- DB command rate + failure rate — time series +- Container CPU / RSS / FDs — time series (from node-exporter) +- Active alerts — table panel + +**Business dashboard** (cycle 1): + +- `users_active_total` by role — stat panel + sparkline +- `detection_classes_total` — stat panel +- `resource_upload_bytes_total` rate (1h window) — time series +- Login success/failure ratio (24h) — donut + +Dashboards stored as code in `monitoring/grafana/admin-api.json` (introduced in Step 7). + +## 7. Health Checks + +Add a `/health` Minimal API endpoint: + +| Probe | Endpoint | What it checks | Surface | +|-------|----------|----------------|---------| +| Liveness | `GET /health/live` | Process is responsive (always 200 unless the process is wedged) | Used by Docker `HEALTHCHECK` | +| Readiness | `GET /health/ready` | DB reader connection + DB admin connection (one-shot `SELECT 1` each, 2s timeout) | Used by Nginx upstream check + the deploy script (Step 6) post-deploy gate | + +Endpoints are anonymous (no JWT) but bound only to the management VLAN (or `localhost` listener) — same exposure rule as `/metrics`. + +> Failure mode: if the DB is unreachable for 30 s, `/health/ready` returns 503; Nginx pulls the upstream, returning 503 to clients (no silent traffic loss). The container itself stays running so a transient DB blip does not trigger Docker restart. + +## 8. 
Drifts Logged Here + +| ID | Severity | Description | Resolved In | +|----|----------|-------------|-------------| +| H | Low | `docker-compose.test.yml` health check is TCP-only; upgrade to `/health/live` once available | Step 7 | +| K | Medium (NEW) | Metrics + tracing not implemented in cycle 1; only the plan + `/health` ship | Future cycle | +| L | Low (NEW) | No central log aggregator; `journald` only | Future cycle | +| M | Low (NEW) | Tracing has no exporter (cycle 1 = trace IDs in logs only) | Future cycle when downstream services exist | + +## 9. Self-verification + +- [x] Structured JSON logging format defined with `timestamp`, `level`, `service`, `correlation_id`, `message`, `context`. +- [x] Metrics endpoint specified (`/metrics`, internal-only) with full app/system/business metric inventory. +- [x] OpenTelemetry tracing configured at the SDK level (cycle 1) with future exporter wiring (Drift M). +- [x] Alert severities with response times and channels defined; baselines tied to perf report numbers. +- [x] Dashboards defined for operations and business metrics. +- [x] PII exclusion rules cover passwords, JWTs, and email masking; refers to specific DTO field names. diff --git a/_docs/04_deploy/reports/deploy_status_report.md b/_docs/04_deploy/reports/deploy_status_report.md new file mode 100644 index 0000000..bfb83bc --- /dev/null +++ b/_docs/04_deploy/reports/deploy_status_report.md @@ -0,0 +1,122 @@ +# Azaion Admin API — Deployment Status Report + +**Date**: 2026-05-13 +**Cycle**: 1 +**Step**: Deploy / 1 — Status & Environment Setup +**Verdict**: **READY for planning** — no critical blockers; three medium drift items must be resolved before Steps 2–7 produce final artifacts. 
+ +## Deployment Readiness Summary + +| Aspect | Status | Notes | +|--------|--------|-------| +| Architecture defined | ✅ | `_docs/02_document/architecture.md` (§3 Deployment Model) | +| Component specs complete | ✅ | 5 components in `_docs/02_document/components/` | +| Infrastructure prerequisites met | ⚠️ Partial | Self-hosted Linux + private registry assumed; SSL/DNS not codified | +| External dependencies identified | ✅ | PostgreSQL (4312) + filesystem; no message bus, no CDN consumed by API | +| Cycle-1 changes integrated | ✅ | AZ-513 (`/classes`), AZ-196 (`/devices`), AZ-197 (HW removed); AZ-183 (OTA) reverted | +| Security audit signed off | ⚠️ PASS_WITH_WARNINGS | F-2 deferred to AZ-516; F-6 (root container) carried into Step 2 (containerization) | +| Performance test signed off | ✅ | NFT-PERF-01/04 PASS; NFT-PERF-02/03 obsolete (OTA reverted) | +| Blockers | 0 | 3 medium-priority drift items, listed below | + +## Component Status + +| Component | State | Docker-ready | Notes | +|-----------|-------|--------------|-------| +| 01 Data Layer | implemented + tested | n/a (library) | linq2db 5.4.1; entities `User`, `UserConfig`, `RoleEnum`, `DetectionClass`, `ExceptionEnum` | +| 02 User Management | implemented + tested | n/a (library) | `UserService`, `RegisterUser`/`RegisterDevice` consolidated post-F-3 | +| 03 Auth & Security | implemented + tested | n/a (library) | JWT bearer + per-user resource encryption (HW component removed AZ-197) | +| 04 Resource Management | implemented + tested | n/a (library) | Filesystem-backed; OTA paths deleted post-revert | +| 05 Admin API | implemented + tested | yes | Single deployable container (`Azaion.AdminApi`); composes the four libraries | + +> Only the Admin API ships as a runtime container. Libraries are linked into the API at build time. 
+ +## External Dependencies + +| Dependency | Type | Required For | Status | +|------------|------|--------------|--------| +| PostgreSQL 14+ (custom port 4312) | Database | All persistence | needs setup per env (`env/db/`) | +| Server filesystem (`Content/`, `logs/`) | Local I/O | Resource storage + Serilog rolling files | provisioned by host (bind mounts) | +| Docker Engine | Runtime | Container execution | required on `DEPLOY_HOST` | +| Nginx (reverse proxy) | TLS / routing | HTTPS termination, Host header | provisioned by `env/api/02-nginx-docker-registry.sh` | + +API has no outbound calls to external SaaS APIs (no SSRF surface). + +## Infrastructure Prerequisites + +| Prerequisite | Status | Action Needed | +|--------------|--------|---------------| +| Container registry | ⚠️ Two registries in flight | Drift A (below): consolidate `docker.azaion.com/api` ↔ `$REGISTRY_HOST/azaion/admin:branch-arm` | +| Cloud account | n/a | Self-hosted Linux server; no cloud account required | +| DNS configuration | ✅ | `admin.azaion.com` already in CORS allow-list | +| SSL certificates | ⚠️ Assumed at proxy | HTTPS not enforced in code (security audit F-13); document upstream chain in Step 6 | +| CI/CD platform | ✅ | Woodpecker CI on ARM64 (`.woodpecker/01-test.yml`, `02-build-push.yml`) | +| Secret manager | ❌ Not chosen | Drift B (below): Woodpecker secrets are used in CI; no manager for runtime container `.env` | +| Container user | ⚠️ root | Drift C: security audit F-6 — add `USER app` in Step 2 (containerization) | +| Health check endpoint | ❌ Missing | Step 5 (observability) — required for orchestration / load-balancer probes | + +## Deployment Drift / Blockers (planning inputs) + +| ID | Severity | Description | Resolved In | +|----|----------|-------------|-------------| +| Drift A | Medium | Image path in `deploy.cmd` / `env/api/start-container.sh` (`docker.azaion.com/api`) ≠ image path in `.woodpecker/02-build-push.yml` (`$REGISTRY_HOST/azaion/admin:branch-arm`). 
The host pulls `:latest`, the CI never pushes `:latest`. | Step 3 (CI/CD) + Step 7 (scripts) | +| Drift B | Medium | Production `.env` is hand-edited on the server. No secret manager, no rotation policy. | Step 4 (env strategy) — propose Vault / sops / SSM | +| Drift C | Medium | `Dockerfile` final stage runs as root (security audit F-6, AZ-518). | Step 2 (containerization) — add non-root `USER app` (UID 1654) | +| Drift D | Low | `.woodpecker/build-arm.yml` referenced in old docs but the actual files are `01-test.yml` + `02-build-push.yml`. | Step 3 (CI/CD) — refresh the doc | +| Drift E | Low | Performance script is run-on-demand (`scripts/run-performance-tests.sh`), not gated in CI. | Step 3 (CI/CD) — optional perf gate | +| Drift F | Low | No vulnerable-dep gate in CI (security audit recommendation 13). | Step 3 (CI/CD) — `dotnet list package --vulnerable` | + +> No **critical** blockers. The drifts are planning inputs for Steps 2–7; they do NOT block Step 1 from completing. + +## Required Environment Variables + +| Variable | Purpose | Required In | Default (Dev) | Source (Staging/Prod) | +|----------|---------|-------------|---------------|----------------------| +| `ASPNETCORE_ENVIRONMENT` | Selects appsettings overlay + Swagger gate | All | `Development` | Environment (`Production`) | +| `ASPNETCORE_URLS` | Kestrel bind address | Container | `http://+:8080` | Environment | +| `ASPNETCORE_ConnectionStrings__AzaionDb` | Reader DB connection (read-only role) | All | `Host=localhost;Port=4312;…;Username=azaion_reader` | Secret manager | +| `ASPNETCORE_ConnectionStrings__AzaionDbAdmin` | Admin DB connection (read/write role) | All | `Host=localhost;Port=4312;…;Username=azaion_admin` | Secret manager | +| `ASPNETCORE_JwtConfig__Secret` | HMAC-SHA256 signing key (≥ 32 bytes) | All | dev-only literal in `.env` | Secret manager | +| `ASPNETCORE_JwtConfig__Issuer` | JWT `iss` claim | All | `AzaionApi` (appsettings) | appsettings or env override | +| 
`ASPNETCORE_JwtConfig__Audience` | JWT `aud` claim | All | `Annotators/OrangePi/Admins` (appsettings) | appsettings or env override | +| `ASPNETCORE_JwtConfig__TokenLifetimeHours` | Token TTL | All | `4` (appsettings) | Environment | +| `ASPNETCORE_ResourcesConfig__ResourcesFolder` | File storage root | All | `Content` | Environment | +| `ASPNETCORE_ResourcesConfig__SuiteInstallerFolder` | Prod installer dir | All | `suite` | Environment | +| `ASPNETCORE_ResourcesConfig__SuiteStageInstallerFolder` | Stage installer dir | All | `suite-stage` | Environment | +| `CI_COMMIT_SHA` | Build-time label → `AZAION_REVISION` env in container | Build only | (unset → `unknown`) | Woodpecker `$CI_COMMIT_SHA` | +| `DEPLOY_HOST` | Remote target machine for `scripts/deploy.sh` | Deploy scripts | `admin.azaion.com` | Environment | +| `DEPLOY_SSH_USER` | SSH user on `DEPLOY_HOST` | Deploy scripts | `root` | Environment | +| `DEPLOY_CONTAINER_NAME` | Docker container name on host | Deploy scripts | `azaion.api` | Environment | +| `DEPLOY_HOST_PORT` | Published host port (mapped to container 8080) | Deploy scripts | `4000` | Environment | +| `DEPLOY_HOST_CONTENT_DIR` | Host bind mount for `Content/` | Deploy scripts | `/root/api/content` | Environment | +| `DEPLOY_HOST_LOGS_DIR` | Host bind mount for `logs/` | Deploy scripts | `/root/api/logs` | Environment | +| `REGISTRY_HOST` | Container registry hostname | CI + deploy scripts | `docker.azaion.com` | Environment / Woodpecker secret | +| `REGISTRY_IMAGE` | Image path inside registry | CI + deploy scripts | `azaion/admin` | Environment | +| `REGISTRY_TAG` | Image tag | Deploy scripts | `dev-arm` | Environment | +| `REGISTRY_USER` | Registry login user | CI + deploy scripts | (empty) | Woodpecker secret `registry_user` / Secret manager | +| `REGISTRY_TOKEN` | Registry login token/password | CI + deploy scripts | (empty) | Woodpecker secret `registry_token` / Secret manager | + +> All `ASPNETCORE_…` variables map to ASP.NET Core's 
`IConfiguration` via the standard `__` separator (e.g., `JwtConfig:Secret` ← `ASPNETCORE_JwtConfig__Secret`). The `ASPNETCORE_` prefix is this repo's naming convention: the default host builder strips it and exposes the remainder as a configuration key. It is not strictly required — the default `AddEnvironmentVariables()` provider also reads unprefixed variables (e.g. `JwtConfig__Secret`) — so confirm against `Program.cs` if the configuration pipeline is ever customized, and do not define both forms of the same key. + +## .env Files Created + +- `.env.example` — committed to VCS, contains all variable names with placeholder values and inline comments. +- `.env` — git-ignored (via existing `.gitignore` line `.env`), contains development defaults pointing at the local Postgres on port 4312 and a clearly-marked dev-only JWT secret. + +## Acceptance Checklist (Step 1 self-verification) + +- [x] All five components assessed for deployment readiness. +- [x] External dependencies catalogued (Postgres + filesystem only). +- [x] Infrastructure prerequisites identified, including 6 named drifts (A–F). +- [x] All required environment variables discovered (23 entries). +- [x] `.env.example` created with placeholders + comments. +- [x] `.env` created with safe local defaults (no real secrets). +- [x] `.gitignore` already excludes `.env` (line 10). +- [x] Status report written to `_docs/04_deploy/reports/deploy_status_report.md`. + +## Next Steps + +1. **User confirms** this report (BLOCKING gate at end of Step 1). +2. Step 2 (Containerization): consume Drift C (non-root `USER`) and the existing multi-stage Dockerfile as the baseline. +3. Step 3 (CI/CD): consume Drifts A, D, E, F and refresh the documented pipeline against the actual `01-test.yml` / `02-build-push.yml` files. +4. Step 4 (Environment Strategy): consume Drift B by proposing a secret manager option (e.g., HashiCorp Vault, sops-encrypted files in git, or Woodpecker secrets exported into the runtime container). +5. Steps 5–7 then layer observability, procedures, and scripts on top. 
diff --git a/_docs/05_security/dependency_scan.md b/_docs/05_security/dependency_scan.md new file mode 100644 index 0000000..1336d2c --- /dev/null +++ b/_docs/05_security/dependency_scan.md @@ -0,0 +1,58 @@ +# Dependency Scan + +**Date**: 2026-05-13 +**Scanner**: `dotnet list package --vulnerable --include-transitive` + `--deprecated` (NuGet metadata) plus manual cross-reference of pinned versions against published GitHub Security Advisories (GHSA). +**Sources used**: `api.nuget.org`, three private `pkgs.dev.azure.com/pwc-us-prism/_packaging/*` feeds. + +## Scope + +| Project | Vulnerable Packages | +|---------|---------------------| +| `Azaion.AdminApi` | none reported | +| `Azaion.Common` | none reported | +| `Azaion.Services` | none reported | +| `Azaion.Test` | none reported | +| `e2e/Azaion.E2E` | none reported | + +`dotnet list package --vulnerable --include-transitive` returned a clean result for every project against the configured feeds. No CVE-ranked findings. + +## Deprecated Packages + +| Project | Package | Version | Reason | Recommended | +|---------|---------|---------|--------|-------------| +| `Azaion.AdminApi` | `FluentValidation.AspNetCore` | 11.3.0 | Legacy (deprecated by maintainer) | Move validators to manual `ServiceCollectionExtensions.AddValidatorsFromAssembly(...)` registration; `FluentValidation` 11.10.0 (already in use elsewhere) is the supported core. The AspNetCore auto-DI helper is no longer maintained. | +| `Azaion.Services` | `System.IdentityModel.Tokens.Jwt` | 7.1.2 | Legacy (Microsoft pushes consumers to `Microsoft.IdentityModel.JsonWebTokens`) | Migrate to `Microsoft.IdentityModel.JsonWebTokens` (the modern token-handler stack already shipped via `Microsoft.AspNetCore.Authentication.JwtBearer 10.0.3`). | +| `Azaion.Test` | `xunit` | 2.9.2 | Legacy (`xunit.v3` is the new line) | Plan a migration to `xunit.v3` once it leaves prerelease. Not urgent — `xunit 2.x` still receives security backports. | + +Deprecated ≠ vulnerable. 
None of the three packages above carry an open CVE. They are flagged so we have a paper trail before they reach end-of-life. + +## Manual Advisory Cross-Reference + +The pinned top-level package list (output of `dotnet list package`) was cross-checked against GitHub Security Advisories for known issues NOT yet surfaced by NuGet metadata: + +| Package | Pinned | Advisory | Severity | Fix Version | Notes | +|---------|--------|----------|----------|-------------|-------| +| `Newtonsoft.Json` | **13.0.1** | GHSA-5crp-9r3c-p9vr (Improper Handling of Exceptional Conditions — DoS via deeply nested JSON) | **High** | **13.0.2 or higher** | Used transitively + directly across `Azaion.Common`, `Azaion.Services`. Untrusted JSON enters via `LoginRequest`, `RegisterUserRequest`, `GetUpdateRequest`, etc. — all of which deserialize via the ASP.NET Core minimal API stack. Even though minimal API uses `System.Text.Json` by default, the `Newtonsoft.Json` reference is reachable from logging payload formatting and from `ResourceColumnEncryption`-adjacent code paths. **Bump to 13.0.3 or later.** | +| `LazyCache.AspNetCore` | 2.4.0 | none open | — | — | Last release 2022; in maintenance mode. No advisory. | +| `Microsoft.AspNetCore.Authentication.JwtBearer` | 10.0.3 | none open | — | — | Latest .NET 10 line. | +| `Npgsql` | 10.0.1 | none open | — | — | Current. | +| `linq2db` | 5.4.1 | none open | — | — | Current. | +| `Swashbuckle.AspNetCore` | 10.1.4 | none open | — | — | Current. | +| `Serilog` family (`4.1.0` / sinks `6.0.0` / `8.0.0`) | varies | none open | — | — | Current. | +| `FluentAssertions` | 6.12.2 | n/a (test-only) | — | — | License changed in 8.0; staying on 6.x is fine. 
| + +## Findings + +### D-1: `Newtonsoft.Json 13.0.1` is below the patched line for GHSA-5crp-9r3c-p9vr (High) — **RESOLVED in cycle 1** + +- **Severity**: High (now closed) +- **CVE/Advisory**: GHSA-5crp-9r3c-p9vr (DoS via uncontrolled recursion when deserializing deeply nested JSON) +- **Location at time of finding**: top-level reference in `Azaion.Common.csproj`, `Azaion.Services.csproj` +- **Resolution (2026-05-13)**: bumped to **13.0.4** (current stable, released 2025-09-17) in both csproj files. `dotnet restore` + `dotnet build` succeeded. Full test suite re-ran clean: 48 e2e (Docker) + 2 unit. The 13.0.1 → 13.0.4 jump is patch-level on the same major; `JsonConvert.SerializeObject` / `DeserializeObject` API surface unchanged at the call sites (`AzaionDbSchemaHolder`, `BusinessExceptionHandler`, `SecurityTest`). +- **Notes**: NuGet's `--vulnerable` did not flag this on the configured feeds — likely because the GHSA → NuGet vulnerability index sync depends on advisory enrichment that hasn't propagated to all mirrors. Manual upgrade was warranted. **Review note (TODO confirm)**: the published advisory appears to list the affected range as `< 13.0.1` with 13.0.1 as the first patched version, which would equally explain the silent `--vulnerable` result; re-check the advisory page before citing this finding elsewhere. The bump to 13.0.4 is harmless either way. + +## Self-verification + +- [x] All package manifests scanned (5 csproj: 3 production + 1 unit-test + 1 e2e) +- [x] Each finding has a CVE/advisory reference +- [x] Upgrade paths identified for High findings diff --git a/_docs/05_security/infrastructure_review.md b/_docs/05_security/infrastructure_review.md new file mode 100644 index 0000000..c9a7467 --- /dev/null +++ b/_docs/05_security/infrastructure_review.md @@ -0,0 +1,102 @@ +# Infrastructure & Configuration Review + +**Date**: 2026-05-13 +**Scope**: `Dockerfile`, `docker.test/Dockerfile`, `e2e/Dockerfile`, `docker-compose.test.yml`, `appsettings*.json`, `env/db/*.sql`, `.gitignore`, `.dockerignore`. No GitHub Actions / GitLab CI / Azure Pipelines files (`.github/workflows/`, `.gitlab-ci.yml`, `azure-pipelines.yml`) are present in this workspace; the Woodpecker pipelines under `.woodpecker/` are not covered by this review. 
+ +## Container Security + +### Production image (`Dockerfile`) + +| Check | Result | Notes | +|-------|--------|-------| +| Non-root user | **FAIL** | No `USER` directive — runs as root. See F-6. | +| Minimal base image | PASS | `mcr.microsoft.com/dotnet/aspnet:10.0` (runtime-only) for the final stage; SDK image used only for build. | +| Multi-stage build | PASS | Build / publish / runtime stages cleanly separated. | +| No secrets in build args | PASS | Only `CI_COMMIT_SHA` (passed as `AZAION_REVISION` env at runtime) — non-sensitive. | +| Health check defined | PASS (compose layer) | `docker-compose.test.yml:42-51` defines a TCP health check; the production deployment must define an equivalent. **Not verified in this audit** because no production compose file exists in this workspace. | +| Image pinned by digest | **WARN** | Base images use `:10.0` tag, not `@sha256:...` digest. Tag floats — a poisoned upstream tag would be picked up on the next rebuild. Acceptable if the build runs from a controlled cache; otherwise pin. | +| `EXPOSE` matches Kestrel binding | PASS | `EXPOSE 8080`, `ASPNETCORE_URLS=http://+:8080`. | + +### Test sidecar image (`docker.test/Dockerfile`) + +``` +FROM alpine:latest +CMD echo hello +``` + +This image is essentially a no-op stub. It's referenced from somewhere in the test/CI tooling but contributes nothing functional. **Recommendation**: either remove it (if nothing references it) or document its purpose. From a security standpoint it's inert — `alpine:latest` is fine for an `echo` and the floating tag is irrelevant for a stub. Flag as **operational hygiene, not a security finding**. + +### E2E runner image (`e2e/Dockerfile`) + +| Check | Result | Notes | +|-------|--------|-------| +| Non-root | **FAIL** (test-only) | Same root-by-default behavior. Test runner only — no exposed network, runs in an ephemeral container per CI run. 
**Acceptable.** | +| Uses SDK image as final | **WARN** (test-only) | Final stage is `mcr.microsoft.com/dotnet/sdk:10.0` — needed for `dotnet test`. SDK images carry more attack surface than runtime images, but this container is only reachable on the internal `e2e-net` bridge. **Acceptable for test-only.** | + +## docker-compose.test.yml + +| Check | Result | Notes | +|-------|--------|-------| +| Secrets via env vars (not committed prod secrets) | PASS (test-only) | The DB password, JWT secret, and master key are committed because this is the e2e harness only. F-10 captures the rule: these literals must never appear in a production compose file. | +| No port leaks | PASS | Only `8080:8080` is published from `system-under-test`; `test-db` is internal to the bridge. | +| Healthcheck for the API | PASS | TCP probe on `127.0.0.1:8080`. | +| Network isolation | PASS | All three services share `e2e-net` only; no `host` network mode. | +| Image lock | **WARN** (test-only) | `postgres:16-alpine` floats. For test reproducibility consider pinning a digest, but not security-critical for an ephemeral test stack. | + +There is **no production `docker-compose.yml`** in this workspace. Any production deployment must: +- Inject `JwtConfig__Secret` and both `ConnectionStrings__*` values from a secret manager (Vault, AWS Secrets Manager, Azure Key Vault). (`ResourcesConfig__EncryptionMasterKey` was the third item here pre-revert; that field has since been deleted along with the OTA feature.) +- NOT carry over the `ASPNETCORE_ENVIRONMENT=Development` value used in the test compose — that environment value enables Swagger at the root path. + +## Environment Configuration + +### `Azaion.AdminApi/appsettings.json` + +| Field | Value committed | Risk | +|-------|----------------|------| +| `Logging.LogLevel.*` | `Information` / `Warning` | OK | +| `AllowedHosts` | `"*"` | OK at the framework level — host filtering is normally enforced upstream by the reverse proxy. 
| +| `ResourcesConfig.ResourcesFolder` | `"Content"` | Relative path; resolves under the working directory inside the container. OK. | +| ~~`ResourcesConfig.EncryptionMasterKey`~~ | — | **Removed in the post-cycle-1 revert** along with the OTA feature; field no longer exists in `ResourcesConfig`, `appsettings.json`, or `docker-compose.test.yml`. Closes F-5 automatically. | +| `JwtConfig.{Issuer, Audience, TokenLifetimeHours}` | committed | Public-by-design (not secrets). | +| `JwtConfig.Secret` | **NOT committed** | Correct. Supplied via env var. | +| `ConnectionStrings.*` | **NOT committed** | Correct. Supplied via env var. | + +### `Azaion.AdminApi/appsettings.Development.json` + +Empty except for log levels — fine. + +### `e2e/Azaion.E2E/appsettings.test.json` + +Test-only. F-10 captures this; not a production concern. + +## Secrets Hygiene + +| Pattern | Where | Disposition | +|---------|-------|-------------| +| Plaintext DB passwords | `env/db/01_permissions.sql` | F-11 — operator template, not runtime. Add a header comment. | +| Plaintext DB / JWT / master-key in `docker-compose.test.yml` | `docker-compose.test.yml:31-37` | F-10 — test-only. CI guard recommended. | +| Plaintext admin/uploader passwords in `e2e/Azaion.E2E/appsettings.test.json` | as above | F-10 — test-only. | +| `.env` ignored | `.gitignore:10` | PASS | +| `bin/`, `obj/`, `logs/`, `Content/` ignored | `.gitignore:2-3, 7, 9` | PASS — keeps build artifacts and runtime data out of git. | + +## CI/CD + +No CI configuration files were found in the workspace (`.github/workflows/`, `.gitlab-ci.yml`, `azure-pipelines.yml`, `Jenkinsfile` — none present). Either: +- CI is configured outside this repo (e.g., upstream meta-repo), in which case the security guardrails (`dotnet list package --vulnerable`, secret-scanning, dependency-review) need to be verified there. 
+- Or there is no automated CI today, in which case this is a meta-finding: the dependency-bump (D-1) and the test suite have no automated gate. **Recommend**: introduce at least a `dotnet build && dotnet test && dotnet list package --vulnerable` job before deploy. + +## Network Security + +| Check | Status | Notes | +|-------|--------|-------| +| HTTPS enforcement in code | **FAIL** | F-13 — assumed at reverse proxy. | +| HSTS | not configured | Acceptable if reverse proxy injects it. | +| CORS | tight | `AdminCorsPolicy` whitelist-only (`https://admin.azaion.com`, `http://admin.azaion.com`). The plaintext `http://` origin can be removed once the SaaS UI is HTTPS-only. | +| Security headers | not configured in app | `X-Frame-Options`, `X-Content-Type-Options`, `Content-Security-Policy` not set in code. Reverse proxy responsibility today; document the assumption. | + +## Self-verification + +- [x] All Dockerfiles reviewed (`Dockerfile`, `docker.test/Dockerfile`, `e2e/Dockerfile`) +- [x] All compose files reviewed (`docker-compose.test.yml` — only one in repo) +- [x] All environment / config files reviewed (3 `appsettings*.json`) +- [x] CI/CD reviewed (none present in repo — surfaced as meta-finding) diff --git a/_docs/05_security/owasp_review.md b/_docs/05_security/owasp_review.md new file mode 100644 index 0000000..68a4717 --- /dev/null +++ b/_docs/05_security/owasp_review.md @@ -0,0 +1,50 @@ +# OWASP Top 10 Review (2021 edition) + +**Date**: 2026-05-13 +**Framework**: [OWASP Top 10 — 2021](https://owasp.org/www-project-top-ten/) (the 2025 release is not yet finalized as of this audit; 2021 remains the current authoritative list). +**Scope cross-reference**: every FAIL below cites a Phase 2 finding ID (`F-N`) for the underlying evidence. 
+ +## Per-Category Assessment + +| # | Category | Status | Findings | +|---|----------|--------|----------| +| A01 | Broken Access Control | **FAIL** | F-2 (path traversal via `dataFolder`) — F-1 closed via OTA feature revert | +| A02 | Cryptographic Failures | PASS_WITH_WARNINGS | F-1 closed via revert; D-1 closed via Newtonsoft bump (13.0.4); F-7 (SHA-384 password hash, no salt/KDF) remains open as a hardening item | +| A03 | Injection | **PASS** | linq2db parameterizes all queries; no string-concatenation SQL paths found in `Azaion.Services/*Service.cs` or `Azaion.Common/Database/*`. No `Process.Start` / `subprocess` usage in production code. No template injection paths. | +| A04 | Insecure Design | PASS_WITH_WARNINGS | F-3 closed (UNIQUE INDEX `users_email_uidx` + `RegisterUser`/`RegisterDevice` consolidation); F-8 (no rate limiting on `/login`) remains as a hardening item | +| A05 | Security Misconfiguration | **FAIL** | F-6 (container runs as root), F-13 (no HTTPS enforcement in code), F-9 (request DTOs missing validators), F-11 (placeholder credentials in `01_permissions.sql`). F-5 closed automatically (`EncryptionMasterKey` field deleted with the OTA revert). | +| A06 | Vulnerable & Outdated Components | **PASS** | All `dotnet list package --vulnerable` checks return clean. D-1 (Newtonsoft.Json) was the only manual finding; closed in this audit by bumping to 13.0.4. Three deprecated-but-not-vulnerable packages noted in `dependency_scan.md`. | +| A07 | Identification & Authentication Failures | PASS_WITH_WARNINGS | F-3 closed (DB UNIQUE INDEX now enforces one-row-per-email). F-7 (weak password hashing) and F-8 (no rate limiting) remain open as hardening items. | +| A08 | Software & Data Integrity Failures | **PASS** | OTA flow that introduced the unsigned-manifest concern was reverted. CI/CD: secrets are env-injected, no in-repo secrets in `Dockerfile` / compose files used by prod. 
| +| A09 | Security Logging & Monitoring Failures | **PASS_WITH_WARNINGS** | Serilog console + rolling file sink configured. F-12 (one unstructured log line in `ResourcesService`). No security-event-specific logger — login successes/failures, role changes, deletes are not separately auditable. | +| A10 | Server-Side Request Forgery (SSRF) | **NOT_APPLICABLE** | The API never makes outbound HTTP calls based on user-controlled URLs. `CdnUrl` from `PublishResourceRequest` is stored and forwarded but never fetched server-side. | + +## Cross-Reference Against `security_approach.md` + +The pre-cycle-1 `security_approach.md` "Known Security Observations" list is reconciled here: + +| Original observation | Status post-cycle-1 | +|----------------------|---------------------| +| 1. SHA-384 without per-user salt | **Still open** — F-7 | +| 2. `hardware_hash` DB column unused | **Resolved by AZ-197** — column-level removal pending follow-up; field is now dead but the column is still in the schema (`02_structure.sql:9`). Not a security risk; cleanup task. | +| 3. No path traversal protection on `dataFolder` | **Still open** — F-2 | +| 4. Hardcoded DB credentials in test files | **Confirmed test-only** — F-10 | +| 5. No rate limiting on `/login` | **Still open** — F-8 | +| 6. No audit trail for security-relevant operations | **Still open** — A09 PASS_WITH_WARNINGS | +| 7. No HTTPS enforcement in code | **Still open** — F-13 | +| 8. Static encryption key salts hardcoded | **Partially resolved** — `Security.GetApiEncryptionKey` salt is still hardcoded but the AZ-197 removal of the `hwHash` component reduced surface area. (`ResourceColumnEncryption` was deleted along with the OTA revert.) | + +## Cycle-1 Specific Verdict + +The cycle-1 changes (AZ-513, AZ-196, AZ-183, AZ-197) introduced one new High-severity finding (F-1, on `/get-update`) and amplified one existing High (F-3, via `RegisterDevice`). 
Both were closed before any deploy: + +- **F-1 (resolved by feature revert)**: AZ-183 was reverted in full; the OTA delivery model itself is obsolete in the target architecture. +- **F-3 (resolved)**: `RegisterDevice` now delegates to `RegisterUser`; `users.email` has a UNIQUE INDEX (`users_email_uidx`); UNIQUE-violation is translated to `EmailExists`. + +Other cycle-1 endpoints (`/devices`, `/classes/*`) have correct authorization wiring (`apiAdminPolicy`). + +## Self-verification + +- [x] All current OWASP Top 10 categories assessed +- [x] Each FAIL has at least one specific finding with evidence (F-N reference) +- [x] NOT_APPLICABLE category has justification (A10) diff --git a/_docs/05_security/security_report.md b/_docs/05_security/security_report.md new file mode 100644 index 0000000..c086cbe --- /dev/null +++ b/_docs/05_security/security_report.md @@ -0,0 +1,120 @@ +# Security Audit Report + +**Date**: 2026-05-13 +**Scope**: Azaion Admin API workspace (`/Users/obezdienie001/dev/azaion/suite/admin`) — full audit triggered by autodev cycle 1 completion (AZ-513, AZ-196, AZ-183, AZ-197). +**Verdict**: **PASS_WITH_WARNINGS** (Critical: 0; High: 1 open — F-2 path traversal pre-existing; deferred to a separate ticket. F-1, F-3, D-1 closed in this audit.) + +## Summary + +| Severity | Count | Closed in this audit | +|----------|-------|----------------------| +| Critical | 0 | 0 | +| High | 4 | 3 — D-1 (`Newtonsoft.Json` 13.0.1 → 13.0.4), F-1 (OTA feature reverted — endpoints, service, entity, table, request DTOs, response DTO, e2e tests, master-key config field all deleted), F-3 (`RegisterDevice` reuses `RegisterUser`; UNIQUE INDEX `users_email_uidx` added) | +| Medium | 5 | 1 — F-5 (`EncryptionMasterKey` config field deleted along with the OTA feature) | +| Low | 5 | 0 | + +## Open / Deferred + +- **F-2** (path traversal via `dataFolder` route segment) remains open. It is pre-existing and unrelated to cycle-1 changes.
Recommended: file as a separate ticket and close in a focused refactor (the fix needs both validation and a deployment-time review of `Content/` permissions). Not a blocker on this cycle. + +## OWASP Top 10 (2021) Assessment + +| Category | Status | Findings | +|----------|--------|----------| +| A01 Broken Access Control | **FAIL** | F-2 (open) — F-1 closed via OTA feature revert | +| A02 Cryptographic Failures | PASS_WITH_WARNINGS | F-1 closed via revert; D-1 closed via Newtonsoft bump; F-7 (open, hardening) | +| A03 Injection | PASS | — | +| A04 Insecure Design | PASS_WITH_WARNINGS | F-3 closed; F-8 (open, hardening) | +| A05 Security Misconfiguration | **FAIL** | F-6, F-13, F-9, F-11 (open); F-5 closed automatically (`EncryptionMasterKey` field deleted with the OTA revert) | +| A06 Vulnerable Components | PASS | D-1 closed; 3 deprecated-but-not-vulnerable packages logged | +| A07 Auth Failures | PASS_WITH_WARNINGS | F-3 closed; F-7 + F-8 (open, hardening) | +| A08 Data Integrity Failures | PASS | OTA flow that introduced the unsigned-manifest concern was reverted | +| A09 Logging Failures | PASS_WITH_WARNINGS | F-12; no separate security audit log | +| A10 SSRF | NOT_APPLICABLE | API makes no outbound calls based on user URLs | + +## Findings (severity-ranked) + +| # | Severity | Category | Location | Title | +|---|----------|----------|----------|-------| +| F-1 | **High** (CLOSED via revert) | A01 / A02 | (deleted) `Program.cs` `/get-update`, `ResourceUpdateService.cs` | `/get-update` exposed plaintext `EncryptionKey` to any authenticated caller. **Resolution**: the entire OTA feature was deleted — endpoints, `IResourceUpdateService`, `ResourceColumnEncryption`, `Resource` entity, `resources` table, `EncryptionMasterKey` config field, `apiUploaderPolicy`, `ResourceUpdateTests.cs`. AZ-183 is reverted; the OTA delivery model itself is obsolete in the target architecture.
| +| F-2 | **High** (open) | A01 | `ResourcesService.cs:20-25` + `Program.cs:201,213,219,224` | Path traversal via `dataFolder` route segment (pre-existing) — deferred to a separate ticket | +| F-3 | **High** (CLOSED) | A04 / A07 | `env/db/06_users_email_unique.sql`, `UserService.cs` | `users.email` lacked UNIQUE → duplicate-row race in `RegisterUser` and `RegisterDevice`. **Resolution**: added migration `env/db/06_users_email_unique.sql` (`CREATE UNIQUE INDEX users_email_uidx ON public.users (email)`); refactored `RegisterUser` to drop the check-then-insert pattern and translate `Npgsql.PostgresException(SqlState=23505)` to `BusinessException(EmailExists)`; refactored `RegisterDevice` to delegate the row insert to `RegisterUser`. | +| D-1 | **High** (CLOSED) | A06 | `Azaion.Common.csproj`, `Azaion.Services.csproj` | `Newtonsoft.Json 13.0.1` < patched line for GHSA-5crp-9r3c-p9vr — **bumped to 13.0.4** | +| F-4 | Medium | A02 | `Program.cs:158-162` | `/devices` returns plaintext device password (accepted by design — needs `Cache-Control: no-store` and Swagger trim) | +| F-5 | Medium (CLOSED via revert) | A05 | (deleted) `ResourceUpdateService.cs:86-97` | `EncryptionMasterKey` validated lazily on first call instead of at startup. **Resolution**: the config field and its lazy getter were deleted along with the OTA feature (see F-1), so the surface no longer exists. | +| F-6 | Medium | A05 | `Dockerfile:1,20-25` | API container runs as root | +| F-7 | Medium | A02 / A07 | `Security.cs:11-12` | SHA-384 password hashing without per-user salt or KDF (pre-existing) | +| F-8 | Medium | A04 / A07 | `Program.cs:137-143` | No rate limiting on `/login` (pre-existing) | +| F-9 | Low | A05 | `LoginRequest.cs`, `SetUserQueueOffsetsRequest.cs` | DTOs lack `AbstractValidator` | +| F-10 | Low | — | `docker-compose.test.yml`, `appsettings.test.json` | Hardcoded credentials/JWT secret in test fixtures (test-only, accepted) | +| F-11 | Low | A05 | `env/db/01_permissions.sql:2,7,12` | Placeholder DB passwords as setup template — needs header comment | +| F-12 | Low | A09 | `ResourcesService.cs:63` | Unstructured `LogInformation($"...")` defeats
Serilog property capture | +| F-13 | Low | A02 | `Program.cs` | No HTTPS enforcement / HSTS in code (assumed at reverse proxy) | + +Detailed evidence and remediation steps for each finding are in `static_analysis.md` (F-N) and `dependency_scan.md` (D-N). + +## Cycle-1 Specific Verdict + +The four cycle-1 tasks introduced one **new** High finding (F-1) and amplified one pre-existing High (F-3). Both were closed before deploy: + +- **F-1 (resolved by revert)**: `/get-update` and `/resources/publish` are deleted; the entire OTA feature (AZ-183) is reverted. The user assessment was that the feature is itself a leftover from the installer-shipping era and is no longer needed in the target architecture (browser-only SaaS + fTPM-secured Jetsons). +- **F-3 (resolved)**: `RegisterDevice` now delegates to `RegisterUser`; `users.email` has a UNIQUE INDEX; `RegisterUser` translates UNIQUE-violation to `EmailExists`. The duplicate-row race is closed atomically. +- AZ-513 (`/classes` CRUD) and AZ-197 (hardware removal) introduce no new security findings. AZ-197 closes the prior `Hardware Fingerprint Binding` section of `security_approach.md` — the corresponding code, error codes, and DTOs are all gone. + +## Dependency Vulnerabilities + +| Package | CVE / Advisory | Severity | Status | +|---------|---------------|----------|--------| +| `Newtonsoft.Json` 13.0.1 → 13.0.4 | GHSA-5crp-9r3c-p9vr (DoS via deeply nested JSON) | High | **Closed in this audit** | + +`dotnet list package --vulnerable` returns clean for all five projects against the configured NuGet feeds. Three deprecated-but-not-vulnerable packages are tracked as forward-looking hygiene items in `dependency_scan.md` (`FluentValidation.AspNetCore` 11.3.0, `System.IdentityModel.Tokens.Jwt` 7.1.2, `xunit` 2.x). + +## Recommendations + +### Closed in this audit +- **F-1**: OTA feature reverted in full (see "Findings" above). 
+- **F-3**: UNIQUE INDEX added; `RegisterUser`/`RegisterDevice` consolidated; UNIQUE-violation translated to `EmailExists`. +- **D-1**: `Newtonsoft.Json` bumped to 13.0.4. + +### Immediate (Critical / High — open) + +1. **F-2** (deferred): Add `dataFolder` validation (`[A-Za-z0-9_-]+` only, plus `Path.GetFullPath(combined).StartsWith(...)` post-check) in `ResourcesService.GetResourceFolder`. Pre-existing finding; file as a separate ticket. + +### Short-term (Medium) + +4. **F-4**: Add `Cache-Control: no-store, no-cache` to the `/devices` response and document the operator runbook entry (no body logging at the reverse proxy). +5. **F-5**: No action required — `ResourcesConfig.EncryptionMasterKey` was deleted along with the OTA feature, so the original recommendation (validate the key at startup in non-Development environments) no longer applies. +6. **F-6**: Add `USER app` to the final stage of `Dockerfile`; verify `Content/` and `logs/` are writable to `app` (UID 1654). +7. **F-7**: Migrate password hashing to Argon2id with per-user salt (rolling rehash on next login). +8. **F-8**: Enable ASP.NET Core 10 rate limiter on `/login` (e.g., 10 req/IP/min). + +### Long-term (Low / hardening) + +9. **F-9**: Add validators for `LoginRequest`, `SetUserQueueOffsetsRequest`. +10. **F-11**: Add a header comment to `env/db/01_permissions.sql` flagging it as a template, or rename to `*.example.sql`. +11. **F-12**: Convert the one unstructured log line in `ResourcesService.SaveResource` to structured form. +12. **F-13**: Document the upstream HTTPS / HSTS / security-header chain in a deployment runbook; consider `app.UseHsts()` once the chain is documented. +13. **CI gate**: introduce a build that runs `dotnet build && dotnet test && dotnet list package --vulnerable` and fails the pipeline on any vulnerability finding. + +## Verdict Logic + +- **PASS_WITH_WARNINGS** because exactly one pre-existing **High** finding remains open (F-2 path traversal). The two cycle-1-attributable Highs (F-1 regression, F-3 amplified) and the dependency High (D-1) were all closed during the audit.
+ +## Tracker Follow-Ups + +The following items should be filed as separate Jira tasks in the AZ project (per `.cursor/rules/tracker.mdc`). Do not write to the tracker as part of this audit — surface them to the user for prioritization first. + +| Ticket | Title | Points | +|--------|-------|--------| +| [AZ-516](https://denyspopov.atlassian.net/browse/AZ-516) | F-2: Sanitize `dataFolder` route segment to prevent path traversal | 3 | +| [AZ-517](https://denyspopov.atlassian.net/browse/AZ-517) | F-4: Harden `/devices` response (Cache-Control, runbook) | 2 | +| [AZ-518](https://denyspopov.atlassian.net/browse/AZ-518) | F-6: Run admin API container as non-root | 2 | +| [AZ-519](https://denyspopov.atlassian.net/browse/AZ-519) | F-7: Migrate password hashing to Argon2id with per-user salt | 5 | +| [AZ-520](https://denyspopov.atlassian.net/browse/AZ-520) | F-8: Add rate limiting to `/login` endpoint | 2 | +| [AZ-521](https://denyspopov.atlassian.net/browse/AZ-521) | Low-severity security hygiene bundle (F-9, F-11, F-12, F-13) | 3 | + +**Closed in cycle 1 (no ticket needed)**: +- F-1 — OTA feature deleted end-to-end (AZ-183 reverted). +- F-3 — UNIQUE INDEX migration `env/db/06_users_email_unique.sql` + `RegisterUser`/`RegisterDevice` consolidation. +- D-1 — `Newtonsoft.Json` bumped to 13.0.4. +- F-5 — `EncryptionMasterKey` config field deleted along with the OTA feature; the lazy-validation surface no longer exists. diff --git a/_docs/05_security/static_analysis.md b/_docs/05_security/static_analysis.md new file mode 100644 index 0000000..a5a178e --- /dev/null +++ b/_docs/05_security/static_analysis.md @@ -0,0 +1,146 @@ +# Static Analysis (SAST) + +**Date**: 2026-05-13 +**Method**: targeted code review of the cycle-1 surface (`/devices`, `/classes` CRUD, `/get-update`, `/resources/publish`) plus regression sweep over the pre-existing endpoints `/login`, `/users/*`, `/resources/*`. 
No automated SAST tool was run — all findings are manually identified with file/line evidence. + +**Post-audit status (2026-05-13)**: F-1 closed via revert of AZ-183 (OTA feature deleted end-to-end); F-3 closed via UNIQUE INDEX `users_email_uidx` + `RegisterUser`/`RegisterDevice` consolidation; F-5 closed automatically as a consequence of F-1; D-1 closed via `Newtonsoft.Json` 13.0.4 bump. F-2 (path traversal) remains open — pre-existing, deferred to a separate ticket. + +## Findings + +### F-1: `/get-update` exposes per-resource plaintext `EncryptionKey` to any authenticated caller (HIGH — cycle-1 regression — **CLOSED via revert**) + +- **Severity**: High +- **Status**: **CLOSED** (2026-05-13) — the entire OTA feature was deleted: `/get-update` and `/resources/publish` endpoints, `IResourceUpdateService` / `ResourceUpdateService` / `ResourceColumnEncryption`, the `Resource` entity, the `resources` table, the `apiUploaderPolicy` policy, the `ResourcesConfig.EncryptionMasterKey` config field, and the `e2e/Azaion.E2E/Tests/ResourceUpdateTests.cs` test class. AZ-183 is reverted; the OTA delivery model is itself obsolete in the target architecture (browser-only SaaS + fTPM-secured Jetsons). +- **Category**: Broken Access Control / Cryptographic Failures +- **Original locations** (now deleted): + - `Azaion.AdminApi/Program.cs:301-312` — endpoint registration used `.RequireAuthorization()` (any logged-in user). + - `Azaion.Services/ResourceUpdateService.cs:39-48` — every `ResourceUpdateItem` returned to the caller contained `EncryptionKey = ResourceColumnEncryption.Decrypt(resource.EncryptionKey, MasterKey)`. +- **Description**: The `resources.encryption_key` column was encrypted at rest with `ResourcesConfig.EncryptionMasterKey` precisely to mitigate DB compromise. But the application API decrypted and serialized that key into the HTTP response for any caller holding a valid JWT. 
A low-privilege user could submit `POST /get-update {Architecture, DevStage, CurrentVersions: {}}` and receive — for every published resource — `(CdnUrl, Sha256, EncryptionKey)`. With `EncryptionKey + CdnUrl`, the attacker could pull the encrypted blob from the CDN and decrypt it locally. +- **Impact (had it shipped)**: Confidentiality of every published resource (firmware, model weights, installer payloads) reduced to "any authenticated session". +- **Resolution rationale**: rather than tightening the policy or filtering the response, the user assessment is that the OTA delivery model itself is no longer needed — the suite is now installed/updated through the browser. Removing the surface eliminates the vulnerability and reduces the attack surface; F-5 (lazy-loaded `EncryptionMasterKey`) is also closed automatically. + +### F-2: Path traversal via `dataFolder` route segment (HIGH — pre-existing, re-flagged) + +- **Severity**: High +- **Category**: Broken Access Control / Injection (Path) +- **Locations**: + - `Azaion.Services/ResourcesService.cs:20-25` — `Path.Combine(ResourcesFolder, dataFolder)` accepts `..` and absolute paths without validation. + - Consumed by `Program.cs:201, 213, 219, 224` — `/resources/{dataFolder?}` (any-auth upload), `/resources/list/{dataFolder?}` (any-auth read), `/resources/clear/{dataFolder?}` (admin), `/resources/get/{dataFolder?}` (any-auth read). +- **Description**: `Path.Combine("Content", "../../etc")` returns `Content/../../etc` unnormalized, which the filesystem resolves to `../etc` — outside the configured root. A non-admin caller can: + - List arbitrary directories via `/resources/list/../../`. + - Read encrypted contents of arbitrary files via `/resources/get/../../` provided they know a filename. + - Write into arbitrary directories the process can write to via `/resources/` upload. +- **Impact**: Server-side file disclosure and arbitrary file write bounded by process privileges.
Pre-existing — already noted in `_docs/00_problem/security_approach.md` "Known Security Observations" #3 — but the cycle-1 audit re-confirms it is unmitigated. +- **Remediation**: + - Sanitize `dataFolder`: reject any value containing `..`, `/`, `\`, or starting with a drive letter; alternatively, allow only `[A-Za-z0-9_-]+` segments. + - Verify that the resolved absolute path starts with the resolved `ResourcesFolder` absolute path — `Path.GetFullPath(combined).StartsWith(Path.GetFullPath(root))` — and reject otherwise. + +### F-3: `users.email` lacks a UNIQUE constraint — race in both `RegisterUser` and (cycle-1) `RegisterDevice` (HIGH — **CLOSED**) + +- **Severity**: High +- **Status**: **CLOSED** (2026-05-13). +- **Resolution**: + - Added migration `env/db/06_users_email_unique.sql` containing `CREATE UNIQUE INDEX IF NOT EXISTS users_email_uidx ON public.users (email);`. The migration is wired into `e2e/db-init/00_run_all.sh`. + - `Azaion.Services/UserService.cs` — `RegisterUser` no longer check-then-inserts. It catches `Npgsql.PostgresException` with `SqlState == PostgresErrorCodes.UniqueViolation` (23505) and rethrows as `BusinessException(EmailExists)`. The race is closed atomically by the index. + - `RegisterDevice` was refactored to delegate the row insert to `RegisterUser` (the user's explicit guidance: "reuse the code in the implementation RegisterDevice -> should call RegisterUser"). Two concurrent provisioning calls that race on the same serial now hit the UNIQUE INDEX and surface `BusinessException(EmailExists)`; the caller can retry. +- **Residual risk**: a Postgres sequence for device serials (`device_serial_seq`) would also remove the serial-allocation race window and avoid the retry. Out of scope for this audit fix; can be added as a follow-up. 
+ +### F-4: `/devices` returns plaintext device password in the JSON body (MEDIUM — accepted by design, hardening required) + +- **Severity**: Medium +- **Category**: Cryptographic Failures / Data Exposure +- **Locations**: + - `Azaion.AdminApi/Program.cs:158-162` + - `Azaion.Services/UserService.cs:84-89` (assembles `RegisterDeviceResponse`) +- **Description**: The endpoint deliberately returns the plaintext device password "exactly once" so the provisioning script can write it to `device.conf`. ApiAdmin-only, so abuse blast radius is bounded — but the password is reachable via: + - Reverse-proxy access logs that capture response bodies + - Browser DevTools / network history when triggered from the admin UI + - Swagger UI's "Try it out" response panel in any environment where Swagger is exposed (today: `IsDevelopment()` only — verified). +- **Impact**: Credentials may persist in unintended log sinks beyond their intended one-shot consumption. +- **Remediation**: + - Set response headers `Cache-Control: no-store, no-cache`, `Pragma: no-cache` on this endpoint specifically. + - Document an SRE runbook entry: do NOT enable response-body logging on the reverse proxy for `POST /devices`. + - Optional: add an `X-One-Shot-Credential: true` header so log scrubbers can match-and-mask. + +### F-5: `EncryptionMasterKey` validation is lazy — first failing request, not startup (MEDIUM — **CLOSED**) + +- **Severity**: Medium +- **Status**: **CLOSED** (2026-05-13) — `ResourcesConfig.EncryptionMasterKey` and the `ResourceUpdateService.MasterKey` getter were both deleted along with the OTA feature (see F-1). The lazy-validation surface no longer exists; `appsettings.json` and `docker-compose.test.yml` no longer reference the field. + +### F-6: API container runs as root (MEDIUM) + +- **Severity**: Medium +- **Category**: Security Misconfiguration +- **Location**: `Dockerfile:1, 20-25` +- **Description**: `mcr.microsoft.com/dotnet/aspnet:10.0` defaults to `root`. 
There is no `USER` directive after `FROM base AS final`. CIS Docker Benchmark §4.1 calls this out: a process running as root inside the container has more privileges than necessary, and a container escape (CVE-2024-21626 class) becomes a root-on-host exploit. +- **Impact**: Defense-in-depth weakness. No specific exploit, but failure mode is severe. +- **Remediation**: Add `USER app` to the final stage (the .NET 10 base image already provisions a non-root `app` user, UID 1654). The Content/log directory permissions need to be checked once the change is made. + +### F-7: SHA-384 password hashing without per-user salt or KDF (MEDIUM — pre-existing) + +- **Severity**: Medium +- **Category**: Cryptographic Failures +- **Locations**: + - `Azaion.Services/Security.cs:11-12` — `ToHash()` hashes raw UTF-8 bytes with SHA-384. + - `Azaion.Services/UserService.cs:44, 110, 78` — used for `RegisterUser`, `ValidateUser`, `RegisterDevice` (cycle-1 added the `RegisterDevice` call site). +- **Description**: SHA-384 is a fast cryptographic hash, not a password-hashing algorithm. No per-user salt, no work factor, no memory hardness. A leaked `password_hash` column lets an offline attacker grind ~10⁹ candidates per second per GPU. +- **Impact**: Database leak directly compromises all user passwords in tractable time. +- **Remediation**: Migrate to Argon2id (e.g., `Konscious.Security.Cryptography.Argon2`) or bcrypt (`BCrypt.Net-Next`) with per-user salt. Two-phase rollout: rehash on next successful login until the SHA-384 column is empty, then drop it. + +### F-8: No rate limiting on `/login` (MEDIUM — pre-existing) + +- **Severity**: Medium +- **Category**: Auth Failures +- **Location**: `Azaion.AdminApi/Program.cs:137-143` +- **Description**: Combined with F-7, an attacker who can reach `/login` can brute-force credentials. ASP.NET Core 10 ships `AddRateLimiter()` out of the box. 
+- **Remediation**: Add a fixed-window or sliding-window limiter scoped to `/login` (e.g., 10 requests / IP / minute, with exponential backoff). + +### F-9: `LoginRequest`, `SetUserQueueOffsetsRequest` lack server-side validation (LOW — pre-existing) + +- **Severity**: Low +- **Category**: Security Misconfiguration +- **Locations**: + - `Azaion.Common/Requests/LoginRequest.cs` — no validator class + - `Azaion.Common/Requests/SetUserQueueOffsetsRequest.cs` — no validator class +- **Description**: Other request DTOs use `AbstractValidator` (`RegisterUserValidator`, `GetUpdateValidator`, etc.). These two are unguarded — `LoginRequest` accepts any-length email/password, `SetUserQueueOffsetsRequest` accepts any email shape and any offsets payload. +- **Remediation**: Add validators with `EmailAddress()` + `MinimumLength(12)` (matching `RegisterUserValidator`) and bounds checks for `Offsets`. + +### F-10: Hardcoded credentials and JWT secret in test fixtures (LOW — accepted) + +- **Severity**: Low +- **Category**: Hardcoded Credentials +- **Locations**: + - `docker-compose.test.yml:31-33, 37` — DB credentials, JWT secret, encryption master key as compose env vars. + - `e2e/Azaion.E2E/appsettings.test.json:4-7` — `AdminPassword`, `UploaderPassword`, `JwtSecret`. +- **Description**: These are e2e-only and consistent across the harness. They are NOT used in production builds. Flagged here for visibility only — they MUST NEVER drift into the prod compose / appsettings. +- **Remediation**: Add a CI guard: fail the pipeline if any of these literals appear in `Azaion.AdminApi/appsettings.json` or `Azaion.AdminApi/appsettings.Production.json`. + +### F-11: `env/db/01_permissions.sql` ships placeholder DB passwords as a setup template (LOW) + +- **Severity**: Low +- **Category**: Hardcoded Credentials (template / docs) +- **Location**: `env/db/01_permissions.sql:2, 7, 12` — `superadmin-pass`, `admin-pass`, `readonly-pass`. 
+- **Description**: The file is the operator setup template. The e2e harness immediately overrides these with `test_password` (`e2e/db-init/99_test_seed.sql:1-2`), so the placeholders never reach a runtime. But the file lives at `env/db/` with no header comment marking it template-only. +- **Remediation**: Add a top-of-file comment `-- TEMPLATE: replace placeholder passwords before applying to any environment.` Consider renaming to `01_permissions.example.sql`. + +### F-12: Unstructured logging in `ResourcesService.SaveResource` (LOW) + +- **Severity**: Low +- **Category**: Logging Failures (operational) +- **Location**: `Azaion.Services/ResourcesService.cs:63` — `logger.LogInformation($"Resource {data.FileName} Saved Successfully")`. +- **Description**: String interpolation defeats Serilog's structured property capture; the `FileName` is not searchable as a field. Not a security issue, but flagged because the security-event-logging principle (audit trail) requires structured fields. +- **Remediation**: `logger.LogInformation("Resource {FileName} saved successfully", data.FileName);` + +### F-13: No HTTPS enforcement in application code (LOW — pre-existing, design) + +- **Severity**: Low +- **Category**: Cryptographic Failures +- **Location**: `Azaion.AdminApi/Program.cs` — no `app.UseHttpsRedirection()`, no `Hsts`. +- **Description**: HTTPS is assumed at the reverse proxy. Acceptable design choice if and only if the reverse proxy and its config are part of the secure boundary. +- **Remediation**: Document the assumption in a deployment runbook; consider `UseHsts()` when the upstream chain terminates TLS. 
+ +## Self-verification + +- [x] All source directories scanned (`Azaion.AdminApi/`, `Azaion.Services/`, `Azaion.Common/`, `env/db/`, `Dockerfile`) +- [x] Each finding has file path and (where relevant) line numbers +- [x] No false positives from test files or comments — test-fixture credentials (F-10) are explicitly framed as accepted-risk diff --git a/_docs/06_metrics/perf_2026-05-13_cycle1.md b/_docs/06_metrics/perf_2026-05-13_cycle1.md new file mode 100644 index 0000000..d329f14 --- /dev/null +++ b/_docs/06_metrics/perf_2026-05-13_cycle1.md @@ -0,0 +1,99 @@ +# Performance Test Report — Cycle 1 + +**Date**: 2026-05-13 +**Cycle**: 1 +**Verdict**: **PASS** — all thresholds met, 0% error rate. +**Runner**: k6 v2.0.0 (local) against `docker-compose.test.yml` (Postgres 16-alpine + .NET admin API), seeded with 500 perf users. +**Artifacts**: `scripts/perf-scenarios.js`, `scripts/run-performance-tests.sh`, raw JSON at `e2e/test-results/perf-summary.json`. + +## Scenarios run + +| ID | Scenario | Threshold | Observed (p95) | Verdict | +|----|----------|-----------|---------------:|---------| +| NFT-PERF-01 | Login (10 VUs, 30s) | p95 < 500ms · err < 1% | **33.4ms · 0%** | Pass (15× headroom) | +| NFT-PERF-04 | User list (10 VUs, 30s, 500 users seeded) | p95 < 1000ms · err < 1% | **152.5ms · 0%** | Pass (6.5× headroom) | + +## Scenarios skipped + +| ID | Scenario | Reason | +|----|----------|--------| +| NFT-PERF-02 | Encrypted resource download (small) | Endpoint deleted (AZ-183 OTA revert + AZ-197 hardware removal). Pruned from `_docs/02_document/tests/performance-tests.md`. | +| NFT-PERF-03 | Encrypted resource download (large) | Same — the OTA / hardware-bound download path no longer exists. 
| + +## Detailed metrics (full distribution) + +### NFT-PERF-01 — Login + +| Metric | Value | +|--------|------:| +| Iterations | 2 617 | +| min | 1.3 ms | +| median | 6.3 ms | +| avg | 13.7 ms | +| p90 | 18.6 ms | +| **p95** | **33.4 ms** | +| max | 630.0 ms (single outlier — first request after JIT/connection-pool warmup) | +| Error rate | 0.00% | +| Checks | 2 617 / 2 617 (status 200, token returned) | + +### NFT-PERF-04 — User list (500 users) + +| Metric | Value | +|--------|------:| +| Iterations | 1 944 | +| min | 3.1 ms | +| median | 12.0 ms | +| avg | 43.8 ms | +| p90 | 86.9 ms | +| **p95** | **152.5 ms** | +| max | 1 974.6 ms (cold-cache outlier) | +| Error rate | 0.00% | +| Checks | 1 944 / 1 944 (status 200, ≥ 500 users returned) | + +### Aggregate + +| Metric | Value | +|--------|------:| +| Total iterations | 4 561 | +| Total HTTP requests | 4 562 | +| Aggregate throughput | 65.1 req/s | +| Max VUs | 20 allocated (10 per scenario; the scenarios run sequentially, so at most 10 are active at once) | +| Run duration | ~70 s (incl. 5 s gap between scenarios) | + +## Threshold table + +All four `options.thresholds` entries returned `ok: true`: + +``` +http_req_duration{scenario:nft_perf_01_login} p(95)<500 → ok +http_req_duration{scenario:nft_perf_04_user_list} p(95)<1000 → ok +http_req_failed{scenario:nft_perf_01_login} rate<0.01 → ok (rate=0) +http_req_failed{scenario:nft_perf_04_user_list} rate<0.01 → ok (rate=0) +``` + +## Environment + +- Host: macOS 25.4.0 (Apple Silicon) +- Docker Desktop, single host, no resource throttling +- SUT: `system-under-test` container built from repo `Dockerfile`, running on `http://localhost:8080` +- DB: `test-db` (postgres:16-alpine), a separate container co-located on the same Docker host +- Seed: functional fixtures from `e2e/db-init/00_run_all.sh` + `99_test_seed.sql`, plus 500 dummy `perf-user-NNNNN@perf.azaion.com` rows inserted by `run-performance-tests.sh` after SUT readiness check +- k6 v2.0.0 (Homebrew bottle, arm64) + +## Caveats / coverage gaps + +1.
**Single-host run** — perf was measured with k6 and the SUT on the same machine, so there is no network RTT and no inter-AZ latency in these numbers. Production numbers will be higher; the 15×/6.5× headroom should absorb that comfortably for an internal admin API. +2. **No DB warmup phase** — the max values of both scenarios include cold-cache outliers (login max 630ms, user-list max ~2s). The p95 already excludes those, but a future iteration could add a 5–10s warmup ramp. +3. **No realistic load on the user-list filter path** — only the unfiltered `GET /users` is exercised. Adding a `?searchEmail=` variant would catch the case where the LinqToDB `WhereIf` fails to fold into the SQL. +4. **No `/classes` CRUD perf coverage** — cycle 1 added these endpoints (AZ-513) but the perf spec was not extended. Recommend adding an NFT-PERF-05 in the next test-spec sync. +5. **`acceptance_criteria.md` is stale post-cycle-1** — AC-13/14/15/16 (hardware binding) and AC-17–24 (resource management) reference deleted features. Step 12 (Test-Spec Sync) of cycle 1 missed this. Surface in Step 17 retro and clean up in cycle 2. + +## Recommendations for next cycle + +- **Cycle 2 test-spec sync must prune AC-13..24** and add an AC for `/classes` CRUD. +- **Add NFT-PERF-05** for `POST /classes` and `PATCH /classes/{id}` to cover the new write paths. +- **CI gate**: wire `scripts/run-performance-tests.sh` into the deploy pipeline so threshold breaches block release. Today it is run-on-demand only. + +## Verdict logic + +PASS — all thresholds met, no failed checks, no errors, no warn-band scenarios. Auto-chain to Step 16 (Deploy). diff --git a/_docs/06_metrics/postmortem_template.md b/_docs/06_metrics/postmortem_template.md new file mode 100644 index 0000000..846baad --- /dev/null +++ b/_docs/06_metrics/postmortem_template.md @@ -0,0 +1,98 @@ +# Production Incident Post-Mortem — Template + +**Save as**: `_docs/06_metrics/postmortem__.md` + +**Required**: every production rollback (per `_docs/04_deploy/deployment_procedures.md` §5).
+**Recommended**: any user-impacting incident even if no rollback was needed. +**Owner**: the on-call engineer at the time of the incident. +**Deadline**: within 24 hours of the incident. + +--- + +## Header + +| Field | Value | +|-------|-------| +| Incident date | YYYY-MM-DD | +| Detection time (UTC) | YYYY-MM-DDTHH:MM:SSZ | +| Mitigation time (UTC) | YYYY-MM-DDTHH:MM:SSZ | +| Duration (user-impacting) | mm:ss | +| Affected environment | staging / production | +| Detected by | alert / smoke test / user report / operator | +| Severity | Critical / High / Medium | +| Deploy SHA at incident start | `` | +| Rollback SHA (if rolled back) | `` | + +## Timeline (UTC) + +``` +HH:MM (source: alert / Slack / log file) +HH:MM +… +``` + +Be liberal with entries — every paging, every Slack message, every action taken. The point is to make the post-mortem reproducible without re-asking the operator. + +## Detection + +How was the issue first noticed? + +- Alert: which one? Was the threshold appropriate? Did it fire in time? +- User report: how did the user reach us? How long after the incident started? +- Smoke test: which step? (1–6 from `scripts/smoke.sh`) + +## Impact + +- User impact (number of failed requests, revenue, data loss — be specific) +- Internal impact (engineering time, lost productivity) +- Regulatory / compliance impact (if any) + +## Root cause + +One paragraph. Include the specific commit / config change / external event. Link to the failing test / log line that proves the cause. + +> Avoid "human error" as a root cause — it's almost never a useful answer. Focus on the system gap that allowed the human action to cause harm. + +## Repair + +- What action mitigated the user impact? (Rollback, config change, restart, etc.) +- What action fully resolved the issue? (Code fix, infrastructure change, etc.) +- Were there any side-effects of the repair? (Data loss, missed messages, etc.) + +## Detection gaps + +What would we want the system to have done instead? 
+ +- New alert(s) needed? With what threshold? +- New health check needed? At what level? +- Better dashboard panel? +- New smoke-test step? + +## Prevention + +| Owner | Action | Target date | +|-------|--------|-------------| +| @… | | YYYY-MM-DD | +| @… | … | YYYY-MM-DD | + +Each row MUST be tracked as a Jira ticket (per `.cursor/rules/tracker.mdc`). Reference the ticket here. + +## What went well + +(Resist the urge to skip this. Reinforces good habits.) + +- … + +## What was lucky + +(Not the same as "what went well". Things that worked but only because of fortunate timing or configuration that we didn't choose deliberately.) + +- … + +## Appendix: evidence links + +- Container logs: `/var/log/azaion/rollback-.log` +- Container inspect: `/var/log/azaion/rollback-.inspect.json` +- Grafana dashboard snapshot: +- Slack thread: +- Deploy ticket: diff --git a/_docs/_autodev_state.md b/_docs/_autodev_state.md index 23681c8..4243254 100644 --- a/_docs/_autodev_state.md +++ b/_docs/_autodev_state.md @@ -2,13 +2,13 @@ ## Current Step flow: existing-code -step: 10 -name: Implement -status: complete +step: 17 +name: Retrospective +status: not_started sub_step: - phase: 16 - name: tests-passed - detail: "cycle 1 done: AZ-513 + AZ-196 + AZ-183 + AZ-197 all in In Testing on Jira; full suite 48/48 e2e + 2/2 unit" + phase: 0 + name: awaiting-invocation + detail: "" retry_count: 0 cycle: 1 tracker: jira diff --git a/deploy.cmd b/deploy.cmd deleted file mode 100644 index 8cc12e0..0000000 --- a/deploy.cmd +++ /dev/null @@ -1,3 +0,0 @@ -docker build -t docker.azaion.com/api . 
-docker login docker.azaion.com -docker push docker.azaion.com/api \ No newline at end of file diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 5cf1113..8370ecc 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -34,17 +34,12 @@ services: ResourcesConfig__ResourcesFolder: "Content" ResourcesConfig__SuiteInstallerFolder: "suite" ResourcesConfig__SuiteStageInstallerFolder: "suite-stage" - ResourcesConfig__EncryptionMasterKey: "test-master-key-for-resources-table-do-not-use-in-prod" ports: - "8080:8080" volumes: - test-resources:/app/Content healthcheck: - test: - [ - "CMD-SHELL", - "/bin/bash -c 'exec 3<>/dev/tcp/127.0.0.1/8080'", - ] + test: ["CMD", "curl", "--fail", "--silent", "--show-error", "http://localhost:8080/health/live"] interval: 10s timeout: 5s retries: 8 diff --git a/docker.test/Dockerfile b/docker.test/Dockerfile deleted file mode 100644 index 72cdee7..0000000 --- a/docker.test/Dockerfile +++ /dev/null @@ -1,2 +0,0 @@ -FROM alpine:latest -CMD echo hello \ No newline at end of file diff --git a/e2e/Azaion.E2E/Tests/ResourceUpdateTests.cs b/e2e/Azaion.E2E/Tests/ResourceUpdateTests.cs deleted file mode 100644 index 932ca53..0000000 --- a/e2e/Azaion.E2E/Tests/ResourceUpdateTests.cs +++ /dev/null @@ -1,176 +0,0 @@ -using System.Net; -using System.Net.Http.Json; -using System.Text.Json; -using Azaion.E2E.Helpers; -using FluentAssertions; -using Xunit; - -namespace Azaion.E2E.Tests; - -[Collection("E2E")] -public sealed class ResourceUpdateTests -{ - private static readonly JsonSerializerOptions ResponseJsonOptions = new() - { - PropertyNameCaseInsensitive = true - }; - - private sealed record ResourceUpdateItemDto( - string ResourceName, - string Version, - string CdnUrl, - string Sha256, - string EncryptionKey, - long SizeBytes); - - private readonly TestFixture _fixture; - - public ResourceUpdateTests(TestFixture fixture) => _fixture = fixture; - - private static object PublishBody(string resourceName, string version, 
string arch = "arm64", - string stage = "stage", string encryptionKey = "test-resource-key-001") => new - { - resourceName, - devStage = stage, - architecture = arch, - version, - cdnUrl = $"https://cdn.example.com/{resourceName}-{version}.bin", - sha256 = "abc123def456789", - encryptionKey, - sizeBytes = 1024L - }; - - private async Task NewUploaderTokenAsync() - { - using var loginClient = _fixture.CreateApiClient(); - return await loginClient.LoginAsync(_fixture.UploaderEmail, _fixture.UploaderPassword); - } - - [Fact] - public async Task AC2_GetUpdate_returns_resources_newer_than_device_version() - { - // Arrange - var uploaderToken = await NewUploaderTokenAsync(); - using var uploaderClient = _fixture.CreateAuthenticatedClient(uploaderToken); - using var deviceClient = _fixture.CreateAuthenticatedClient(_fixture.AdminToken); - - var arch = "arm64"; - var stage = $"stage-{Guid.NewGuid():N}".Substring(0, 12); - var resourceName = $"annotations-{Guid.NewGuid():N}".Substring(0, 20); - - using var publish = await uploaderClient.PostAsync("/resources/publish", - PublishBody(resourceName, "2026-04-13", arch, stage, "device-key-AC2")); - publish.StatusCode.Should().Be(HttpStatusCode.OK); - - // Act - using var response = await deviceClient.PostAsync("/get-update", new - { - architecture = arch, - devStage = stage, - currentVersions = new Dictionary { [resourceName] = "2026-02-25" } - }); - - // Assert - response.StatusCode.Should().Be(HttpStatusCode.OK); - var items = await response.Content.ReadFromJsonAsync>(ResponseJsonOptions); - items.Should().NotBeNull(); - items!.Should().HaveCount(1); - items![0].ResourceName.Should().Be(resourceName); - items[0].Version.Should().Be("2026-04-13"); - items[0].CdnUrl.Should().Be($"https://cdn.example.com/{resourceName}-2026-04-13.bin"); - items[0].Sha256.Should().Be("abc123def456789"); - items[0].EncryptionKey.Should().Be("device-key-AC2", - "the column is AES-encrypted at rest but the response must contain plaintext for the 
device"); - items[0].SizeBytes.Should().Be(1024L); - } - - [Fact] - public async Task AC3_GetUpdate_returns_empty_when_device_already_has_latest() - { - // Arrange - var uploaderToken = await NewUploaderTokenAsync(); - using var uploaderClient = _fixture.CreateAuthenticatedClient(uploaderToken); - using var deviceClient = _fixture.CreateAuthenticatedClient(_fixture.AdminToken); - - var arch = "arm64"; - var stage = $"stage-{Guid.NewGuid():N}".Substring(0, 12); - var resourceName = $"weights-{Guid.NewGuid():N}".Substring(0, 20); - - using var publish = await uploaderClient.PostAsync("/resources/publish", - PublishBody(resourceName, "2026-04-13", arch, stage)); - publish.StatusCode.Should().Be(HttpStatusCode.OK); - - // Act - using var response = await deviceClient.PostAsync("/get-update", new - { - architecture = arch, - devStage = stage, - currentVersions = new Dictionary { [resourceName] = "2026-04-13" } - }); - - // Assert - response.StatusCode.Should().Be(HttpStatusCode.OK); - var items = await response.Content.ReadFromJsonAsync>(ResponseJsonOptions); - items.Should().NotBeNull(); - items!.Should().BeEmpty(); - } - - [Fact] - public async Task AC5_Cache_is_invalidated_on_publish() - { - // Arrange - var uploaderToken = await NewUploaderTokenAsync(); - using var uploaderClient = _fixture.CreateAuthenticatedClient(uploaderToken); - using var deviceClient = _fixture.CreateAuthenticatedClient(_fixture.AdminToken); - - var arch = "arm64"; - var stage = $"stage-{Guid.NewGuid():N}".Substring(0, 12); - var resourceName = $"models-{Guid.NewGuid():N}".Substring(0, 20); - - using var publishV1 = await uploaderClient.PostAsync("/resources/publish", - PublishBody(resourceName, "2026-02-25", arch, stage)); - publishV1.StatusCode.Should().Be(HttpStatusCode.OK); - - var deviceVersionsAtV1 = new { architecture = arch, devStage = stage, - currentVersions = new Dictionary { [resourceName] = "2026-02-25" } }; - - using (var primeCache = await deviceClient.PostAsync("/get-update", 
deviceVersionsAtV1)) - { - primeCache.StatusCode.Should().Be(HttpStatusCode.OK); - var primed = await primeCache.Content.ReadFromJsonAsync>(ResponseJsonOptions); - primed!.Should().BeEmpty(); - } - - // Act - using var publishV2 = await uploaderClient.PostAsync("/resources/publish", - PublishBody(resourceName, "2026-04-13", arch, stage)); - publishV2.StatusCode.Should().Be(HttpStatusCode.OK); - - using var afterPublish = await deviceClient.PostAsync("/get-update", deviceVersionsAtV1); - - // Assert - afterPublish.StatusCode.Should().Be(HttpStatusCode.OK); - var items = await afterPublish.Content.ReadFromJsonAsync>(ResponseJsonOptions); - items.Should().NotBeNull(); - items!.Should().HaveCount(1, "publish must invalidate the per-(arch,stage) latest-versions cache"); - items![0].Version.Should().Be("2026-04-13"); - } - - [Fact] - public async Task GetUpdate_without_jwt_returns_401() - { - // Arrange - using var client = _fixture.CreateApiClient(); - - // Act - using var response = await client.PostAsync("/get-update", new - { - architecture = "arm64", - devStage = "stage", - currentVersions = new Dictionary() - }); - - // Assert - response.StatusCode.Should().Be(HttpStatusCode.Unauthorized); - } -} diff --git a/e2e/db-init/00_run_all.sh b/e2e/db-init/00_run_all.sh index 04ad5c4..65a7af9 100755 --- a/e2e/db-init/00_run_all.sh +++ b/e2e/db-init/00_run_all.sh @@ -6,5 +6,5 @@ sed 's/^drop table users;/drop table if exists users;/' "$SQL_DIR/02_structure.s | psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d azaion psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d azaion -f "$SQL_DIR/03_add_timestamp_columns.sql" psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d azaion -f "$SQL_DIR/04_detection_classes.sql" -psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d azaion -f "$SQL_DIR/05_resources.sql" +psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d azaion -f "$SQL_DIR/06_users_email_unique.sql" psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -d azaion -f /opt/test-seed.sql diff --git 
a/env/db/05_resources.sql b/env/db/05_resources.sql deleted file mode 100644 index 70fcb1e..0000000 --- a/env/db/05_resources.sql +++ /dev/null @@ -1,24 +0,0 @@ --- Resources table — stores per-artifact metadata for fleet OTA updates. Populated by CI/CD --- via POST /resources/publish; queried by devices via POST /get-update. AZ-183. - -create table if not exists resources -( - id uuid primary key, - resource_name varchar(120) not null, - dev_stage varchar(40) not null, - architecture varchar(40) not null, - version varchar(40) not null, - cdn_url varchar(500) not null, - sha256 varchar(128) not null, - encryption_key text not null, -- AES-encrypted at rest with ResourcesConfig.EncryptionMasterKey - size_bytes bigint not null, - created_at timestamp not null default now() -); - --- Latest-version-per-resource lookups filter by (architecture, dev_stage); index supports --- both the in-memory cache miss path and the per-(arch,stage) GROUP BY. -create index if not exists resources_arch_stage_idx - on public.resources (architecture, dev_stage, resource_name, version); - -grant select, insert, update, delete on public.resources to azaion_admin; -grant select on public.resources to azaion_reader; diff --git a/env/db/06_users_email_unique.sql b/env/db/06_users_email_unique.sql new file mode 100644 index 0000000..bd30230 --- /dev/null +++ b/env/db/06_users_email_unique.sql @@ -0,0 +1,6 @@ +-- Enforce uniqueness on users.email. Closes the duplicate-row race in +-- UserService.RegisterUser / RegisterDevice (security audit finding F-3, 2026-05-13). +-- Application code (Azaion.Services/UserService.cs) catches the resulting +-- Npgsql.PostgresException(SqlState=23505) and surfaces it as +-- BusinessException(EmailExists, code 5). 
+create unique index if not exists users_email_uidx on public.users (email); diff --git a/scripts/_lib.sh b/scripts/_lib.sh new file mode 100644 index 0000000..d982f98 --- /dev/null +++ b/scripts/_lib.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# scripts/_lib.sh — shared helpers sourced by all deploy scripts. +# +# This file is sourced (not executed); do not set -e at the top — leave error +# handling to the caller. The helpers always check their own preconditions. + +# ----- logging -------------------------------------------------------------- +log_info() { printf '\033[32m[deploy]\033[0m %s\n' "$*" >&2; } +log_warn() { printf '\033[33m[deploy WARN]\033[0m %s\n' "$*" >&2; } +log_error() { printf '\033[31m[deploy ERROR]\033[0m %s\n' "$*" >&2; } +die() { log_error "$*"; exit 1; } + +# ----- input validation ----------------------------------------------------- +require_env() { + local var + for var in "$@"; do + if [[ -z "${!var:-}" ]]; then + die "Required environment variable not set: $var" + fi + done +} + +require_cmd() { + local cmd + for cmd in "$@"; do + if ! command -v "$cmd" >/dev/null 2>&1; then + die "Required command not found on PATH: $cmd" + fi + done +} + +# ----- env overlay ---------------------------------------------------------- +# load_env_overlay +# 1. Sources scripts/_defaults.env if present (developer-friendly defaults). +# 2. Sources secrets/.public.env (committed plain-text). +# 3. Decrypts secrets/.env via sops + age and sources the result. +# The decrypted intermediate is written to a mktemp file and removed on EXIT. +load_env_overlay() { + local env="$1" + local script_dir repo_root public_file enc_file decrypted + + script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + repo_root="$(cd "$script_dir/.." && pwd)" + + if [[ -f "$repo_root/.env" ]]; then + # Local dev convenience; harmless on a production host because the + # production host should not have a .env in REPO_ROOT. + log_info "Sourcing $repo_root/.env" + set -a; . 
"$repo_root/.env"; set +a + fi + + public_file="$repo_root/secrets/${env}.public.env" + if [[ -f "$public_file" ]]; then + log_info "Sourcing $public_file" + set -a; . "$public_file"; set +a + else + log_warn "No $public_file — relying on environment / .env only" + fi + + enc_file="$repo_root/secrets/${env}.env" + if [[ -f "$enc_file" ]]; then + require_cmd sops age + decrypted="$(mktemp -t azaion-env.XXXXXX)" + # shellcheck disable=SC2064 + trap "rm -f '$decrypted'" EXIT INT TERM + if ! SOPS_AGE_KEY_FILE="${SOPS_AGE_KEY_FILE:-/etc/azaion/age.key}" \ + sops -d "$enc_file" > "$decrypted" 2>/tmp/sops.err; then + log_error "sops decrypt failed for $enc_file" + cat /tmp/sops.err >&2 + die "Cannot continue without secrets" + fi + chmod 600 "$decrypted" + log_info "Sourcing decrypted overlay (intermediate: $decrypted)" + set -a; . "$decrypted"; set +a + else + log_warn "No $enc_file — secret values must already be in the environment" + fi +} + +# ----- container helpers ---------------------------------------------------- +container_exists() { + docker container inspect "$1" >/dev/null 2>&1 +} + +container_running() { + [[ "$(docker container inspect -f '{{.State.Running}}' "$1" 2>/dev/null || echo false)" == "true" ]] +} + +current_image_revision() { + # Returns the org.opencontainers.image.revision label of the running + # container, or empty if the container does not exist. + docker container inspect "$1" \ + --format '{{ index .Config.Labels "org.opencontainers.image.revision" }}' 2>/dev/null || true +} diff --git a/scripts/deploy.sh b/scripts/deploy.sh new file mode 100755 index 0000000..5fafa8b --- /dev/null +++ b/scripts/deploy.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +# scripts/deploy.sh — Azaion Admin API deployment orchestrator. 
+# +# Usage: +# ENV=staging ./scripts/deploy.sh +# ENV=production ./scripts/deploy.sh +# ./scripts/deploy.sh --rollback # uses the SHA from previous_tags.env +# ./scripts/deploy.sh --help +# +# This is the single entry point; do not call the per-step scripts (pull/stop/ +# start/health) directly except from this file. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +# shellcheck source=./_lib.sh +. "$SCRIPT_DIR/_lib.sh" + +usage() { + cat <<'EOF' +Usage: + ENV=staging|production ./scripts/deploy.sh + ./scripts/deploy.sh --rollback + ./scripts/deploy.sh --help + +Environment: + ENV Required. "staging" or "production". Selects which + secrets/.env (sops-encrypted) is decrypted. + REGISTRY_HOST, + REGISTRY_IMAGE Registry hostname and image path; loaded from + secrets/.public.env unless already set. + DEPLOY_* See .env.example. + +Notes: + - Run this on the deploy target host (it does not SSH for you in cycle 1). + - Requires: docker, sops, age, curl, jq. +EOF +} + +ROLLBACK=0 +SHA_TAG="" +for arg in "$@"; do + case "$arg" in + --help|-h) usage; exit 0 ;; + --rollback) ROLLBACK=1 ;; + -*) die "Unknown flag: $arg (use --help)" ;; + *) SHA_TAG="$arg" ;; + esac +done + +require_env ENV +require_cmd docker sops age curl jq + +load_env_overlay "$ENV" + +if [[ "$ROLLBACK" -eq 1 ]]; then + PREV_FILE="$REPO_ROOT/scripts/.previous_tags.env" + [[ -f "$PREV_FILE" ]] || die "No $PREV_FILE — cannot determine rollback target" + # shellcheck disable=SC1090 + . "$PREV_FILE" + [[ -n "${PREVIOUS_SHA_TAG:-}" ]] || die "PREVIOUS_SHA_TAG missing in $PREV_FILE" + SHA_TAG="$PREVIOUS_SHA_TAG" + log_warn "ROLLBACK requested → redeploying $SHA_TAG" +fi + +[[ -n "$SHA_TAG" ]] || die "Missing . Pass the immutable SHA-tag (e.g. a1b2c3d4e5f6-arm) or use --rollback." 
+ +export REGISTRY_TAG="$SHA_TAG" + +log_info "Deploy plan" +log_info " ENV=$ENV" +log_info " REGISTRY_HOST=$REGISTRY_HOST" +log_info " REGISTRY_IMAGE=$REGISTRY_IMAGE" +log_info " REGISTRY_TAG=$REGISTRY_TAG" +log_info " DEPLOY_CONTAINER_NAME=$DEPLOY_CONTAINER_NAME" +log_info " DEPLOY_HOST_PORT=$DEPLOY_HOST_PORT" + +"$SCRIPT_DIR/pull-images.sh" +"$SCRIPT_DIR/stop-services.sh" +"$SCRIPT_DIR/start-services.sh" +"$SCRIPT_DIR/health-check.sh" + +log_info "Deploy succeeded — $REGISTRY_HOST/$REGISTRY_IMAGE:$REGISTRY_TAG is live as $DEPLOY_CONTAINER_NAME" diff --git a/scripts/health-check.sh b/scripts/health-check.sh new file mode 100755 index 0000000..9938057 --- /dev/null +++ b/scripts/health-check.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# scripts/health-check.sh — poll /health/ready until 200 or timeout. Used as +# the post-start gate by deploy.sh. Returns non-zero on any failure. + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +. "$SCRIPT_DIR/_lib.sh" + +usage() { + cat <<'EOF' +Usage: ./scripts/health-check.sh [--help] + +Reads from the environment: + DEPLOY_HOST_PORT port the container is published on (default 4000) + HEALTH_BASE_URL full URL override; defaults to http://127.0.0.1:$DEPLOY_HOST_PORT + HEALTH_TIMEOUT seconds total to wait (default 60) + HEALTH_INTERVAL seconds between attempts (default 2) +EOF +} + +[[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && { usage; exit 0; } + +require_cmd curl +PORT="${DEPLOY_HOST_PORT:-4000}" +BASE_URL="${HEALTH_BASE_URL:-http://127.0.0.1:$PORT}" +TIMEOUT="${HEALTH_TIMEOUT:-60}" +INTERVAL="${HEALTH_INTERVAL:-2}" + +# Liveness first (cheap; fails only if the process is wedged). +log_info "Probing $BASE_URL/health/live" +if ! 
curl --fail --silent --show-error --max-time 3 "$BASE_URL/health/live" >/dev/null; then + die "/health/live did not return 200 — container is not responsive" +fi + +log_info "Polling $BASE_URL/health/ready (timeout=${TIMEOUT}s, interval=${INTERVAL}s)" +DEADLINE=$(( $(date +%s) + TIMEOUT )) +ATTEMPT=0 +while :; do + ATTEMPT=$((ATTEMPT + 1)) + if BODY="$(curl --fail --silent --show-error --max-time 3 "$BASE_URL/health/ready" 2>/dev/null)"; then + log_info "/health/ready returned 200 on attempt $ATTEMPT: $BODY" + exit 0 + fi + NOW=$(date +%s) + if (( NOW >= DEADLINE )); then + die "/health/ready did not return 200 within ${TIMEOUT}s (gave up after $ATTEMPT attempts)" + fi + sleep "$INTERVAL" +done diff --git a/scripts/perf-scenarios.js b/scripts/perf-scenarios.js new file mode 100644 index 0000000..a0c5434 --- /dev/null +++ b/scripts/perf-scenarios.js @@ -0,0 +1,128 @@ +import http from 'k6/http'; +import { check, sleep } from 'k6'; + +const BASE_URL = __ENV.BASE_URL || 'http://localhost:8080'; +const ADMIN_EMAIL = __ENV.ADMIN_EMAIL || 'admin@azaion.com'; +const ADMIN_PASSWORD = __ENV.ADMIN_PASSWORD || 'Admin1234'; + +export const options = { + scenarios: { + nft_perf_01_login: { + executor: 'constant-vus', + vus: 10, + duration: '30s', + exec: 'login', + tags: { scenario: 'nft_perf_01_login' }, + }, + nft_perf_04_user_list: { + executor: 'constant-vus', + vus: 10, + duration: '30s', + exec: 'userList', + tags: { scenario: 'nft_perf_04_user_list' }, + startTime: '35s', + }, + }, + thresholds: { + 'http_req_duration{scenario:nft_perf_01_login}': ['p(95)<500'], + 'http_req_duration{scenario:nft_perf_04_user_list}': ['p(95)<1000'], + 'http_req_failed{scenario:nft_perf_01_login}': ['rate<0.01'], + 'http_req_failed{scenario:nft_perf_04_user_list}': ['rate<0.01'], + }, +}; + +// setup() runs once before all VUs. Pre-fetch a JWT so the user-list scenario +// measures only the listing path, not login latency. 
+/**
+ * k6 setup stage: logs in once as the admin user and hands the JWT to the
+ * scenario functions through the returned data object.
+ *
+ * @returns {{token: string}} token read from `token` or `Token` in the login response
+ * @throws {Error} when the login status is not 200 or the response has no token
+ */
+export function setup() {
+  const res = http.post(
+    `${BASE_URL}/login`,
+    JSON.stringify({ Email: ADMIN_EMAIL, Password: ADMIN_PASSWORD }),
+    { headers: { 'Content-Type': 'application/json' } }
+  );
+  if (res.status !== 200) {
+    throw new Error(`setup: login failed (status ${res.status}): ${res.body}`);
+  }
+  const body = JSON.parse(res.body);
+  // Accept either JSON casing of the token property.
+  const token = body.token || body.Token;
+  if (!token) {
+    throw new Error(`setup: login response missing Token: ${res.body}`);
+  }
+  return { token };
+}
+
+/**
+ * Scenario `nft_perf_01_login`: posts the admin credentials, checks for a 200
+ * response carrying a token, then pauses 0.1 s.
+ */
+export function login() {
+  const res = http.post(
+    `${BASE_URL}/login`,
+    JSON.stringify({ Email: ADMIN_EMAIL, Password: ADMIN_PASSWORD }),
+    {
+      headers: { 'Content-Type': 'application/json' },
+      tags: { scenario: 'nft_perf_01_login' },
+    }
+  );
+  check(res, {
+    'login status 200': (r) => r.status === 200,
+    'login returned token': (r) => {
+      try {
+        const body = JSON.parse(r.body);
+        return !!(body.token || body.Token);
+      } catch {
+        // A non-JSON body counts as a failed check, not a script error.
+        return false;
+      }
+    },
+  });
+  sleep(0.1);
+}
+
+/**
+ * Scenario `nft_perf_04_user_list`: fetches /users with the bearer token from
+ * setup(), checks for a 200 response holding a JSON array of at least 500
+ * entries, then pauses 0.1 s.
+ *
+ * @param {{token: string}} data object returned by setup()
+ */
+export function userList(data) {
+  const res = http.get(`${BASE_URL}/users`, {
+    headers: { Authorization: `Bearer ${data.token}` },
+    tags: { scenario: 'nft_perf_04_user_list' },
+  });
+  check(res, {
+    'user list status 200': (r) => r.status === 200,
+    'user list returned >= 500 users': (r) => {
+      try {
+        const arr = JSON.parse(r.body);
+        return Array.isArray(arr) && arr.length >= 500;
+      } catch {
+        return false;
+      }
+    },
+  });
+  sleep(0.1);
+}
+
+/**
+ * End-of-test summary hook: writes the raw summary as pretty-printed JSON to
+ * e2e/test-results/perf-summary.json and a condensed text report to stdout.
+ *
+ * @param {object} data summary object passed in by k6
+ * @returns {object} map of output target to content
+ */
+export function handleSummary(data) {
+  return {
+    'e2e/test-results/perf-summary.json': JSON.stringify(data, null, 2),
+    stdout: textSummary(data),
+  };
+}
+
+/**
+ * Renders the http_req_duration* / http_req_failed* metrics and the root-group
+ * checks as a short plain-text report.
+ *
+ * @param {object} data summary object (reads `metrics` and `root_group.checks`)
+ * @returns {string} newline-terminated report
+ */
+function textSummary(data) {
+  const lines = [];
+  lines.push('');
+  lines.push('=== PERF SUMMARY ===');
+  for (const [name, metric] of Object.entries(data.metrics)) {
+    if (!name.startsWith('http_req_duration') && !name.startsWith('http_req_failed')) continue;
+    const vals = metric.values || {};
+    const parts = [];
+    // k6 only emits `p(50)` when it is listed in summaryTrendStats; otherwise the
+    // median is reported as `med`. Fall back so p50 is not silently dropped.
+    const p50 = vals['p(50)'] !== undefined ? vals['p(50)'] : vals.med;
+    if (p50 !== undefined) parts.push(`p50=${p50.toFixed(1)}ms`);
+    if (vals['p(95)'] !== undefined) parts.push(`p95=${vals['p(95)'].toFixed(1)}ms`);
+    if (vals['p(99)'] !== undefined) parts.push(`p99=${vals['p(99)'].toFixed(1)}ms`);
+    if (vals.rate !== undefined) parts.push(`rate=${(vals.rate * 100).toFixed(2)}%`);
+    if (vals.count !== undefined) parts.push(`count=${vals.count}`);
+    lines.push(` ${name}: ${parts.join(' · ')}`);
+  }
+  if (data.root_group && data.root_group.checks) {
+    const checks = data.root_group.checks;
+    if (checks.length > 0) {
+      lines.push('--- checks ---');
+      for (const c of checks) {
+        lines.push(` ${c.name}: ${c.passes} pass / ${c.fails} fail`);
+      }
+    }
+  }
+  lines.push('====================');
+  return lines.join('\n') + '\n';
+}
diff --git a/scripts/pull-images.sh b/scripts/pull-images.sh
new file mode 100755
index 0000000..fdccbc7
--- /dev/null
+++ b/scripts/pull-images.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# scripts/pull-images.sh — login + pull the target image. Idempotent.
+
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+. "$SCRIPT_DIR/_lib.sh"
+
+usage() {
+  cat <<'EOF'
+Usage: ./scripts/pull-images.sh [--help]
+
+Reads from the environment (use scripts/deploy.sh which sources the overlays):
+  REGISTRY_HOST, REGISTRY_IMAGE, REGISTRY_TAG image coordinates
+  REGISTRY_USER, REGISTRY_TOKEN optional; if both set, docker login first
+EOF
+}
+
+[[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && { usage; exit 0; }
+
+require_env REGISTRY_HOST REGISTRY_IMAGE REGISTRY_TAG
+require_cmd docker
+
+IMAGE="$REGISTRY_HOST/$REGISTRY_IMAGE:$REGISTRY_TAG"
+
+if [[ -n "${REGISTRY_USER:-}" && -n "${REGISTRY_TOKEN:-}" ]]; then
+  log_info "Logging in to $REGISTRY_HOST as $REGISTRY_USER"
+  echo "$REGISTRY_TOKEN" | docker login "$REGISTRY_HOST" -u "$REGISTRY_USER" --password-stdin >/dev/null
+else
+  log_warn "No REGISTRY_USER / REGISTRY_TOKEN — assuming pre-authenticated docker"
+fi
+
+log_info "Pulling $IMAGE"
+docker pull "$IMAGE"
+
+# Surface the digest for the deploy log; the operator can reference it later.
+DIGEST="$(docker image inspect "$IMAGE" --format '{{ index .RepoDigests 0 }}' 2>/dev/null || true)"
+if [[ -n "$DIGEST" ]]; then
+  log_info "Pulled digest: $DIGEST"
+else
+  log_warn "Could not resolve digest for $IMAGE (image may not have a registry digest yet)"
+fi
diff --git a/scripts/run-performance-tests.sh b/scripts/run-performance-tests.sh
index d4aed43..9ee56ce 100755
--- a/scripts/run-performance-tests.sh
+++ b/scripts/run-performance-tests.sh
@@ -3,41 +3,64 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+RESULTS_DIR="$PROJECT_ROOT/e2e/test-results"
+COMPOSE_FILE="$PROJECT_ROOT/docker-compose.test.yml"
+SCENARIOS_FILE="$SCRIPT_DIR/perf-scenarios.js"
+
+BASE_URL="${BASE_URL:-http://localhost:8080}"
+PERF_USER_COUNT="${PERF_USER_COUNT:-500}"
+
+if ! command -v k6 >/dev/null 2>&1; then
+  echo "ERROR: k6 not found on PATH. 
Install with: brew install k6"
+  exit 1
+fi
+
+mkdir -p "$RESULTS_DIR"

+# Tears down the compose stack (containers, volumes, orphans); registered for
+# EXIT below so it runs on both success and failure.
 cleanup() {
-  echo "Cleaning up..."
-  docker compose -f "$PROJECT_ROOT/docker-compose.test.yml" down -v --remove-orphans 2>/dev/null || true
+  echo "=== Tearing down ==="
+  docker compose -f "$COMPOSE_FILE" down -v --remove-orphans || true
 }
 trap cleanup EXIT

 echo "=== Starting system under test ==="
-docker compose -f "$PROJECT_ROOT/docker-compose.test.yml" up -d system-under-test test-db
+docker compose -f "$COMPOSE_FILE" up -d system-under-test test-db

 echo "=== Waiting for system to be ready ==="
-MAX_WAIT=30
+MAX_WAIT=60
 WAIT=0
+# Polls the Swagger page once per second until it answers or MAX_WAIT is reached.
+# NOTE(review): if curl first succeeds on the attempt where WAIT has just
+# reached MAX_WAIT, the check after the loop still reports a timeout.
-until curl -sf http://localhost:8080/swagger/index.html > /dev/null 2>&1 || [ $WAIT -ge $MAX_WAIT ]; do
+until curl -sf "$BASE_URL/swagger/index.html" > /dev/null 2>&1 || [ $WAIT -ge $MAX_WAIT ]; do
   sleep 1
   WAIT=$((WAIT + 1))
 done

 if [ $WAIT -ge $MAX_WAIT ]; then
   echo "ERROR: System did not become ready within ${MAX_WAIT}s"
+  docker compose -f "$COMPOSE_FILE" logs --tail=80 system-under-test || true
   exit 1
 fi

-echo "=== Running performance tests ==="
-echo "Performance test runner not yet configured."
-echo "Install k6, locust, or artillery and add load scenarios from:"
-echo " _docs/02_document/tests/performance-tests.md"
-echo ""
-echo "Example with k6:"
-echo " k6 run scripts/perf-scenarios.js"
-echo ""
-echo "Thresholds from test spec:"
-echo " NFT-PERF-01: Login p95 < 500ms"
-echo " NFT-PERF-02: Small file download p95 < 1000ms"
-echo " NFT-PERF-03: Large file download p95 < 30000ms"
-echo " NFT-PERF-04: User list p95 < 1000ms"
+echo "=== Seeding $PERF_USER_COUNT perf users ==="
+# Reuse the admin password hash so the rows satisfy NOT NULL on password_hash.
+# These users are only used as listing volume, never for login.
+docker compose -f "$COMPOSE_FILE" exec -T test-db psql -U postgres -d azaion -v ON_ERROR_STOP=1 </dev/null \
+  || fail 1 "/health/live did not return 200"
+
+# 2. 
Readiness — public-facing nginx may not expose /health/ready (it's internal-
+# only by design). Skip if not 200; the deploy script already checked it inside
+# the host network.
+step 2 "GET /health/ready (best-effort, may be unreachable from public URL)"
+if curl --fail --silent --show-error --max-time 5 "$BASE_URL/health/ready" >/dev/null; then
+  log_info "/health/ready returned 200 (exposed publicly — verify this is intentional)"
+fi
+
+# 3. Login → JWT
+step 3 "POST /login as $SMOKE_ADMIN_EMAIL"
+# The JSON payload is built with jq so the credentials are escaped correctly.
+TOKEN_JSON="$(curl --fail --silent --show-error --max-time 10 \
+  -H 'Content-Type: application/json' \
+  -d "$(jq -n --arg e "$SMOKE_ADMIN_EMAIL" --arg p "$SMOKE_ADMIN_PASSWORD" '{email:$e, password:$p}')" \
+  "$BASE_URL/login")" \
+  || fail 3 "login request failed"
+# A body jq cannot parse is treated as "no token", so the failure is reported
+# through `fail 3` below instead of as a bare jq error.
+TOKEN="$(jq -r '.token // .Token // empty' <<<"$TOKEN_JSON" 2>/dev/null)" || TOKEN=""
+[[ -n "$TOKEN" ]] || fail 3 "login returned no token: $TOKEN_JSON"
+
+# Reused by every authenticated request below.
+AUTH=(-H "Authorization: Bearer $TOKEN")
+
+# 4. Authenticated GET /users/current
+step 4 "GET /users/current"
+curl --fail --silent --show-error --max-time 5 "${AUTH[@]}" "$BASE_URL/users/current" >/dev/null \
+  || fail 4 "/users/current did not return 200"
+
+# 5. Authenticated GET /users
+step 5 "GET /users"
+USERS_JSON="$(curl --fail --silent --show-error --max-time 10 "${AUTH[@]}" "$BASE_URL/users")" \
+  || fail 5 "/users did not return 200"
+# A 200 response whose body jq cannot parse must fail the step rather than be
+# logged with an empty row count.
+USER_COUNT="$(jq 'length' <<<"$USERS_JSON" 2>/dev/null)" \
+  || fail 5 "/users returned a body jq could not parse: ${USERS_JSON:0:200}"
+log_info "/users returned $USER_COUNT rows"
+
+# 6. 
Authenticated GET /resources/list (default folder)
+step 6 "GET /resources/list"
+curl --fail --silent --show-error --max-time 5 "${AUTH[@]}" "$BASE_URL/resources/list" >/dev/null \
+  || fail 6 "/resources/list did not return 200"
+
+log_info "smoke OK — 6 checks passed"
diff --git a/scripts/start-services.sh b/scripts/start-services.sh
new file mode 100755
index 0000000..dd673c8
--- /dev/null
+++ b/scripts/start-services.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# scripts/start-services.sh — `docker run` the API with the env overlay
+# materialized into a temp file. Bind mounts come from DEPLOY_HOST_*.
+
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Shared helpers used below (require_env, require_cmd, log_info, ...).
+. "$SCRIPT_DIR/_lib.sh"
+
+# Prints the help text, including the environment variables this script reads.
+usage() {
+  cat <<'EOF'
+Usage: ./scripts/start-services.sh [--help]
+
+Reads from the environment (deploy.sh sets these):
+  REGISTRY_HOST, REGISTRY_IMAGE, REGISTRY_TAG
+  DEPLOY_CONTAINER_NAME, DEPLOY_HOST_PORT
+  DEPLOY_HOST_CONTENT_DIR, DEPLOY_HOST_LOGS_DIR
+  ASPNETCORE_ENVIRONMENT, ASPNETCORE_URLS
+  ASPNETCORE_ConnectionStrings__AzaionDb / __AzaionDbAdmin
+  ASPNETCORE_JwtConfig__Secret
+  ASPNETCORE_ResourcesConfig__* (defaults from appsettings.json if unset)
+EOF
+}
+
+[[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && { usage; exit 0; }
+
+# Image coordinates, deploy-host settings and the three secrets are mandatory;
+# the remaining ASPNETCORE_* variables listed in the usage text are not checked here.
+require_env \
+  REGISTRY_HOST REGISTRY_IMAGE REGISTRY_TAG \
+  DEPLOY_CONTAINER_NAME DEPLOY_HOST_PORT \
+  DEPLOY_HOST_CONTENT_DIR DEPLOY_HOST_LOGS_DIR \
+  ASPNETCORE_ConnectionStrings__AzaionDb \
+  ASPNETCORE_ConnectionStrings__AzaionDbAdmin \
+  ASPNETCORE_JwtConfig__Secret
+require_cmd docker
+
+IMAGE="$REGISTRY_HOST/$REGISTRY_IMAGE:$REGISTRY_TAG"
+
+# Materialize an env file for `docker run --env-file`. We pass only the
+# ASPNETCORE_* + AZAION_* variables — registry / deploy host vars stay on the
+# host, never in the container. 
+ENV_FILE="$(mktemp -t azaion-runtime-env.XXXXXX)"
+chmod 600 "$ENV_FILE"
+# The file holds connection strings and the JWT secret: remove it on every exit path.
+trap 'rm -f "$ENV_FILE"' EXIT INT TERM
+
+# `|| true`: grep exits 1 when nothing matches, which must not abort the script.
+env | grep -E '^(ASPNETCORE_|AZAION_)' > "$ENV_FILE" || true
+
+mkdir -p "$DEPLOY_HOST_CONTENT_DIR" "$DEPLOY_HOST_LOGS_DIR"
+
+log_info "Starting $DEPLOY_CONTAINER_NAME from $IMAGE on host port $DEPLOY_HOST_PORT"
+# Host port -> container port 8080; content and log directories are bind-mounted.
+docker run --detach \
+  --name "$DEPLOY_CONTAINER_NAME" \
+  --restart unless-stopped \
+  --env-file "$ENV_FILE" \
+  --publish "$DEPLOY_HOST_PORT:8080" \
+  --volume "$DEPLOY_HOST_CONTENT_DIR:/app/Content" \
+  --volume "$DEPLOY_HOST_LOGS_DIR:/app/logs" \
+  "$IMAGE" >/dev/null
+
+log_info "Container ID: $(docker container inspect -f '{{.Id}}' "$DEPLOY_CONTAINER_NAME" | cut -c1-12)"
+log_info "Running revision label: $(current_image_revision "$DEPLOY_CONTAINER_NAME")"
diff --git a/scripts/stop-services.sh b/scripts/stop-services.sh
new file mode 100755
index 0000000..4909fe8
--- /dev/null
+++ b/scripts/stop-services.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# scripts/stop-services.sh — graceful stop + record the previous image SHA so
+# `./scripts/deploy.sh --rollback` can find a target.
+
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+. "$SCRIPT_DIR/_lib.sh"
+
+# Prints the help text. REGISTRY_TAG is documented as an input to the rollback
+# tag because the script derives the tag's arch suffix from it.
+usage() {
+  cat <<'EOF'
+Usage: ./scripts/stop-services.sh [--help]
+
+Reads from the environment:
+  DEPLOY_CONTAINER_NAME name of the container to stop
+  REGISTRY_TAG its last dash-separated segment (arm / amd) becomes the
+               arch suffix of PREVIOUS_SHA_TAG; "arm" is used when empty
+
+Side-effect: writes scripts/.previous_tags.env containing PREVIOUS_SHA_TAG so
+the next deploy can roll back to whatever was running just before this stop. 
+EOF
+}
+
+[[ "${1:-}" == "--help" || "${1:-}" == "-h" ]] && { usage; exit 0; }
+
+require_env DEPLOY_CONTAINER_NAME
+require_cmd docker
+
+# Rollback target is recorded next to the scripts.
+PREV_FILE="$SCRIPT_DIR/.previous_tags.env"
+
+if container_exists "$DEPLOY_CONTAINER_NAME"; then
+  # Record what is running BEFORE stopping it, so a rollback target exists.
+  REVISION="$(current_image_revision "$DEPLOY_CONTAINER_NAME")"
+  if [[ -n "$REVISION" ]]; then
+    SHA12="$(echo "$REVISION" | cut -c1-12)"
+    # REGISTRY_TAG is not in require_env, so it may be unset. Under `set -u` a
+    # bare ${REGISTRY_TAG##*-} would abort here, before the container is
+    # stopped. Default it to empty first.
+    TAG="${REGISTRY_TAG:-}"
+    SUFFIX="${TAG##*-}"   # arm / amd; falls back to whatever follows the last dash
+    PREV_TAG="${SHA12}-${SUFFIX:-arm}"
+    printf 'PREVIOUS_SHA_TAG=%s\nPREVIOUS_REVISION=%s\nRECORDED_AT=%s\n' \
+      "$PREV_TAG" "$REVISION" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$PREV_FILE"
+    log_info "Recorded rollback target → $PREV_TAG (revision $REVISION) in $PREV_FILE"
+  else
+    log_warn "Could not read org.opencontainers.image.revision from $DEPLOY_CONTAINER_NAME — rollback target NOT recorded"
+  fi
+
+  if container_running "$DEPLOY_CONTAINER_NAME"; then
+    # 40s matches the grace period in deployment_procedures.md §1; the app
+    # itself shuts down after ShutdownTimeout=30s leaving 10s headroom.
+    log_info "Stopping $DEPLOY_CONTAINER_NAME (grace 40s)"
+    docker stop -t 40 "$DEPLOY_CONTAINER_NAME"
+  else
+    log_info "$DEPLOY_CONTAINER_NAME exists but is not running"
+  fi
+  # Removed in both cases so the container name is free again.
+  log_info "Removing container $DEPLOY_CONTAINER_NAME"
+  docker rm -f "$DEPLOY_CONTAINER_NAME"
+else
+  log_info "$DEPLOY_CONTAINER_NAME does not exist — nothing to stop"
+fi
diff --git a/secrets/.sops.yaml b/secrets/.sops.yaml
new file mode 100644
index 0000000..1ed78fc
--- /dev/null
+++ b/secrets/.sops.yaml
@@ -0,0 +1,28 @@
+# sops creation rules — see https://github.com/getsops/sops
+#
+# Each rule routes a path-regex to one or more age recipients (public keys).
+# The matching age PRIVATE key lives outside the repo at /etc/azaion/age.key
+# on the deploy host and is consumed by `scripts/deploy.sh` via
+# `SOPS_AGE_KEY_FILE`.
+#
+# Onboarding a new operator:
+#   1. They generate `age-keygen -o ~/.config/sops/age/keys.txt`
+#   2. 
Their public key is appended below as an additional age recipient on the
+#      relevant rule.
+#   3. Run `sops updatekeys secrets/staging.env` (or `secrets/production.env`)
+#      to re-encrypt the file with the new recipient list.
+#      NOTE(review): sops searches for `.sops.yaml` from the current directory
+#      upwards, so run it from `secrets/` or pass `--config secrets/.sops.yaml`
+#      — confirm against the sops version in use.
+#   4. Commit the updated `.sops.yaml` AND the updated encrypted file in the
+#      same commit. NEVER commit the private key.
+#
+# Cycle 1 placeholder: the recipient values below are the literal string
+# `REPLACE_WITH_AGE_PUBLIC_KEY` so the file is reviewable but no real key is
+# leaked. The first deploy MUST replace these before encrypting any real
+# secret. The deploy script will fail loudly if it cannot decrypt.
+#
+# sops matches `path_regex` against the file path relative to the directory
+# holding this file (`secrets/`), i.e. against `staging.env`, not
+# `secrets/staging.env`. The `(^|/)` prefix matches with or without a leading
+# directory and still excludes `staging.public.env`.
+
+creation_rules:
+  - path_regex: '(^|/)staging\.env$'
+    age:
+      - REPLACE_WITH_AGE_PUBLIC_KEY_FOR_STAGING
+  - path_regex: '(^|/)production\.env$'
+    age:
+      - REPLACE_WITH_AGE_PUBLIC_KEY_FOR_PRODUCTION
diff --git a/secrets/README.md b/secrets/README.md
new file mode 100644
index 0000000..3e12122
--- /dev/null
+++ b/secrets/README.md
@@ -0,0 +1,57 @@
+# `secrets/` — sops + age secret material
+
+This folder holds **per-environment** runtime configuration for the Admin API.
+
+| File | Tracked | Encrypted | Loaded by |
+|------|---------|-----------|-----------|
+| `.sops.yaml` | yes | n/a | sops itself (resolves recipients) |
+| `staging.public.env` | yes | no | `scripts/_lib.sh` → `set -a; .` (loaded BEFORE the encrypted overlay) |
+| `production.public.env` | yes | no | same |
+| `staging.env` | yes (after first encryption) | **yes** (sops + age) | `scripts/deploy.sh` decrypts to a tempfile then sources it |
+| `production.env` | yes (after first encryption) | **yes** (sops + age) | same |
+| age private key | **never tracked** | n/a | lives at `/etc/azaion/age.key` on the deploy host (mode 0400) |
+
+## First-time bootstrap on a fresh host
+
+```bash
+# 1. Install sops + age on the host
+sudo apt-get install -y sops age
+
+# 2. 
Generate the host's age keypair
+sudo install -d -m 0700 /etc/azaion
+sudo age-keygen -o /etc/azaion/age.key
+sudo chmod 0400 /etc/azaion/age.key
+sudo grep '^# public key:' /etc/azaion/age.key
+# → copy the public key string
+
+# 3. On a developer machine, replace the placeholder in `secrets/.sops.yaml`
+#    with the public key from step 2 (for the matching environment), then
+#    encrypt the env file, passing that same public key as the age recipient:
+#      sops --encrypt --age AGE_PUBLIC_KEY_FROM_STEP_2 secrets/staging.env > secrets/staging.enc.tmp
+#      mv secrets/staging.enc.tmp secrets/staging.env
+#    Commit `.sops.yaml` and the encrypted file together.
+
+# 4. Sanity-check on the host:
+SOPS_AGE_KEY_FILE=/etc/azaion/age.key sops -d secrets/staging.env | head
+```
+
+## Rotation
+
+See `_docs/04_deploy/environment_strategy.md` §3 for the per-secret rotation cadence and procedure.
+
+## What goes where
+
+- **Public env (staging.public.env / production.public.env)** — anything that is NOT a secret: hostname, port, container name, JWT issuer/audience, resource folder names. Reviewable in PRs.
+- **Encrypted env (staging.env / production.env)** — DB connection strings (with passwords), `JwtConfig__Secret`, `REGISTRY_USER`, `REGISTRY_TOKEN`, anything else sensitive. NEVER readable in plain text outside the host.
+
+## Schema (variables that MUST be in the encrypted file)
+
+```
+ASPNETCORE_ConnectionStrings__AzaionDb=Host=...;Port=4312;Database=azaion;Username=azaion_reader;Password=...
+ASPNETCORE_ConnectionStrings__AzaionDbAdmin=Host=...;Port=4312;Database=azaion;Username=azaion_admin;Password=...
+ASPNETCORE_JwtConfig__Secret=(random string, at least 32 bytes)
+REGISTRY_USER=
+REGISTRY_TOKEN=
+```
+
+The deploy script will fail fast if any of the first three are missing once the container starts.
diff --git a/secrets/production.public.env b/secrets/production.public.env
new file mode 100644
index 0000000..315d693
--- /dev/null
+++ b/secrets/production.public.env
@@ -0,0 +1,20 @@
+# Plain-text overlay for production — committed; safe to read. 
+# Loaded BEFORE the sops-decrypted overlay; secret values stay encrypted.
+
+# ASP.NET Core runtime: environment name and listen address inside the container.
+ASPNETCORE_ENVIRONMENT=Production
+ASPNETCORE_URLS=http://+:8080
+
+# Non-secret JWT and resource-folder settings (same values as the staging overlay).
+ASPNETCORE_JwtConfig__Issuer=AzaionApi
+ASPNETCORE_JwtConfig__Audience=Annotators/OrangePi/Admins
+ASPNETCORE_JwtConfig__TokenLifetimeHours=4
+ASPNETCORE_ResourcesConfig__ResourcesFolder=Content
+ASPNETCORE_ResourcesConfig__SuiteInstallerFolder=suite
+ASPNETCORE_ResourcesConfig__SuiteStageInstallerFolder=suite-stage
+
+# Deploy-host settings read by scripts/start-services.sh and scripts/stop-services.sh.
+DEPLOY_CONTAINER_NAME=azaion.api
+DEPLOY_HOST_PORT=4000
+DEPLOY_HOST_CONTENT_DIR=/root/api/content
+DEPLOY_HOST_LOGS_DIR=/root/api/logs
+
+# Registry coordinates.
+# NOTE(review): REGISTRY_TAG is required by scripts/pull-images.sh and
+# scripts/start-services.sh but is not set in this overlay — presumably supplied
+# by deploy.sh or CI; confirm.
+REGISTRY_HOST=docker.azaion.com
+REGISTRY_IMAGE=azaion/admin
diff --git a/secrets/staging.public.env b/secrets/staging.public.env
new file mode 100644
index 0000000..9851dff
--- /dev/null
+++ b/secrets/staging.public.env
@@ -0,0 +1,23 @@
+# Plain-text overlay for staging — committed; safe to read.
+# Loaded BEFORE the sops-decrypted overlay; secret values stay encrypted.
+
+ASPNETCORE_ENVIRONMENT=Staging
+ASPNETCORE_URLS=http://+:8080
+
+# appsettings overrides — kept identical to production for parity.
+ASPNETCORE_JwtConfig__Issuer=AzaionApi
+ASPNETCORE_JwtConfig__Audience=Annotators/OrangePi/Admins
+ASPNETCORE_JwtConfig__TokenLifetimeHours=4
+ASPNETCORE_ResourcesConfig__ResourcesFolder=Content
+ASPNETCORE_ResourcesConfig__SuiteInstallerFolder=suite
+ASPNETCORE_ResourcesConfig__SuiteStageInstallerFolder=suite-stage
+
+# Deploy-host plumbing.
+# NOTE(review): container name, host port and bind-mount paths are identical to
+# the production overlay — fine only if staging runs on a separate host; confirm.
+DEPLOY_CONTAINER_NAME=azaion.api
+DEPLOY_HOST_PORT=4000
+DEPLOY_HOST_CONTENT_DIR=/root/api/content
+DEPLOY_HOST_LOGS_DIR=/root/api/logs
+
+# Registry. REGISTRY_USER / REGISTRY_TOKEN come from the encrypted overlay.
+# NOTE(review): REGISTRY_TAG is not set here either — confirm where it comes from.
+REGISTRY_HOST=docker.azaion.com
+REGISTRY_IMAGE=azaion/admin