From e87c33b0eebd05bcf450b81c9efa5b9401c898f3 Mon Sep 17 00:00:00 2001 From: Oleksandr Bezdieniezhnykh Date: Mon, 10 Nov 2025 20:26:40 +0200 Subject: [PATCH] went through 4 iterations of solution draft. Right now it is more or less consistent and reliable --- .cursor/commands/gen_components.md | 17 + .cursor/commands/gen_epics.md | 8 + .cursor/commands/gen_tests.md | 14 + docs/00_problem/1.3_research_prompt.md | 64 + .../1.4_01_assesment_prompt copy.md | 329 +++++ .../1.4_02_assesment_prompt copy.md | 325 +++++ docs/00_problem/1.4_03_assesment_prompt.md | 301 +++++ docs/00_problem/acceptance_criteria.md | 28 +- docs/00_problem/problem_description.md | 6 +- docs/00_problem/restrictions.md | 13 +- docs/01_solution/01_solution_draft.md | 288 +++++ docs/01_solution/01_solution_draft_claude.md | 559 -------- docs/01_solution/01_solution_draft_google.md | 271 ---- .../01_solution_draft_perplexity.md | 1125 ----------------- docs/01_solution/02_solution_draft.md | 284 +++++ docs/01_solution/03_solution_draft.md | 259 ++++ docs/01_solution/04_solution_draft.md | 327 +++++ docs/_metodology/01_research_phase.md | 147 +-- docs/_metodology/02_planning_phase.md | 5 + ...pment_phase.md => 03_development_phase.md} | 8 +- ...oring_phase.md => 04_refactoring_phase.md} | 0 21 files changed, 2323 insertions(+), 2055 deletions(-) create mode 100644 .cursor/commands/gen_components.md create mode 100644 .cursor/commands/gen_epics.md create mode 100644 .cursor/commands/gen_tests.md create mode 100644 docs/00_problem/1.3_research_prompt.md create mode 100644 docs/00_problem/1.4_01_assesment_prompt copy.md create mode 100644 docs/00_problem/1.4_02_assesment_prompt copy.md create mode 100644 docs/00_problem/1.4_03_assesment_prompt.md create mode 100644 docs/01_solution/01_solution_draft.md delete mode 100644 docs/01_solution/01_solution_draft_claude.md delete mode 100644 docs/01_solution/01_solution_draft_google.md delete mode 100644 docs/01_solution/01_solution_draft_perplexity.md create 
mode 100644 docs/01_solution/02_solution_draft.md create mode 100644 docs/01_solution/03_solution_draft.md create mode 100644 docs/01_solution/04_solution_draft.md create mode 100644 docs/_metodology/02_planning_phase.md rename docs/_metodology/{02_development_phase.md => 03_development_phase.md} (91%) rename docs/_metodology/{03_refactoring_phase.md => 04_refactoring_phase.md} (100%) diff --git a/.cursor/commands/gen_components.md b/.cursor/commands/gen_components.md new file mode 100644 index 0000000..532778c --- /dev/null +++ b/.cursor/commands/gen_components.md @@ -0,0 +1,17 @@ +# decompose + + Decompose the `@docs/01_solution/solution.md` to the components. + Store description of each component to the file `docs/02_components/[##]_[component_name]/spec.md` with the next structure: + - Component Name + - Detailed description + - API methods, for each method: + - Name + - Input + - Output + - Description + - Test cases for the method + - Integration tests for the component if needed. + - Non-functional tests for the component if needed. + + Generate draw.io components diagram shows relations between components. + Do not put any code yet, only names, input and output. Ask as many questions as possible to clarify all uncertainties. 
diff --git a/.cursor/commands/gen_epics.md b/.cursor/commands/gen_epics.md new file mode 100644 index 0000000..47e8eae --- /dev/null +++ b/.cursor/commands/gen_epics.md @@ -0,0 +1,8 @@ +# generate Jira Epics + +Read the solution spec `@docs/01_solution/solution.md` +Read description of all the components in the folder `@docs/02_components` - go to each folder and read /[component_name]/spec.md +Read the acceptance criteria from `@docs/00_initial/acceptance_criteria.md` + - Generate Jira Epics from the Components + - Ensure each epic has clear goal and acceptance criteria, verify it with acceptance criteria + - Generate draw.io components diagram based on previous diagram shows relations between components and Jira Epic numbers corresponding to each component. \ No newline at end of file diff --git a/.cursor/commands/gen_tests.md b/.cursor/commands/gen_tests.md new file mode 100644 index 0000000..944cbe5 --- /dev/null +++ b/.cursor/commands/gen_tests.md @@ -0,0 +1,14 @@ +# generate Tests + +Read the `@docs/01_solution/solution.md` and `@docs/00_problem/acceptance_criteria.md` and compose tests according to test strategy to cover all the criteria and store them to the files + `docs/03_tests/[##]_[test_name]_spec.md` with the next structure for each test file: + - Summary + - Detailed description + - Preconditions for tests + - Steps: + - Step1 - Expected result1 + - Step2 - Expected result2 + ... + - StepN - Expected resultN + + Do not put any code yet. Ask as many questions as needed. \ No newline at end of file diff --git a/docs/00_problem/1.3_research_prompt.md b/docs/00_problem/1.3_research_prompt.md new file mode 100644 index 0000000..cc48fda --- /dev/null +++ b/docs/00_problem/1.3_research_prompt.md @@ -0,0 +1,64 @@ +Research this problem: + +We have a lot of images taken from a wing-type UAV using a camera with at least Full HD resolution. 
Resolution of each photo could be up to 6200*4100 for the whole flight, but for other flights, it could be FullHD + +Photos are taken and named consecutively within 100 meters of each other. + +We know only the starting GPS coordinates. We need to determine the GPS of the centers of each image. And also the coordinates of the center of any object in these photos. We can use an external satellite provider for ground checks on the existing photos + +The system should process data samples in the attached files (if any). They are for reference only. + - We have the next restrictions: + - Photos are taken by only airplane type UAVs. + - Photos are taken by the camera pointing downwards and fixed, but it is not autostabilized. + - The flying range is restricted by the eastern and southern parts of Ukraine (To the left of the Dnipro River) + - The image resolution could be from FullHD to 6252*4168 + - Altitude is predefined and no more than 1km + - There is NO data from IMU + - Flights are done mostly in sunny weather + - We can use satellite providers, but we're limited right now to Google Maps, which could be outdated for some regions + - Number of photos could be up to 3000, usually in the 500-1500 range + - During the flight, UAVs can make sharp turns, so that the next photo may be absolutely different from the previous one (no same objects), but it is rather an exception than the rule + - Processing is done on a stationary computer or laptop with NVidia GPU at least RTX2060, better 3070. (For the UAV solution Jetson Orin Nano would be used, but that is out of scope.) 
+ + - Output of our system should meet these acceptance criteria: + - The system should find out the GPS of centers of 80% of the photos from the flight within an error of no more than 50 meters in comparison to the real GPS + + - The system should find out the GPS of centers of 60% of the photos from the flight within an error of no more than 20 meters in comparison to the real GPS + + - The system should correctly continue the work even in the presence of up to 350 meters of an outlier photo between 2 consecutive pictures en route. This could happen due to tilt of the plane. + + - System should correctly continue the work even during sharp turns, where the next photo doesn't overlap at all, or overlaps in less than 5%. The next photo should be in less than 150m drift and at an angle of less than 50% + + - The number of outliers during the satellite provider images ground check should be less than 10% + + - In case the system is absolutely incapable of determining the GPS of the next, second next, and third next images, by any means (these 20% of the route), then it should ask the user for input for the next image, so that the user can specify the location + + - Less than 5 seconds for processing one image + + - Results of image processing should appear immediately to the user, so that the user shouldn't wait for the whole route to complete in order to analyze first results. Also, the system could refine existing calculated results and send refined results again to the user + + - Image Registration Rate > 95%. The system can find enough matching features to confidently calculate the camera's 6-DoF pose (position and orientation) and "stitch" that image into the final trajectory + + - Mean Reprojection Error (MRE) < 1.0 pixels. The distance, in pixels, between the original pixel location of the object and the re-projected pixel location. 
+ + + - Find out all the state-of-the-art solutions for this problem and produce the resulting solution draft in the next format: + + - Short Product solution description. Brief component interaction diagram. + + - Architecture approach that meets restrictions and acceptance criteria. For each component, analyze the best possible approaches to solve, and form a table comprising all approaches. Each new approach would be a row, and has the next columns: + + - Tools (library, platform) to solve component tasks + + - Advantages of this approach + + - Limitations of this approach + + - Requirements for this approach + + - How does it fit for the problem component that has to be solved, and the whole solution + + - Testing strategy. Research the best approaches to cover all the acceptance criteria. Form a list of integration functional tests and non-functional tests. + + +Be concise in formulating. The fewer words, the better, but do not miss any important details. \ No newline at end of file diff --git a/docs/00_problem/1.4_01_assesment_prompt copy.md b/docs/00_problem/1.4_01_assesment_prompt copy.md new file mode 100644 index 0000000..b0f219b --- /dev/null +++ b/docs/00_problem/1.4_01_assesment_prompt copy.md @@ -0,0 +1,329 @@ +Read carefully about the problem: + + We have a lot of images taken from a wing-type UAV using a camera with at least Full HD resolution. Resolution of each photo could be up to 6200*4100 for the whole flight, but for other flights, it could be FullHD +Photos are taken and named consecutively within 100 meters of each other. +We know only the starting GPS coordinates. We need to determine the GPS of the centers of each image. And also the coordinates of the center of any object in these photos. We can use an external satellite provider for ground checks on the existing photos + + System has next restrictions and conditions: + - Photos are taken by only airplane type UAVs. 
+ - Photos are taken by the camera pointing downwards and fixed, but it is not autostabilized. + - The flying range is restricted by the eastern and southern parts of Ukraine (To the left of the Dnipro River) + - The image resolution could be from FullHD to 6252*4168. Camera parameters are known: focal length, sensor width, resolution and so on. + - Altitude is predefined and no more than 1km. The height of the terrain can be neglected. + - There is NO data from IMU + - Flights are done mostly in sunny weather + - We can use satellite providers, but we're limited right now to Google Maps, which could be outdated for some regions + - Number of photos could be up to 3000, usually in the 500-1500 range + - During the flight, UAVs can make sharp turns, so that the next photo may be absolutely different from the previous one (no same objects), but it is rather an exception than the rule + - Processing is done on a stationary computer or laptop with NVidia GPU at least RTX2060, better 3070. (For the UAV solution Jetson Orin Nano would be used, but that is out of scope.) + + Output of the system should address next acceptance criteria: + - The system should find out the GPS of centers of 80% of the photos from the flight within an error of no more than 50 meters in comparison to the real GPS + - The system should find out the GPS of centers of 60% of the photos from the flight within an error of no more than 20 meters in comparison to the real GPS + - The system should correctly continue the work even in the presence of up to 350 meters of an outlier photo between 2 consecutive pictures en route. This could happen due to tilt of the plane. + - System should correctly continue the work even during sharp turns, where the next photo doesn't overlap at all, or overlaps in less than 5%. 
The next photo should be in less than 150m drift and at an angle of less than 50% + - The number of outliers during the satellite provider images ground check should be less than 10% + - In case the system is absolutely incapable of determining the GPS of the next, second next, and third next images, by any means (these 20% of the route), then it should ask the user for input for the next image, so that the user can specify the location + - Less than 5 seconds for processing one image + - Results of image processing should appear immediately to the user, so that the user shouldn't wait for the whole route to complete in order to analyze first results. Also, the system could refine existing calculated results and send refined results again to the user + - Image Registration Rate > 95%. The system can find enough matching features to confidently calculate the camera's 6-DoF pose (position and orientation) and "stitch" that image into the final trajectory + - Mean Reprojection Error (MRE) < 1.0 pixels. The distance, in pixels, between the original pixel location of the object and the re-projected pixel location. + + Here is a solution draft: + +# **GEo-Referenced Trajectory and Object Localization System (GEORTOLS): A Hybrid SLAM Architecture** + +## **1. Executive Summary** + +This report outlines the technical design for a robust, real-time geolocalization system. The objective is to determine the precise GPS coordinates for a sequence of high-resolution images (up to 6252x4168) captured by a fixed-wing, non-stabilized Unmanned Aerial Vehicle (UAV) [User Query]. The system must operate under severe constraints, including the absence of any IMU data, a predefined altitude of no more than 1km, and knowledge of only the starting GPS coordinate [User Query]. 
The system is required to handle significant in-flight challenges, such as sharp turns with minimal image overlap (<5%), frame-to-frame outliers of up to 350 meters, and operation over low-texture terrain as seen in the provided sample images [User Query, Image 1, Image 7]. + +The proposed solution is a **Hybrid Visual-Geolocalization SLAM (VG-SLAM)** architecture. This system is designed to meet the demanding acceptance criteria, including a sub-5-second initial processing time per image, streaming output with asynchronous refinement, and high-accuracy GPS localization (60% of photos within 20m error, 80% within 50m error) [User Query]. + +This hybrid architecture is necessitated by the problem's core constraints. The lack of an IMU makes a purely monocular Visual Odometry (VO) system susceptible to catastrophic scale drift.1 Therefore, the system integrates two cooperative sub-systems: + +1. A **Visual Odometry (VO) Front-End:** This component uses state-of-the-art deep-learning feature matchers (SuperPoint + SuperGlue/LightGlue) to provide fast, real-time *relative* pose estimates. This approach is selected for its proven robustness in low-texture environments where traditional features fail.4 This component delivers the initial, sub-5-second pose estimate. +2. A **Cross-View Geolocalization (CVGL) Module:** This component provides *absolute*, drift-free GPS pose estimates by matching UAV images against the available satellite provider (Google Maps).7 It functions as the system's "global loop closure" mechanism, correcting the VO's scale drift and, critically, relocalizing the UAV after tracking is lost during sharp turns or outlier frames [User Query]. + +These two systems run in parallel. A **Back-End Pose-Graph Optimizer** fuses their respective measurements—high-frequency relative poses from VO and high-confidence absolute poses from CVGL—into a single, globally consistent, and incrementally refined trajectory. 
This architecture directly satisfies the requirements for immediate, streaming results and subsequent asynchronous refinement [User Query]. + +## **2. Product Solution Description and Component Interaction** + +### **Product Solution Description** + +The proposed system, "GEo-Referenced Trajectory and Object Localization System (GEORTOLS)," is a real-time, streaming-capable software solution. It is designed for deployment on a stationary computer or laptop equipped with an NVIDIA GPU (RTX 2060 or better) [User Query]. + +* **Inputs:** + 1. A sequence of consecutively named monocular images (FullHD to 6252x4168). + 2. The absolute GPS coordinate (Latitude, Longitude) of the *first* image in the sequence. + 3. A pre-calibrated camera intrinsic matrix. + 4. Access to the Google Maps satellite imagery API. +* **Outputs:** + 1. A real-time, streaming feed of estimated GPS coordinates (Latitude, Longitude, Altitude) and 6-DoF poses (including Roll, Pitch, Yaw) for the center of each image. + 2. Asynchronous refinement messages for previously computed poses as the back-end optimizer improves the global trajectory. + 3. A service to provide the absolute GPS coordinate for any user-selected pixel coordinate (u,v) within any geolocated image. + +### **Component Interaction Diagram** + +The system is architected as four asynchronous, parallel-processing components to meet the stringent real-time and refinement requirements. + +1. **Image Ingestion & Pre-processing:** This module acts as the entry point. It receives the new, high-resolution image (Image N). It immediately creates scaled-down, lower-resolution (e.g., 1024x768) copies of the image for real-time processing by the VO and CVGL modules, while retaining the full-resolution original for object-level GPS lookups. +2. **Visual Odometry (VO) Front-End:** This module's sole task is high-speed, frame-to-frame relative pose estimation. It maintains a short-term "sliding window" of features, matching Image N to Image N-1. 
It uses GPU-accelerated deep-learning models (SuperPoint + SuperGlue) to find feature matches and calculates the 6-DoF relative transform. This result is immediately sent to the Back-End. +3. **Cross-View Geolocalization (CVGL) Module:** This is a heavier, slower, asynchronous module. It takes the pre-processed Image N and queries the Google Maps database to find an *absolute* GPS pose. This involves a two-stage retrieval-and-match process. When a high-confidence match is found, its absolute pose is sent to the Back-End as a "global-pose constraint." +4. **Trajectory Optimization Back-End:** This is the system's central "brain," managing the complete pose graph.10 It receives two types of data: + * *High-frequency, low-confidence relative poses* from the VO Front-End. + * Low-frequency, high-confidence absolute poses from the CVGL Module. + It continuously fuses these constraints in a pose-graph optimization framework (e.g., g2o or Ceres Solver). When the VO Front-End provides a new relative pose, it is quickly added to the graph to produce the "Initial Pose" (<5s). When the CVGL Module provides a new absolute pose, it triggers a more comprehensive re-optimization of the entire graph, correcting drift and broadcasting "Refined Poses" to the user.11 + +## **3. Core Architectural Framework: Hybrid Visual-Geolocalization SLAM (VG-SLAM)** + +### **Rationale for the Hybrid Approach** + +The core constraints of this problem—monocular, IMU-less flight over potentially long distances (up to 3000 images at \~100m intervals equates to a 300km flight) [User Query]—render simple solutions unviable. + +A **VO-Only** system is guaranteed to fail. Monocular Visual Odometry (and SLAM) suffers from an inherent, unobservable ambiguity: the *scale* of the world.1 Because there is no IMU to provide an accelerometer-based scale reference or a gravity vector 12, the system has no way to know if it moved 1 meter or 10 meters. 
This leads to compounding scale drift, where the entire trajectory will grow or shrink over time.3 Over a 300km flight, the resulting positional error would be measured in kilometers, not the 20-50 meters required [User Query]. + +A **CVGL-Only** system is also unviable. Cross-View Geolocalization (CVGL) matches the UAV image to a satellite map to find an absolute pose.7 While this is drift-free, it is a large-scale image retrieval problem. Querying the entire map of Ukraine for a match for every single frame is computationally impossible within the <5 second time limit.13 Furthermore, this approach is brittle; if the Google Maps data is outdated (a specific user restriction) [User Query], the CVGL match will fail, and the system would have no pose estimate at all. + +Therefore, the **Hybrid VG-SLAM** architecture is the only robust solution. + +* The **VO Front-End** provides the fast, high-frequency relative motion. It works even if the satellite map is outdated, as it tracks features in the *real*, current world. +* The **CVGL Module** acts as the *only* mechanism for scale correction and absolute georeferencing. It provides periodic, drift-free "anchors" to the real-world GPS coordinates. +* The **Back-End Optimizer** fuses these two data streams. The CVGL poses function as "global loop closures" in the SLAM pose graph. They correct the scale drift accumulated by the VO and, critically, serve to relocalize the system after a "kidnapping" event, such as the specified sharp turns or 350m outliers [User Query]. + +### **Data Flow for Streaming and Refinement** + +This architecture is explicitly designed to meet the <5s initial output and asynchronous refinement criteria [User Query]. The data flow for a single image (Image N) is as follows: + +* **T \= 0.0s:** Image N (6200x4100) is received by the **Ingestion Module**. +* **T \= 0.2s:** Image N is pre-processed (scaled to 1024px) and passed to the VO and CVGL modules. 
+* **T \= 1.0s:** The **VO Front-End** completes GPU-accelerated matching (SuperPoint+SuperGlue) of Image N -> Image N-1. It computes the Relative_Pose(N-1 -> N). +* **T \= 1.1s:** The **Back-End Optimizer** receives this Relative_Pose. It appends this pose to the graph relative to the last known pose of N-1. +* **T \= 1.2s:** The Back-End broadcasts the **Initial Pose_N_Est** to the user interface. (**<5s criterion met**). +* **(Parallel Thread) T \= 1.5s:** The **CVGL Module** (on a separate thread) begins its two-stage search for Image N against the Google Maps database. +* **(Parallel Thread) T \= 6.0s:** The CVGL Module successfully finds a high-confidence Absolute_Pose_N_Abs from the satellite match. +* **T \= 6.1s:** The **Back-End Optimizer** receives this new, high-confidence absolute constraint for Image N. +* **T \= 6.2s:** The Back-End triggers a graph re-optimization. This new "anchor" corrects any scale or positional drift for Image N and all surrounding poses in the graph. +* **T \= 6.3s:** The Back-End broadcasts a **Pose_N_Refined** (and Pose_N-1_Refined, Pose_N-2_Refined, etc.) to the user interface. (**Refinement criterion met**). + +## **4. Component Analysis: Front-End (Visual Odometry and Relocalization)** + +The task of the VO Front-End is to rapidly and robustly estimate the 6-DoF relative motion between consecutive frames. This component's success is paramount for the high-frequency tracking required to meet the <5s criterion. + +The primary challenge is the nature of the imagery. The specified operational area and sample images (e.g., Image 1, Image 7) show vast, low-texture agricultural fields [User Query]. 
These environments are a known failure case for traditional, gradient-based feature extractors like SIFT or ORB, which rely on high-gradient corners and cannot find stable features in "weak texture areas".5 Furthermore, the non-stabilized camera [User Query] will introduce significant rotational motion and viewpoint change, breaking the assumptions of many simple trackers.16 + +Deep-learning (DL) based feature extractors and matchers have been developed specifically to overcome these "challenging visual conditions".5 Models like SuperPoint, SuperGlue, and LoFTR are trained to find more robust and repeatable features, even in low-texture scenes.4 + +### **Table 1: Analysis of State-of-the-Art Feature Extraction and Matching Techniques** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **SIFT + BFMatcher/FLANN** (OpenCV) | - Scale and rotation invariant. - High-quality, robust matches. - Well-studied and mature.15 | - Computationally slow (CPU-based). - Poor performance in low-texture or weakly-textured areas.14 - Patented (though expired). | - High-contrast, well-defined features. | **Poor.** Too slow for the <5s target and will fail to find features in the low-texture agricultural landscapes shown in sample images. | +| **ORB + BFMatcher** (OpenCV) | - Extremely fast and lightweight. - Standard for real-time SLAM (e.g., ORB-SLAM).21 - Rotation invariant. | - *Not* scale invariant (uses a pyramid). - Performs very poorly in low-texture scenes.5 - Unstable in high-blur scenarios. | - CPU, lightweight. - High-gradient corners. | **Very Poor.** While fast, it fails on the *robustness* requirement. It is designed for textured, indoor/urban scenes, not sparse, natural terrain. 
| +| **SuperPoint + SuperGlue** (PyTorch, C++/TensorRT) | - SOTA robustness in low-texture, high-blur, and challenging conditions.4 - End-to-end learning for detection and matching.24 - Multiple open-source SLAM integrations exist (e.g., SuperSLAM).25 | - Requires a powerful GPU for real-time performance. - Sparse feature-based (not dense). | - NVIDIA GPU (RTX 2060+). - PyTorch (research) or TensorRT (deployment).26 | **Excellent.** This approach is *designed* for the exact "challenging conditions" of this problem. It provides SOTA robustness in low-texture scenes.4 The user's hardware (RTX 2060+) meets the requirements. | +| **LoFTR** (PyTorch) | - Detector-free dense matching.14 - Extremely robust to viewpoint and texture challenges.14 - Excellent performance on natural terrain and low-overlap images.19 | - High computational and VRAM cost. - Can cause CUDA Out-of-Memory (OOM) errors on very high-resolution images.30 - Slower than sparse-feature methods. | - High-end NVIDIA GPU. - PyTorch. | **Good, but Risky.** While its robustness is excellent, its dense, Transformer-based nature makes it vulnerable to OOM errors on the 6252x4168 images.30 The sparse SuperPoint approach is a safer, more-scalable choice for the VO front-end. 
| + +### **Selected Approach (VO Front-End): SuperPoint + SuperGlue/LightGlue** + +The selected approach is a VO front-end based on **SuperPoint** for feature extraction and **SuperGlue** (or its faster successor, **LightGlue**) for matching.18 + +* **Robustness:** This combination is proven to provide superior robustness and accuracy in sparse-texture scenes, extracting more and higher-quality matches than ORB.4 +* **Performance:** It is designed for GPU acceleration and is used in SOTA real-time SLAM systems, demonstrating its feasibility within the <5s target on an RTX 2060.25 +* **Scalability:** As a sparse-feature method, it avoids the memory-scaling issues of dense matchers like LoFTR when faced with the user's maximum 6252x4168 resolution.30 The image can be downscaled for real-time VO, and SuperPoint will still find stable features. + +## **5. Component Analysis: Back-End (Trajectory Optimization and Refinement)** + +The task of the Back-End is to fuse all incoming measurements (high-frequency/low-accuracy relative VO poses, low-frequency/high-accuracy absolute CVGL poses) into a single, globally consistent trajectory. This component's design is dictated by the user's real-time streaming and refinement requirements [User Query]. + +A critical architectural choice must be made between a traditional, batch **Structure from Motion (SfM)** pipeline and a real-time **SLAM (Simultaneous Localization and Mapping)** pipeline. + +* **Batch SfM:** (e.g., COLMAP).32 This approach is an offline process. It collects all 1500-3000 images, performs feature matching, and then runs a large, non-real-time "Bundle Adjustment" (BA) to solve for all camera poses and 3D points simultaneously.35 While this produces the most accurate possible result, it can take hours to compute. It *cannot* meet the <5s/image or "immediate results" criteria. +* **Real-time SLAM:** (e.g., ORB-SLAM3).28 This approach is *online* and *incremental*. 
It maintains a "pose graph" of the trajectory.10 It provides an immediate pose estimate based on the VO front-end. When a new, high-quality measurement arrives (like a loop closure 37, or in our case, a CVGL fix), it triggers a fast re-optimization of the graph, publishing a *refined* result.11 + +The user's requirements for "results...appear immediately" and "system could refine existing calculated results" [User Query] are a textbook description of a real-time SLAM back-end. + +### **Table 2: Analysis of Trajectory Optimization Strategies** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **Incremental SLAM (Pose-Graph Optimization)** (g2o, Ceres Solver, GTSAM) | - **Real-time / Online:** Provides immediate pose estimates. - **Supports Refinement:** Explicitly designed to refine past poses when new "loop closure" (CVGL) data arrives.10 - Meets the <5s and streaming criteria. | - Initial estimate is less accurate than a full batch process. - Susceptible to drift *until* a loop closure (CVGL fix) is made. | - A graph optimization library (g2o, Ceres). - A robust cost function to reject outliers. | **Excellent.** This is the *only* architecture that satisfies the user's real-time streaming and asynchronous refinement constraints. | +| **Batch Structure from Motion (Global Bundle Adjustment)** (COLMAP, Agisoft Metashape) | - **Globally Optimal Accuracy:** Produces the most accurate possible 3D reconstruction and trajectory.35 - Can import custom DL matches.38 | - **Offline:** Cannot run in real-time or stream results. - High computational cost (minutes to hours). - Fails all timing and streaming criteria. | - All images must be available before processing starts. - High RAM and CPU. | **Unsuitable (for the *online* system).** This approach is ideal for an *optional, post-flight, high-accuracy* refinement, but it cannot be the primary system. 
| + +### **Selected Approach (Back-End): Incremental Pose-Graph Optimization (g2o/Ceres)** + +The system's back-end will be built as an **Incremental Pose-Graph Optimizer** using a library like **g2o** or **Ceres Solver**. This is the only way to meet the real-time streaming and refinement constraints [User Query]. + +The graph will contain: + +* **Nodes:** The 6-DoF pose of each camera frame. +* **Edges (Constraints):** + 1. **Odometry Edges:** Relative 6-DoF transforms from the VO Front-End (SuperPoint+SuperGlue). These are high-frequency but have accumulating drift/scale error. + 2. **Georeferencing Edges:** Absolute 6-DoF poses from the CVGL Module. These are low-frequency but are drift-free and provide the absolute scale. + 3. **Start-Point Edge:** A high-confidence absolute pose for Image 1, fixed to the user-provided start GPS. + +This architecture allows the system to provide an immediate estimate (from odometry) and then drastically improve its accuracy (correcting scale and drift) whenever a new georeferencing edge is added. + +## **6. Component Analysis: Global-Pose Correction (Georeferencing Module)** + +This module is the most critical component for meeting the accuracy requirements. Its task is to provide absolute GPS pose estimates by matching the UAV's nadir-pointing-but-non-stabilized images to the Google Maps satellite provider [User Query]. This is the only component that can correct the monocular scale drift. + +This task is known as **Cross-View Geolocalization (CVGL)**.7 It is extremely challenging due to the "domain gap" 44 between the two image sources: + +1. **Viewpoint:** The UAV is at low altitude (<1km) and non-nadir (due to fixed-wing tilt) 45, while the satellite is at a very high altitude and is perfectly nadir. +2. **Appearance:** The images come from different sensors, with different lighting (shadows), and at different times. 
The Google Maps data may be "outdated" [User Query], showing different seasons, vegetation, or man-made structures.47 + +A simple, brute-force feature match is computationally impossible. The solution is a **hierarchical, two-stage approach** that mimics SOTA research 7: + +* **Stage 1: Coarse Retrieval.** We cannot run expensive matching against the entire map. Instead, we treat this as an image retrieval problem. We use a Deep Learning model (e.g., a Siamese or Dual CNN trained on this task 50) to generate a compact "embedding vector" (a digital signature) for the UAV image. In an offline step, we pre-compute embeddings for *all* satellite map tiles in the operational area. The UAV image's embedding is then used to perform a very fast (e.g., FAISS library) similarity search against the satellite database, returning the Top-K most likely-matching satellite tiles. +* **Stage 2: Fine-Grained Pose.** *Only* for these Top-K candidates do we perform the heavy-duty feature matching. We use our selected **SuperPoint+SuperGlue** matcher 53 to find precise correspondences between the UAV image and the K satellite tiles. If a high-confidence geometric match (e.g., >50 inliers) is found, we can compute the precise 6-DoF pose of the UAV relative to that tile, thus yielding an absolute GPS coordinate. + +### **Table 3: Analysis of State-of-the-Art Cross-View Geolocalization (CVGL) Techniques** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **Coarse Retrieval (Siamese/Dual CNNs)** (PyTorch, ResNet18) | - Extremely fast for retrieval (database lookup). - Learns features robust to seasonal and appearance changes.50 - Narrows search space from millions to a few. | - Does *not* provide a precise 6-DoF pose, only a "best match" tile. - Requires training on a dataset of matched UAV-satellite pairs. 
| - Pre-trained model (e.g., on ResNet18).52 - Pre-computed satellite embedding database. | **Essential (as Stage 1).** This is the only computationally feasible way to "find" the UAV on the map. | +| **Fine-Grained Feature Matching** (SuperPoint + SuperGlue) | - Provides a highly-accurate 6-DoF pose estimate.53 - Re-uses the same robust matcher from the VO Front-End.54 | - Too slow to run on the entire map. - *Requires* a good initial guess (from Stage 1) to be effective. | - NVIDIA GPU. - Top-K candidate tiles from Stage 1. | **Essential (as Stage 2).** This is the component that actually computes the precise GPS pose from the coarse candidates. | +| **End-to-End DL Models (Transformers)** (PFED, ReCOT, etc.) | - SOTA accuracy in recent benchmarks.13 - Can be highly efficient (e.g., PFED).13 - Can perform retrieval and pose estimation in one model. | - Often research-grade, not robustly open-sourced. - May be complex to train and deploy. - Less modular and harder to debug than the two-stage approach. | - Specific, complex model architectures.13 - Large-scale training datasets. | **Not Recommended (for initial build).** While powerful, these are less practical for a version 1 build. The two-stage approach is more modular, debuggable, and uses components already required by the VO system. | + +### **Selected Approach (CVGL Module): Hierarchical Retrieval + Matching** + +The CVGL module will be implemented as a two-stage hierarchical system: + +1. **Stage 1 (Coarse):** A **Siamese CNN** 52 (or similar model) generates an embedding for the UAV image. This embedding is used to retrieve the Top-5 most similar satellite tiles from a pre-computed database. +2. **Stage 2 (Fine):** The **SuperPoint+SuperGlue** matcher 53 is run between the UAV image and these 5 tiles. The match with the highest inlier count and lowest reprojection error is used to calculate the absolute 6-DoF pose, which is then sent to the Back-End optimizer. + +## **7. 
Addressing Critical Acceptance Criteria and Failure Modes** + +This hybrid architecture's logic is designed to handle the most difficult acceptance criteria [User Query] through a robust, multi-stage escalation process. + +### **Stage 1: Initial State (Normal Operation)** + +* **Condition:** VO(N-1 -> N) succeeds. +* **System Logic:** The **VO Front-End** provides the high-frequency relative pose. This is added to the graph, and the **Initial Pose** is sent to the user (<5s). +* **Resolution:** The **CVGL Module** runs asynchronously to provide a Refined Pose later, which corrects for scale drift. + +### **Stage 2: Transient Failure / Outlier Handling (AC-3)** + +* **Condition:** VO(N-1 -> N) fails (e.g., >350m jump, severe motion blur, low overlap) [User Query]. This triggers an immediate, high-priority CVGL(N) query. +* **System Logic:** + 1. If CVGL(N) *succeeds*, the system has conflicting data: a failed VO link and a successful CVGL pose. The **Back-End Optimizer** uses a robust kernel to reject the high-error VO link as an outlier and accepts the CVGL pose.56 The trajectory "jumps" to the correct location, and VO resumes from Image N+1. + 2. If CVGL(N) *also fails* (e.g., due to cloud cover or outdated map), the system assumes Image N is a single bad frame (an outlier). +* **Resolution (Frame Skipping):** The system buffers Image N and, upon receiving Image N+1, the **VO Front-End** attempts to "bridge the gap" by matching VO(N-1 -> N+1). + * **If successful,** a pose for N+1 is found. Image N is marked as a rejected outlier, and the system continues. + * **If VO(N-1 -> N+1) fails,** it repeats for VO(N-1 -> N+2). + * If this "bridging" fails for 3 consecutive frames, the system concludes it is not a transient outlier but a persistent tracking loss. This escalates to Stage 3. 
+ +### **Stage 3: Persistent Tracking Loss / Sharp Turn Handling (AC-4)** + +* **Condition:** VO tracking is lost, and the "frame-skipping" in Stage 2 fails (e.g., a "sharp turn" with no overlap) [User Query]. +* **System Logic (Multi-Map "Chunking"):** The **Back-End Optimizer** declares a "Tracking Lost" state and creates a *new, independent map* ("Chunk 2"). + * The **VO Front-End** is re-initialized and begins populating this new chunk, tracking VO(N+3 -> N+4), VO(N+4 -> N+5), etc. This new chunk is internally consistent but has no absolute GPS position (it is "floating"). +* **Resolution (Asynchronous Relocalization):** + 1. The **CVGL Module** now runs asynchronously on all frames in this new "Chunk 2". + 2. Crucially, it uses the last known GPS coordinate from "Chunk 1" as a *search prior*, narrowing the satellite map search area to the vicinity. + 3. The system continues to build Chunk 2 until the CVGL module successfully finds a high-confidence Absolute_Pose for *any* frame in that chunk (e.g., for Image N+20). + 4. Once this single GPS "anchor" is found, the **Back-End Optimizer** performs a full graph optimization. It calculates the 7-DoF transformation (3D position, 3D rotation, and **scale**) to align all of Chunk 2 and merge it with Chunk 1. + 5. This "chunking" method robustly handles the "correctly continue the work" criterion by allowing the system to keep tracking locally even while globally lost, confident it can merge the maps later. + +### **Stage 4: Catastrophic Failure / User Intervention (AC-6)** + +* **Condition:** The system has entered Stage 3 and is building "Chunk 2," but the **CVGL Module** has *also* failed for a prolonged period (e.g., 20% of the route, or 50+ consecutive frames) [User Query]. This is a "worst-case" scenario where the UAV is in an area with no VO features (e.g., over a lake) *and* no CVGL features (e.g., heavy clouds or outdated maps). +* **System Logic:** The system is "absolutely incapable" of determining its pose. 
+* **Resolution (User Input):** The system triggers the "ask the user for input" event. A UI prompt will show the last known good image (from Chunk 1) on the map and the new, "lost" image (e.g., N+50). It will ask the user to "Click on the map to provide a coarse location." This user-provided GPS point is then fed to the CVGL module as a *strong prior*, drastically narrowing the search space and enabling it to re-acquire a lock. + +## **8. Implementation and Output Generation** + +### **Real-time Workflow (<5s Initial, Async Refinement)** + +A concrete implementation plan for processing Image N: + +1. **T=0.0s:** Image[N] (6200px) received. +2. **T=0.1s:** Image pre-processed: Scaled to 1024px for VO/CVGL. Full-res original stored. +3. **T=0.5s:** **VO Front-End** (GPU): SuperPoint features extracted for 1024px image. +4. **T=1.0s:** **VO Front-End** (GPU): SuperGlue matches 1024px Image[N] -> 1024px Image[N-1]. Relative_Pose (6-DoF) estimated via RANSAC/PnP. +5. **T=1.1s:** **Back-End:** Relative_Pose added to graph. Optimizer updates trajectory. +6. **T=1.2s:** **OUTPUT:** Initial Pose_N_Est (GPS) sent to user. **(<5s criterion met)**. +7. **T=1.3s:** **CVGL Module (Async Task)** (GPU): Siamese/Dual CNN generates embedding for 1024px Image[N]. +8. **T=1.5s:** **CVGL Module (Async Task):** Coarse retrieval (FAISS lookup) returns Top-5 satellite tile candidates. +9. **T=4.0s:** **CVGL Module (Async Task)** (GPU): Fine-grained matching. SuperPoint+SuperGlue runs 5 times (Image[N] vs. 5 satellite tiles). +10. **T=4.5s:** **CVGL Module (Async Task):** A high-confidence match is found. Absolute_Pose_N_Abs (6-DoF) is computed. +11. **T=4.6s:** **Back-End:** High-confidence Absolute_Pose_N_Abs added to pose graph. Graph re-optimization is triggered. +12. **T=4.8s:** **OUTPUT:** Pose_N_Refined (GPS) sent to user. **(Refinement criterion met)**. 
+ +### **Determining Object-Level GPS (from Pixel Coordinate)** + +The requirement to find the "coordinates of the center of any object in these photos" [User Query] is met by projecting a pixel to its 3D world coordinate. This requires the (u,v) pixel, the camera's 6-DoF pose, and the camera's intrinsic matrix (K). + +Two methods will be implemented to support the streaming/refinement architecture: + +1. **Method 1 (Immediate, <5s): Flat-Earth Projection.** + * When the user clicks pixel (u,v) on Image[N], the system uses the *Initial Pose_N_Est*. + * It assumes the ground is a flat plane at the predefined altitude (e.g., 900m altitude if flying at 1km and ground is at 100m) [User Query]. + * It computes the 3D ray from the camera center through (u,v) using the intrinsic matrix (K). + * It calculates the 3D intersection point of this ray with the flat ground plane. + * This 3D world point is converted to a GPS coordinate and sent to the user. This is very fast but less accurate in non-flat terrain. +2. **Method 2 (Refined, Post-BA): Structure-from-Motion Projection.** + * The Back-End's pose-graph optimization, as a byproduct, will create a sparse 3D point cloud of the world (i.e., the "SfM" part of SLAM).35 + * When the user clicks (u,v), the system uses the *Pose_N_Refined*. + * It raycasts from the camera center through (u,v) and finds the 3D intersection point with the *actual 3D point cloud* generated by the system. + * This 3D point's coordinate (X,Y,Z) is converted to GPS. This is far more accurate as it accounts for real-world topography (hills, ditches) captured in the 3D map. + +## **9. Testing and Validation Strategy** + +A rigorous testing strategy is required to validate all 10 acceptance criteria. The foundation of this strategy is the creation of a **Ground-Truth Test Dataset**. This will involve flying several test routes and manually creating a "checkpoint" (CP) file, similar to the provided coordinates.csv 58, using a high-precision RTK/PPK GPS. 
This provides the "real GPS" for validation.59 + +### **Accuracy Validation Methodology (AC-1, AC-2, AC-5, AC-8, AC-9)** + +These tests validate the system's accuracy and completion metrics.59 + +1. A test flight of 1000 images with high-precision ground-truth CPs is prepared. +2. The system is run given only the first GPS coordinate. +3. A test script compares the system's *final refined GPS output* for each image against its *ground-truth CP*. The Haversine distance (error in meters) is calculated for all 1000 images. +4. This yields a list of 1000 error values. +5. **Test_Accuracy_50m (AC-1):** ASSERT (count(errors < 50m) / 1000) >= 0.80 +6. **Test_Accuracy_20m (AC-2):** ASSERT (count(errors < 20m) / 1000) >= 0.60 +7. **Test_Outlier_Rate (AC-5):** ASSERT (count(un-localized_images) / 1000) < 0.10 +8. **Test_Image_Registration_Rate (AC-8):** ASSERT (count(localized_images) / 1000) > 0.95 +9. **Test_Mean_Reprojection_Error (AC-9):** ASSERT (Back-End.final_MRE) < 1.0 +10. **Test_RMSE:** The overall Root Mean Square Error (RMSE) of the entire trajectory will be calculated as a primary performance benchmark.59 + +### **Integration and Functional Tests (AC-3, AC-4, AC-6)** + +These tests validate the system's logic and robustness to failure modes.62 + +* Test_Low_Overlap_Relocalization (AC-4): + * **Setup:** Create a test sequence of 50 images. From this, manually delete images 20-24 (simulating 5 lost frames during a sharp turn).63 + * **Test:** Run the system on this "broken" sequence. + * **Pass/Fail:** The system must report "Tracking Lost" at frame 20, initiate a new "chunk," and then "Tracking Re-acquired" and "Maps Merged" when the CVGL module successfully localizes frame 25 (or a subsequent frame). The final trajectory error for frame 25 must be < 50m. +* Test_350m_Outlier_Rejection (AC-3): + * **Setup:** Create a test sequence. At image 30, insert a "rogue" image (Image 30b) known to be 350m away. 
+ * **Test:** Run the system on this sequence (..., 29, 30, 30b, 31,...). + * **Pass/Fail:** The system must correctly identify Image 30b as an outlier (RANSAC failure 56), reject it (or jump to its CVGL-verified pose), and "correctly continue the work" by successfully tracking Image 31 from Image 30 (using the frame-skipping logic). The trajectory must not be corrupted. +* Test_User_Intervention_Prompt (AC-6): + * **Setup:** Create a test sequence with 50 consecutive "bad" frames (e.g., pure sky, lens cap) to ensure the transient and chunking logics are bypassed. + * **Test:** Run the system. + * **Pass/Fail:** The system must enter a "LOST" state, attempt and fail to relocalize via CVGL for 50 frames, and then correctly trigger the "ask for user input" event. + +### **Non-Functional Tests (AC-7, AC-8, Hardware)** + +These tests validate performance and resource requirements.66 + +* Test_Performance_Per_Image (AC-7): + * **Setup:** Run the 1000-image test set on the minimum-spec RTX 2060. + * **Test:** Measure the time from "Image In" to "Initial Pose Out" for every frame. + * **Pass/Fail:** ASSERT average_time < 5.0s. +* Test_Streaming_Refinement (AC-8): + * **Setup:** Run the 1000-image test set. + * **Test:** A logger must verify that *two* poses are received for >80% of images: an "Initial" pose (T < 5s) and a "Refined" pose (T > 5s, after CVGL). + * **Pass/Fail:** The refinement mechanism is functioning correctly. +* Test_Scalability_Large_Route (Constraints): + * **Setup:** Run the system on a full 3000-image dataset. + * **Test:** Monitor system RAM, VRAM, and processing time per frame over the entire run. + * **Pass/Fail:** The system must complete the run without memory leaks, and the processing time per image must not degrade significantly as the pose graph grows. + +Identify all potential weak points and problems. Address them and find out ways to solve them. Based on your findings, form a new solution draft in the same format. 
+ +If your finding requires a complete reorganization of the flow and different components, state it. +Put all the findings regarding what was weak and poor at the beginning of the report. Put here all new findings, what was updated, replaced, or removed from the previous solution. + +Then form a new solution design without referencing the previous system. Remove Poor and Very Poor component choices from the component analysis tables, but leave Good and Excellent ones. +In the updated report, do not put "new" marks, do not compare to the previous solution draft, just make a new solution as if from scratch \ No newline at end of file diff --git a/docs/00_problem/1.4_02_assesment_prompt copy.md b/docs/00_problem/1.4_02_assesment_prompt copy.md new file mode 100644 index 0000000..d6b23ee --- /dev/null +++ b/docs/00_problem/1.4_02_assesment_prompt copy.md @@ -0,0 +1,325 @@ +Read carefully about the problem: + + We have a lot of images taken from a wing-type UAV using a camera with at least Full HD resolution. Resolution of each photo could be up to 6200*4100 for the whole flight, but for other flights, it could be FullHD +Photos are taken and named consecutively within 100 meters of each other. +We know only the starting GPS coordinates. We need to determine the GPS of the centers of each image. And also the coordinates of the center of any object in these photos. We can use an external satellite provider for ground checks on the existing photos + + System has next restrictions and conditions: + - Photos are taken by only airplane type UAVs. + - Photos are taken by the camera pointing downwards and fixed, but it is not autostabilized. + - The flying range is restricted by the eastern and southern parts of Ukraine (To the left of the Dnipro River) + - The image resolution could be from FullHD to 6252*4168. Camera parameters are known: focal length, sensor width, resolution and so on. + - Altitude is predefined and no more than 1km. 
The height of the terrain can be neglected. + - There is NO data from IMU + - Flights are done mostly in sunny weather + - We can use satellite providers, but we're limited right now to Google Maps, which could be outdated for some regions + - Number of photos could be up to 3000, usually in the 500-1500 range + - During the flight, UAVs can make sharp turns, so that the next photo may be absolutely different from the previous one (no same objects), but it is rather an exception than the rule + - Processing is done on a stationary computer or laptop with NVidia GPU at least RTX2060, better 3070. (For the UAV solution Jetson Orin Nano would be used, but that is out of scope.) + + Output of the system should address next acceptance criteria: + - The system should find out the GPS of centers of 80% of the photos from the flight within an error of no more than 50 meters in comparison to the real GPS + - The system should find out the GPS of centers of 60% of the photos from the flight within an error of no more than 20 meters in comparison to the real GPS + - The system should correctly continue the work even in the presence of up to 350 meters of an outlier photo between 2 consecutive pictures en route. This could happen due to tilt of the plane. + - System should correctly continue the work even during sharp turns, where the next photo doesn't overlap at all, or overlaps in less than 5%. 
The next photo should be in less than 150m drift and at an angle of less than 50% + - The number of outliers during the satellite provider images ground check should be less than 10% + - In case the system is absolutely incapable of determining the GPS of the next, second next, and third next images by any means (these 20% of the route), then it should ask the user for input for the next image, so that the user can specify the location + - Less than 5 seconds for processing one image + - Results of image processing should appear immediately to user, so that user shouldn't wait for the whole route to complete in order to analyze first results. Also, system could refine existing calculated results and send refined results again to user + - Image Registration Rate > 95%. The system can find enough matching features to confidently calculate the camera's 6-DoF pose (position and orientation) and "stitch" that image into the final trajectory + - Mean Reprojection Error (MRE) < 1.0 pixels. The distance, in pixels, between the original pixel location of the object and the re-projected pixel location. + + Here is a solution draft: + + # **GEORTOLS-SA UAV Image Geolocalization in IMU-Denied Environments** + + The GEORTOLS-SA system is an asynchronous, four-component software solution designed for deployment on an NVIDIA RTX 2060+ GPU. It is architected from the ground up to handle the specific challenges of IMU-denied, scale-aware localization and real-time streaming output. + + ### **Product Solution Description** + + * **Inputs:** + 1. A sequence of consecutively named images (FullHD to 6252x4168). + 2. The absolute GPS coordinate (Latitude, Longitude) for the first image (Image 0). + 3. A pre-calibrated camera intrinsic matrix ($K$). + 4. The predefined, absolute metric altitude of the UAV ($H$, e.g., 900 meters). + 5. API access to the Google Maps satellite provider. + * **Outputs (Streaming):** + 1. 
**Initial Pose (T \< 5s):** A high-confidence, *metric-scale* estimate ($Pose\_N\_Est$) of the image's 6-DoF pose and GPS coordinate. This is sent to the user immediately upon calculation (AC-7, AC-8). + 2. **Refined Pose (T > 5s):** A globally-optimized pose ($Pose\_N\_Refined$) sent asynchronously as the back-end optimizer fuses data from the CVGL module (AC-8). + + ### **Component Interaction Diagram and Data Flow** + + The system is architected as four parallel-processing components to meet the stringent real-time and refinement requirements. + + 1. **Image Ingestion & Pre-processing:** This module receives the new, high-resolution Image_N. It immediately creates two copies: + * Image_N_LR (Low-Resolution, e.g., 1536x1024): This copy is immediately dispatched to the SA-VO Front-End for real-time processing. + * Image_N_HR (High-Resolution, 6.2K): This copy is stored and made available to the CVGL Module for its asynchronous, high-accuracy matching pipeline. + 2. **Scale-Aware VO (SA-VO) Front-End (High-Frequency Thread):** This component's sole task is high-speed, *metric-scale* relative pose estimation. It matches Image_N_LR to Image_N-1_LR, computes the 6-DoF relative transform, and critically, uses the "known altitude" ($H$) constraint to recover the absolute scale (detailed in Section 3.0). It sends this high-confidence Relative_Metric_Pose to the Back-End. + 3. **Cross-View Geolocalization (CVGL) Module (Low-Frequency, Asynchronous Thread):** This is a heavier, slower module. It takes Image_N (both LR and HR) and queries the Google Maps database to find an *absolute GPS pose*. When a high-confidence match is found, its Absolute_GPS_Pose is sent to the Back-End as a global "anchor" constraint. + 4. 
**Trajectory Optimization Back-End (Central Hub):** This component manages the complete flight trajectory as a pose graph.10 It continuously fuses two distinct, high-quality data streams: + * **On receiving Relative_Metric_Pose (T \< 5s):** It appends this pose to the graph, calculates the Pose_N_Est, and **sends this initial result to the user (AC-7, AC-8 met)**. + * **On receiving Absolute_GPS_Pose (T > 5s):** It adds this as a high-confidence "global anchor" constraint 12, triggers a full graph re-optimization to correct any minor biases, and **sends the Pose_N_Refined to the user (AC-8 refinement met)**. + + ### + + ### **VO "Trust Model" of GEORTOLS-SA** + + In GEORTOLS-SA, the trust model: + + * The **SA-VO Front-End** is now *highly trusted* for its local, frame-to-frame *metric* accuracy. + * The **CVGL Module** remains *highly trusted* for its *global* (GPS) accuracy. + + Both components are operating in the same scale-aware, metric space. The Back-End's job is no longer to fix a broken, drifting VO. Instead, it performs a robust fusion of two independent, high-quality metric measurements.12 + + This model is self-correcting. If the user's predefined altitude $H$ is slightly incorrect (e.g., entered as 900m but is truly 880m), the SA-VO front-end will be *consistently* off by a small percentage. The periodic, high-confidence CVGL "anchors" will create a consistent, low-level "tension" in the pose graph. The graph optimizer (e.g., Ceres Solver) 3 will resolve this tension by slightly "pulling" the SA-VO poses to fit the global anchors, effectively *learning* and correcting for the altitude bias. This robust fusion is the key to meeting the 20-meter and 50-meter accuracy targets (AC-1, AC-2). + + ## **3.0 Core Component: The Scale-Aware Visual Odometry (SA-VO) Front-End** + + This component is the new, critical engine of the system. 
Its sole task is to compute the *metric-scale* 6-DoF relative motion between consecutive frames, thereby eliminating scale drift at its source. + + ### **3.1 Rationale and Mechanism for Per-Frame Scale Recovery** + + The SA-VO front-end implements a geometric algorithm to recover the absolute scale $s$ for *every* frame-to-frame transition. This algorithm directly leverages the query's "known altitude" ($H$) and "planar ground" constraints.5 + + The SA-VO algorithm for processing Image_N (relative to Image_N-1) is as follows: + + 1. **Feature Matching:** Extract and match robust features between Image_N and Image_N-1 using the selected feature matcher (see Section 3.2). This yields a set of corresponding 2D pixel coordinates. + 2. **Essential Matrix:** Use RANSAC (Random Sample Consensus) and the camera intrinsic matrix $K$ to compute the Essential Matrix $E$ from the "inlier" correspondences.2 + 3. **Pose Decomposition:** Decompose $E$ to find the relative Rotation $R$ and the *unscaled* translation vector $t$, where the magnitude $||t||$ is fixed to 1.2 + 4. **Triangulation:** Triangulate the 3D-world points $X$ for all inlier features using the unscaled pose $[R \mid t]$.15 These 3D points ($X_i$) are now in a local, *unscaled* coordinate system (i.e., we know the *shape* of the point cloud, but not its *size*). + 5. **Ground Plane Fitting:** The query states "terrain height can be neglected," meaning we assume a planar ground. A *second* RANSAC pass is performed, this time fitting a 3D plane to the set of triangulated 3D points $X$. The inliers to this RANSAC are identified as the ground points $X_g$.5 This method is highly robust as it does not rely on a single point, but on the consensus of all visible ground features.16 + 6. **Unscaled Height ($h$):** From the fitted plane equation $n^T X + d = 0$, the parameter $d$ represents the perpendicular distance from the camera (at the coordinate system's origin) to the computed ground plane. This is our *unscaled* height $h$. 
+ 7. **Scale Computation:** We now have two values: the *real, metric* altitude $H$ (e.g., 900m) provided by the user, and our *computed, unscaled* altitude $h$. The absolute scale $s$ for this frame is the ratio of these two values: $s = H / h$. + 8. **Metric Pose:** The final, metric-scale relative pose is $[R \mid T]$, where the metric translation $T = s * t$. This high-confidence, scale-aware pose is sent to the Back-End. + + ### **3.2 Feature Matching Sub-System Analysis** + + The success of the SA-VO algorithm depends *entirely* on the quality of the initial feature matches, especially in the low-texture agricultural terrain specified in the query. The system requires a matcher that is both robust (for sparse textures) and extremely fast (for AC-7). + + The initial draft's choice of SuperGlue 17 is a strong, proven baseline. However, its successor, LightGlue 18, offers a critical, non-obvious advantage: **adaptivity**. + + The UAV flight is specified as *mostly* straight, with high overlap. Sharp turns (AC-4) are "rather an exception." This means \~95% of our image pairs are "easy" to match, while 5% are "hard." + + * SuperGlue uses a fixed-depth Graph Neural Network (GNN), spending the *same* (large) amount of compute on an "easy" pair as a "hard" pair.19 This is inefficient. + * LightGlue is *adaptive*.19 For an easy, high-overlap pair, it can exit early (e.g., at layer 3/9), returning a high-confidence match in a fraction of the time. For a "hard" low-overlap pair, it will use its full depth to get the best possible result.19 + + By using LightGlue, the system saves *enormous* amounts of computational budget on the 95% of "easy" frames, ensuring it *always* meets the \<5s budget (AC-7) and reserving that compute for the harder CVGL tasks. 
LightGlue is a "plug-and-play replacement" 19 that is faster, more accurate, and easier to train.19 + + ### **Table 1: Analysis of State-of-the-Art Feature Matchers (For SA-VO Front-End)** + + | Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | + | :---- | :---- | :---- | :---- | :---- | + | **SuperPoint + SuperGlue** 17 | - SOTA robustness in low-texture, high-blur conditions. - GNN reasons about 3D scene context. - Proven in real-time SLAM systems.22 | - Computationally heavy (fixed-depth GNN). - Slower than LightGlue.19 - Training is complex.19 | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT.25 | **Good.** A solid, baseline choice. Meets robustness needs but will heavily tax the \<5s time budget (AC-7). | + | **SuperPoint + LightGlue** 18 | - **Adaptive Depth:** Faster on "easy" pairs, more accurate on "hard" pairs.19 - **Faster & Lighter:** Outperforms SuperGlue on speed and accuracy.19 - **Easier to Train:** Simpler architecture and loss.19 - Direct plug-and-play replacement for SuperGlue. | - Newer, less long-term-SLAM-proven than SuperGlue (though rapidly being adopted). | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT.28 | **Excellent (Selected).** The adaptive nature is *perfect* for this problem. It saves compute on the 95% of easy (straight) frames, preserving the budget for the 5% of hard (turn) frames, maximizing our ability to meet AC-7. | + + ### **3.3 Selected Approach (SA-VO): SuperPoint + LightGlue** + + The SA-VO front-end will be built using: + + * **Detector:** **SuperPoint** 24 to detect sparse, robust features on the Image_N_LR. + * **Matcher:** **LightGlue** 18 to match features from Image_N_LR to Image_N-1_LR. + + This combination provides the SOTA robustness required for low-texture fields, while LightGlue's adaptive performance 19 is the key to meeting the \<5s (AC-7) real-time requirement. 
+ + ## **4.0 Global Anchoring: The Cross-View Geolocalization (CVGL) Module** + + With the SA-VO front-end handling metric scale, the CVGL module's task is refined. Its purpose is no longer to *correct scale*, but to provide *absolute global "anchor" poses*. This corrects for any accumulated bias (e.g., if the $h$ prior is off by 5m) and, critically, *relocalizes* the system after a persistent tracking loss (AC-4). + + ### **4.1 Hierarchical Retrieval-and-Match Pipeline** + + This module runs asynchronously and is computationally heavy. A brute-force search against the entire Google Maps database is impossible. A two-stage hierarchical pipeline is required: + + 1. **Stage 1: Coarse Retrieval.** This is treated as an image retrieval problem.29 + * A **Siamese CNN** 30 (or similar Dual-CNN architecture) is used to generate a compact "embedding vector" (a digital signature) for the Image_N_LR. + * An embedding database will be pre-computed for *all* Google Maps satellite tiles in the specified Eastern Ukraine operational area. + * The UAV image's embedding is then used to perform a very fast (e.g., FAISS library) similarity search against the satellite database, returning the *Top-K* (e.g., K=5) most likely-matching satellite tiles. + 2. **Stage 2: Fine-Grained Pose.** + * *Only* for these Top-5 candidates, the system performs the heavy-duty **SuperPoint + LightGlue** matching. + * This match is *not* Image_N -> Image_N-1. It is Image_N -> Satellite_Tile_K. + * The match with the highest inlier count and lowest reprojection error (MRE \< 1.0, AC-10) is used to compute the precise 6-DoF pose of the UAV relative to that georeferenced satellite tile. This yields the final Absolute_GPS_Pose. + + ### **4.2 Critical Insight: Solving the Oblique-to-Nadir "Domain Gap"** + + A critical, unaddressed failure mode exists. The query states the camera is **"not autostabilized"** [User Query]. 
On a fixed-wing UAV, this guarantees that during a bank or sharp turn (AC-4), the camera will *not* be nadir (top-down). It will be *oblique*, capturing the ground from an angle. The Google Maps reference, however, is *perfectly nadir*.32 + + This creates a severe "domain gap".33 A CVGL system trained *only* to match nadir-to-nadir images will *fail* when presented with an oblique UAV image.34 This means the CVGL module will fail *precisely* when it is needed most: during the sharp turns (AC-4) when SA-VO tracking is also lost. + + The solution is to *close this domain gap* during training. Since the real-world UAV images will be oblique, the network must be taught to match oblique views to nadir ones. + + Solution: Synthetic Data Generation for Robust Training + The Stage 1 Siamese CNN 30 must be trained on a custom, synthetically-generated dataset.37 The process is as follows: + + 1. Acquire nadir satellite imagery and a corresponding Digital Elevation Model (DEM) for the operational area. + 2. Use this data to *synthetically render* the nadir satellite imagery from a wide variety of *oblique* viewpoints, simulating the UAV's roll and pitch.38 + 3. Create thousands of training pairs, each consisting of (Nadir_Satellite_Tile, Synthetically_Oblique_Tile_Angle_30_Deg). + 4. Train the Siamese network 29 to learn that these two images—despite their *vastly* different appearances—are a *match*. + + This process teaches the retrieval network to be *viewpoint-invariant*.35 It learns to ignore perspective distortion and match the true underlying ground features (road intersections, field boundaries). This is the *only* way to ensure the CVGL module can robustly relocalize the UAV during a sharp turn (AC-4). + + ## **5.0 Trajectory Fusion: The Robust Optimization Back-End** + + This component is the system's central "brain." 
It runs continuously, fusing all incoming measurements (high-frequency/metric-scale SA-VO poses, low-frequency/globally-absolute CVGL poses) into a single, globally consistent trajectory. This component's design is dictated by the requirements for streaming (AC-8), refinement (AC-8), and outlier-rejection (AC-3). + + ### **5.1 Selected Strategy: Incremental Pose-Graph Optimization** + + The user's requirements for "results...appear immediately" and "system could refine existing calculated results" [User Query] are a textbook description of a real-time SLAM back-end.11 A batch Structure from Motion (SfM) process, which requires all images upfront and can take hours, is unsuitable for the primary system. + + ### **Table 2: Analysis of Trajectory Optimization Strategies** + + | Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | + | :---- | :---- | :---- | :---- | :---- | + | **Incremental SLAM (Pose-Graph Optimization)** (g2o 13, Ceres Solver 10, GTSAM) | - **Real-time / Online:** Provides immediate pose estimates (AC-7). - **Supports Refinement:** Explicitly designed to refine past poses when new "loop closure" (CVGL) data arrives (AC-8).11 - **Robust:** Can handle outliers via robust kernels.39 | - Initial estimate is less accurate than a full batch process. - Can drift *if* not anchored (though our SA-VO minimizes this). | - A graph optimization library (g2o, Ceres). - A robust cost function.41 | **Excellent (Selected).** This is the *only* architecture that satisfies all user requirements for real-time streaming and asynchronous refinement. | + | **Batch Structure from Motion (Global Bundle Adjustment)** (COLMAP, Agisoft Metashape) | - **Globally Optimal Accuracy:** Produces the most accurate possible 3D reconstruction and trajectory. | - **Offline:** Cannot run in real-time or stream results. - High computational cost (minutes to hours). - Fails AC-7 and AC-8 completely. 
| - All images must be available before processing starts. - High RAM and CPU. | **Good (as an *Optional* Post-Processing Step).** Unsuitable as the primary online system, but could be offered as an optional, high-accuracy "Finalize Trajectory" batch process after the flight. | + + The system's back-end will be built as an **Incremental Pose-Graph Optimizer** using **Ceres Solver**.10 Ceres is selected due to its large user community, robust documentation, excellent support for robust loss functions 10, and proven scalability for large-scale nonlinear least-squares problems.42 + + ### **5.2 Mechanism for Automatic Outlier Rejection (AC-3, AC-5)** + + The system must "correctly continue the work even in the presence of up to 350 meters of an outlier" (AC-3). A standard least-squares optimizer would be catastrophically corrupted by this event, as it would try to *average* this 350m error, pulling the *entire* 300km trajectory out of alignment. + + A modern optimizer does not need to use brittle, hand-coded if-then logic to reject outliers. It can *mathematically* and *automatically* down-weight them using **Robust Loss Functions (Kernels)**.41 + + The mechanism is as follows: + + 1. The Ceres Back-End 10 maintains a graph of nodes (poses) and edges (constraints, or measurements). + 2. A 350m outlier (AC-3) will create an edge with a *massive* error (residual). + 3. A standard (quadratic) loss function $cost(error) = error^2$ would create a *catastrophic* cost, forcing the optimizer to ruin the entire graph to accommodate it. + 4. Instead, the system will wrap its cost functions in a **Robust Loss Function**, such as **CauchyLoss** or **HuberLoss**.10 + 5. A robust loss function behaves quadratically for small errors (which it tries hard to fix) but becomes *sub-linear* for large errors. When it "sees" the 350m error, it mathematically *down-weights its influence*.43 + 6. 
The optimizer effectively *acknowledges* the 350m error but *refuses* to pull the entire graph to fix this one "insane" measurement. It automatically, and gracefully, treats the outlier as a "lost cause" and optimizes the 99.9% of "sane" measurements. This is the modern, robust solution to AC-3 and AC-5. + + ## **6.0 High-Resolution (6.2K) and Performance Optimization** + + The system must simultaneously handle massive 6252x4168 (26-Megapixel) images and run on a modest RTX 2060 GPU [User Query] with a \<5s time limit (AC-7). These are opposing constraints. + + ### **6.1 The Multi-Scale Patch-Based Processing Pipeline** + + Running *any* deep learning model (SuperPoint, LightGlue) on a full 6.2K image will be impossibly slow and will *immediately* cause a CUDA Out-of-Memory (OOM) error on a 6GB RTX 2060.45 + + The solution is not to process the full 6.2K image in real-time. Instead, a **multi-scale, patch-based pipeline** is required, where different components use the resolution best suited to their task.46 + + 1. **For SA-VO (Real-time, \<5s):** The SA-VO front-end is concerned with *motion*, not fine-grained detail. The 6.2K Image_N_HR is *immediately* downscaled to a manageable 1536x1024 (Image_N_LR). The entire SA-VO (SuperPoint + LightGlue) pipeline runs *only* on this low-resolution, fast-to-process image. This is how the \<5s (AC-7) budget is met. + 2. **For CVGL (High-Accuracy, Async):** The CVGL module, which runs asynchronously, is where the 6.2K detail is *selectively* used to meet the 20m (AC-2) accuracy target. It uses a "coarse-to-fine" 48 approach: + * **Step A (Coarse):** The Siamese CNN 30 runs on the *downscaled* 1536px Image_N_LR to get a coarse [Lat, Lon] guess. + * **Step B (Fine):** The system uses this coarse guess to fetch the corresponding *high-resolution* satellite tile. + * **Step C (Patching):** The system runs the SuperPoint detector on the *full 6.2K* Image_N_HR to find the Top 100 *most confident* feature keypoints. 
It then extracts 100 small (e.g., 256x256) *patches* from the full-resolution image, centered on these keypoints.49 + * **Step D (Matching):** The system then matches *these small, full-resolution patches* against the high-res satellite tile. + + This hybrid method provides the best of both worlds: the fine-grained matching accuracy 50 of the 6.2K image, but without the catastrophic OOM errors or performance penalties.45 + + ### **6.2 Real-Time Deployment with TensorRT** + + PyTorch is a research and training framework. Its default inference speed, even on an RTX 2060, is often insufficient to meet a \<5s production requirement.23 + + For the final production system, the key neural networks (SuperPoint, LightGlue, Siamese CNN) *must* be converted from their PyTorch-native format into a highly-optimized **NVIDIA TensorRT engine**. + + * **Benefits:** TensorRT is an inference optimizer that applies graph optimizations, layer fusion, and precision reduction (e.g., to FP16).52 This can achieve a 2x-4x (or more) speedup over native PyTorch.28 + * **Deployment:** The resulting TensorRT engine can be deployed via a C++ API 25, which is far more suitable for a robust, high-performance production system. + + This conversion is a *mandatory* deployment step. It is what makes a 2-second inference (well within the 5-second AC-7 budget) *achievable* on the specified RTX 2060 hardware. + + ## **7.0 System Robustness: Failure Mode and Logic Escalation** + + The system's logic is designed as a multi-stage escalation process to handle the specific failure modes in the acceptance criteria (AC-3, AC-4, AC-6), ensuring the >95% registration rate (AC-9). + + ### **Stage 1: Normal Operation (Tracking)** + + * **Condition:** SA-VO(N-1 -> N) succeeds. The LightGlue match is high-confidence, and the computed scale $s$ is reasonable. + * **Logic:** + 1. The Relative_Metric_Pose is sent to the Back-End. + 2. The Pose_N_Est is calculated and sent to the user (\<5s). + 3. 
The CVGL module is queued to run asynchronously to provide a Pose_N_Refined at a later time. + + ### **Stage 2: Transient SA-VO Failure (AC-3 Outlier Handling)** + + * **Condition:** SA-VO(N-1 -> N) fails. This could be a 350m outlier (AC-3), a severely blurred image, or an image with no features (e.g., over a cloud). The LightGlue match fails, or the computed scale $s$ is nonsensical. + * **Logic (Frame Skipping):** + 1. The system *buffers* Image_N and marks it as "tentatively lost." + 2. When Image_N+1 arrives, the SA-VO front-end attempts to "bridge the gap" by matching SA-VO(N-1 -> N+1). + 3. **If successful:** A Relative_Metric_Pose for N+1 is found. Image_N is officially marked as a rejected outlier (AC-5). The system "correctly continues the work" (AC-3 met). + 4. **If fails:** The system repeats for SA-VO(N-1 -> N+2). + 5. If this "bridging" fails for 3 consecutive frames, the system concludes it is not a transient outlier but a persistent tracking loss, and escalates to Stage 3. + + ### **Stage 3: Persistent Tracking Loss (AC-4 Sharp Turn Handling)** + + * **Condition:** The "frame-skipping" in Stage 2 fails. This is the "sharp turn" scenario [AC-4] where there is \<5% overlap between Image_N-1 and Image_N+k. + * **Logic (Multi-Map "Chunking"):** + 1. The Back-End declares a "Tracking Lost" state at Image_N and creates a *new, independent map chunk* ("Chunk 2"). + 2. The SA-VO Front-End is re-initialized at Image_N and begins populating this new chunk, tracking SA-VO(N -> N+1), SA-VO(N+1 -> N+2), etc. + 3. Because the front-end is **Scale-Aware**, this new "Chunk 2" is *already in metric scale*. It is a "floating island" of *known size and shape*; it just is not anchored to the global GPS map. + * **Resolution (Asynchronous Relocalization):** + 1. The **CVGL Module** is now tasked, high-priority, to find a *single* Absolute_GPS_Pose for *any* frame in this new "Chunk 2". + 2. 
Once the CVGL module (which is robust to oblique views, per Section 4.2) finds one (e.g., for Image_N+20), the Back-End has all the information it needs. + 3. **Merging:** The Back-End calculates the simple 6-DoF transformation (3D translation and rotation, scale=1) to align all of "Chunk 2" and merge it with "Chunk 1". This robustly handles the "correctly continue the work" criterion (AC-4). + + ### **Stage 4: Catastrophic Failure (AC-6 User Intervention)** + + * **Condition:** The system has entered Stage 3 and is building "Chunk 2," but the **CVGL Module** has *also* failed for a prolonged period (e.g., 20% of the route, or 50+ consecutive frames). This is the "worst-case" scenario (e.g., heavy clouds *and* over a large, featureless lake). The system is "absolutely incapable" [User Query]. + * **Logic:** + 1. The system has a metric-scale "Chunk 2" but zero idea where it is in the world. + 2. The Back-End triggers the AC-6 flag. + * **Resolution (User Input):** + 1. The UI prompts the user: "Tracking lost. Please provide a coarse location for the *current* image." + 2. The UI displays the last known good image (from Chunk 1) and the new, "lost" image (e.g., Image_N+50). + 3. The user clicks *one point* on the satellite map. + 4. This user-provided [Lat, Lon] is *not* taken as ground truth. It is fed to the CVGL module as a *strong prior*, drastically narrowing its search area from "all of Ukraine" to "a 10km-radius circle." + 5. This allows the CVGL module to re-acquire a lock, which triggers the Stage 3 merge, and the system continues. + + ## **8.0 Output Generation and Validation Strategy** + + This section details how the final user-facing outputs are generated and how the system's compliance with all 10 acceptance criteria will be validated. + + ### **8.1 Generating Object-Level GPS (from Pixel Coordinate)** + + This meets the requirement to find the "coordinates of the center of any object in these photos" [User Query]. 
The system provides this via a **Ray-Plane Intersection** method. + + * **Inputs:** + 1. The user clicks pixel coordinate $(u,v)$ on Image_N. + 2. The system retrieves the refined, global 6-DoF pose $(R, T)$ for Image_N from the Back-End. + 3. The system uses the known camera intrinsic matrix $K$. + 4. The system uses the known *global ground-plane equation* (e.g., $Z=150m$, based on the predefined altitude and start coordinate). + * **Method:** + 1. **Un-project Pixel:** The 2D pixel $(u,v)$ is un-projected into a 3D ray *direction* vector $d_{cam}$ in the camera's local coordinate system: $d_{cam} = K^{-1} \cdot [u, v, 1]^T$. + 2. **Transform Ray:** This ray direction is transformed into the *global* coordinate system using the pose's rotation matrix: $d_{global} = R \cdot d_{cam}$. + 3. **Define Ray:** A 3D ray is now defined, originating at the camera's global position $T$ (from the pose) and traveling in the direction $d_{global}$. + 4. **Intersect:** The system solves the 3D line-plane intersection equation for this ray and the known global ground plane (e.g., find the intersection with $Z=150m$). + 5. **Result:** The 3D intersection point $(X, Y, Z)$ is the *metric* world coordinate of the object on the ground. + 6. **Convert:** This $(X, Y, Z)$ world coordinate is converted to a [Latitude, Longitude, Altitude] GPS coordinate. This process is immediate and can be performed for any pixel on any geolocated image. + + ### **8.2 Rigorous Validation Methodology** + + A comprehensive test plan is required to validate all 10 acceptance criteria. The foundation of this is the creation of a **Ground-Truth Test Harness**. + + * **Test Harness:** + 1. **Ground-Truth Data:** Several test flights will be conducted in the operational area using a UAV equipped with a high-precision RTK/PPK GPS. This provides the "real GPS" (ground truth) for every image. + 2.
**Test Datasets:** Multiple test datasets will be curated from this ground-truth data: + * Test_Baseline_1000: A standard 1000-image flight. + * Test_Outlier_350m (AC-3): Test_Baseline_1000 with a single image from 350m away manually inserted at frame 30. + * Test_Sharp_Turn_5pct (AC-4): A sequence where frames 20-24 are manually deleted, simulating a \<5% overlap jump. + * Test_Catastrophic_Fail_20pct (AC-6): A sequence with 200 (20%) consecutive "bad" frames (e.g., pure sky, lens cap) inserted. + * Test_Full_3000: A full 3000-image sequence to test scalability and memory usage. + * **Test Cases:** + * **Test_Accuracy (AC-1, AC-2, AC-5, AC-9):** + * Run Test_Baseline_1000. A test script will compare the system's *final refined GPS output* for each image against its *ground-truth GPS*. + * ASSERT (count(errors \< 50m) / 1000) \geq 0.80 (AC-1) + * ASSERT (count(errors \< 20m) / 1000) \geq 0.60 (AC-2) + * ASSERT (count(un-localized_images) / 1000) \< 0.10 (AC-5) + * ASSERT (count(localized_images) / 1000) > 0.95 (AC-9) + * **Test_MRE (AC-10):** + * ASSERT (BackEnd.final_MRE) \< 1.0 (AC-10) + * **Test_Performance (AC-7, AC-8):** + * Run Test_Full_3000 on the minimum-spec RTX 2060. + * Log timestamps for "Image In" -> "Initial Pose Out". ASSERT average_time \< 5.0s (AC-7). + * Log the output stream. ASSERT that >80% of images receive *two* poses: an "Initial" and a "Refined" (AC-8). + * **Test_Robustness (AC-3, AC-4, AC-6):** + * Run Test_Outlier_350m. ASSERT the system correctly continues and the final trajectory error for Image_31 is \< 50m (AC-3). + * Run Test_Sharp_Turn_5pct. ASSERT the system logs "Tracking Lost" and "Maps Merged," and the final trajectory is complete and accurate (AC-4). + * Run Test_Catastrophic_Fail_20pct. ASSERT the system correctly triggers the "ask for user input" event (AC-6). + +Identify all potential weak points and problems. Address them and find out ways to solve them. 
Based on your findings, form a new solution draft in the same format. + +If your finding requires a complete reorganization of the flow and different components, state it. +Put all the findings regarding what was weak and poor at the beginning of the report. Put here all new findings, what was updated, replaced, or removed from the previous solution. + +Then form a new solution design without referencing the previous system. Remove Poor and Very Poor component choices from the component analysis tables, but leave Good and Excellent ones. +In the updated report, do not put "new" marks, do not compare to the previous solution draft, just make a new solution as if from scratch \ No newline at end of file diff --git a/docs/00_problem/1.4_03_assesment_prompt.md b/docs/00_problem/1.4_03_assesment_prompt.md new file mode 100644 index 0000000..71d63b0 --- /dev/null +++ b/docs/00_problem/1.4_03_assesment_prompt.md @@ -0,0 +1,301 @@ +Read carefully about the problem: + + We have a lot of images taken from a wing-type UAV using a camera with at least Full HD resolution. Resolution of each photo could be up to 6200*4100 for the whole flight, but for other flights, it could be FullHD +Photos are taken and named consecutively within 100 meters of each other. +We know only the starting GPS coordinates. We need to determine the GPS of the centers of each image. And also the coordinates of the center of any object in these photos. We can use an external satellite provider for ground checks on the existing photos + + System has next restrictions and conditions: + - Photos are taken by only airplane type UAVs. + - Photos are taken by the camera pointing downwards and fixed, but it is not autostabilized. + - The flying range is restricted by the eastern and southern parts of Ukraine (To the left of the Dnipro River) + - The image resolution could be from FullHD to 6252*4168. Camera parameters are known: focal length, sensor width, resolution and so on. 
+ - Altitude is predefined and no more than 1km. The height of the terrain can be neglected. + - There is NO data from IMU + - Flights are done mostly in sunny weather + - We can use satellite providers, but we're limited right now to Google Maps, which could be outdated for some regions + - Number of photos could be up to 3000, usually in the 500-1500 range + - During the flight, UAVs can make sharp turns, so that the next photo may be absolutely different from the previous one (no same objects), but it is rather an exception than the rule + - Processing is done on a stationary computer or laptop with NVidia GPU at least RTX2060, better 3070. (For the UAV solution Jetson Orin Nano would be used, but that is out of scope.) + + Output of the system should address next acceptance criteria: + - The system should find out the GPS of centers of 80% of the photos from the flight within an error of no more than 50 meters in comparison to the real GPS + - The system should find out the GPS of centers of 60% of the photos from the flight within an error of no more than 20 meters in comparison to the real GPS + - The system should correctly continue the work even in the presence of up to 350 meters of an outlier photo between 2 consecutive pictures en route. This could happen due to tilt of the plane. + - System should correctly continue the work even during sharp turns, where the next photo doesn't overlap at all, or overlaps in less than 5%. 
The next photo should be in less than 150m drift and at an angle of less than 50% + - The number of outliers during the satellite provider images ground check should be less than 10% + - In case the system is absolutely incapable of determining the GPS of the next, second next, and third next images, by any means (these 20% of the route), then it should ask the user for input for the next image, so that the user can specify the location + - Less than 5 seconds for processing one image + - Results of image processing should appear immediately to user, so that user shouldn't wait for the whole route to complete in order to analyze first results. Also, system could refine existing calculated results and send refined results again to user + - Image Registration Rate > 95%. The system can find enough matching features to confidently calculate the camera's 6-DoF pose (position and orientation) and "stitch" that image into the final trajectory + - Mean Reprojection Error (MRE) < 1.0 pixels. The distance, in pixels, between the original pixel location of the object and the re-projected pixel location. + + Here is a solution draft: + + **GEORTEX-R: A Geospatial-Temporal Robust Extraction System for IMU-Denied UAV Geolocalization** + + ## **1.0 GEORTEX-R: System Architecture and Data Flow** + + The GEORTEX-R system is an asynchronous, three-component software solution designed for deployment on an NVIDIA RTX 2060+ GPU. It is architected from the ground up to handle the specific, demonstrated challenges of IMU-denied localization in *non-planar terrain* (as seen in Images 1-9) and *temporally-divergent* (outdated) reference maps (AC-5). + + The system's core design principle is the *decoupling of unscaled relative motion from global metric scale*. The front-end estimates high-frequency, robust, but *unscaled* motion. The back-end asynchronously provides sparse, high-confidence *metric* and *geospatial* anchors.
The central hub fuses these two data streams into a single, globally-optimized, metric-scale trajectory. + + ### **1.1 Inputs** + + 1. **Image Sequence:** Consecutively named images (FullHD to 6252x4168). + 2. **Start Coordinate (Image 0):** A single, absolute GPS coordinate (Latitude, Longitude) for the first image. + 3. **Camera Intrinsics ($K$):** A pre-calibrated camera intrinsic matrix. + 4. **Altitude Prior ($H_{prior}$):** The *approximate* predefined metric altitude (e.g., 900 meters). This is used as a *prior* (a hint) for optimization, *not* a hard constraint. + 5. **Geospatial API Access:** Credentials for an on-demand satellite and DEM provider (e.g., Copernicus, EOSDA). + + ### **1.2 Streaming Outputs** + + 1. **Initial Pose ($Pose\\_N\\_Est$):** An *unscaled* pose estimate. This is sent immediately to the UI for real-time visualization of the UAV's *path shape* (AC-7, AC-8). + 2. **Refined Pose ($Pose\\_N\\_Refined$) [Asynchronous]:** A globally-optimized, *metric-scale* 7-DoF pose (X, Y, Z, Qx, Qy, Qz, Qw) and its corresponding [Lat, Lon, Alt] coordinate. This is sent to the user whenever the Trajectory Optimization Hub re-converges, updating all past poses (AC-1, AC-2, AC-8). + + ### **1.3 Component Interaction and Data Flow** + + The system is architected as three parallel-processing components: + + 1. **Image Ingestion & Pre-processing:** This module receives the new Image_N (up to 6.2K). It creates two copies: + * Image_N_LR (Low-Resolution, e.g., 1536x1024): Dispatched *immediately* to the V-SLAM Front-End for real-time processing. + * Image_N_HR (High-Resolution, 6.2K): Stored for asynchronous use by the Geospatial Anchoring Back-End (GAB). + 2. **V-SLAM Front-End (High-Frequency Thread):** This component's sole task is high-speed, *unscaled* relative pose estimation. It tracks Image_N_LR against a *local map of keyframes*. It performs local bundle adjustment to minimize drift 12 and maintains a co-visibility graph of all keyframes. 
It sends Relative_Unscaled_Pose estimates to the Trajectory Optimization Hub (TOH). + 3. **Geospatial Anchoring Back-End (GAB) (Low-Frequency, Asynchronous Thread):** This is the system's "anchor." When triggered by the TOH, it fetches *on-demand* geospatial data (satellite imagery and DEMs) from an external API.3 It then performs a robust *hybrid semantic-visual* search 5 to find an *absolute, metric, global pose* for a given keyframe, robust to outdated maps (AC-5) 5 and oblique views (AC-4).14 This Absolute_Metric_Anchor is sent to the TOH. + 4. **Trajectory Optimization Hub (TOH) (Central Hub):** This component manages the complete flight trajectory as a **Sim(3) pose graph** (7-DoF). It continuously fuses two distinct data streams: + * **On receiving Relative_Unscaled_Pose (T \< 5s):** It appends this pose to the graph, calculates the Pose_N_Est, and sends this *unscaled* initial result to the user (AC-7, AC-8 met). + * **On receiving Absolute_Metric_Anchor (T > 5s):** This is the critical event. It adds this as a high-confidence *global metric constraint*. This anchor creates "tension" in the graph, which the optimizer (Ceres Solver 15) resolves by finding the *single global scale factor* that best fits all V-SLAM and CVGL measurements. It then triggers a full graph re-optimization, "stretching" the entire trajectory to the correct metric scale, and sends the new Pose_N_Refined stream to the user for all affected poses (AC-1, AC-2, AC-8 refinement met). + + ## **2.0 Core Component: The High-Frequency V-SLAM Front-End** + + This component's sole task is to robustly and accurately compute the *unscaled* 6-DoF relative motion of the UAV and build a geometrically-consistent map of keyframes. It is explicitly designed to be more robust to drift than simple frame-to-frame odometry. + + ### **2.1 Rationale: Keyframe-Based Monocular SLAM** + + The choice of a keyframe-based V-SLAM front-end over a frame-to-frame VO is deliberate and critical for system robustness. 
+ + * **Drift Mitigation:** Frame-to-frame VO is "prone to drift accumulation due to errors introduced by each frame-to-frame motion estimation".13 A single poor match permanently corrupts all future poses. + * **Robustness:** A keyframe-based system tracks new images against a *local map* of *multiple* previous keyframes, not just Image_N-1. This provides resilience to transient failures (e.g., motion blur, occlusion). + * **Optimization:** This architecture enables "local bundle adjustment" 12, a process where a sliding window of recent keyframes is continuously re-optimized, actively minimizing error and drift *before* it can accumulate. + * **Relocalization:** This architecture possesses *innate relocalization capabilities* (see Section 6.3), which is the correct, robust solution to the "sharp turn" (AC-4) requirement. + + ### **2.2 Feature Matching Sub-System** + + The success of the V-SLAM front-end depends entirely on high-quality feature matches, especially in the sparse, low-texture agricultural terrain seen in the provided images (e.g., Image 6, Image 7). The system requires a matcher that is robust (for sparse textures 17) and extremely fast (for AC-7). + + The selected approach is **SuperPoint + LightGlue**. + + * **SuperPoint:** A SOTA (State-of-the-Art) feature detector proven to find robust, repeatable keypoints in challenging, low-texture conditions 17 + * **LightGlue:** A highly optimized GNN-based matcher that is the successor to SuperGlue 19 + + The key advantage of selecting LightGlue 19 over SuperGlue 20 is its *adaptive nature*. The query states sharp turns (AC-4) are "rather an exception." This implies \~95% of image pairs are "easy" (high-overlap, straight flight) and 5% are "hard" (low-overlap, turns). SuperGlue uses a fixed-depth GNN, spending the *same* large amount of compute on an "easy" pair as a "hard" one. 
LightGlue is *adaptive*.19 For an "easy" pair, it can exit its GNN early, returning a high-confidence match in a fraction of the time. This saves *enormous* computational budget on the 95% of "easy" frames, ensuring the system *always* meets the \<5s budget (AC-7) and reserving that compute for the GAB. + + #### **Table 1: Analysis of State-of-the-Art Feature Matchers (For V-SLAM Front-End)** + + | Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | + | :---- | :---- | :---- | :---- | :---- | + | **SuperPoint + SuperGlue** 20 | - SOTA robustness in low-texture, high-blur conditions. - GNN reasons about 3D scene context. - Proven in real-time SLAM systems. | - Computationally heavy (fixed-depth GNN). - Slower than LightGlue.19 | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT.21 | **Good.** A solid, baseline choice. Meets robustness needs but will heavily tax the \<5s time budget (AC-7). | + | **SuperPoint + LightGlue** 17 | - **Adaptive Depth:** Faster on "easy" pairs, more accurate on "hard" pairs.19 - **Faster & Lighter:** Outperforms SuperGlue on speed and accuracy. - SOTA "in practice" choice for large-scale matching.17 | - Newer, but rapidly being adopted and proven.21 | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT.22 | **Excellent (Selected).** The adaptive nature is *perfect* for this problem. It saves compute on the 95% of easy (straight) frames, maximizing our ability to meet AC-7. | + + ## **3.0 Core Component: The Geospatial Anchoring Back-End (GAB)** + + This component is the system's "anchor to reality." It runs asynchronously to provide the *absolute, metric-scale* constraints needed to solve the trajectory. It is an *on-demand* system that solves three distinct "domain gaps": the hardware/scale gap, the temporal gap, and the viewpoint gap. 
+ + ### **3.1 On-Demand Geospatial Data Retrieval** + + A "pre-computed database" for all of Eastern Ukraine is operationally unfeasible on laptop-grade hardware.1 This design is replaced by an on-demand, API-driven workflow. + + * **Mechanism:** When the TOH requests a global anchor, the GAB receives a *coarse* [Lat, Lon] estimate. The GAB then performs API calls to a geospatial data provider (e.g., EOSDA 3, Copernicus 8). + * **Dual-Retrieval:** The API query requests *two* distinct products for the specified Area of Interest (AOI): + 1. **Visual Tile:** A high-resolution (e.g., 30-50cm) satellite ortho-image.26 + 2. **Terrain Tile:** The corresponding **Digital Elevation Model (DEM)**, such as the Copernicus GLO-30 (30m resolution) or SRTM (30m).7 + + This "Dual-Retrieval" mechanism is the central, enabling synergy of the new architecture. The **Visual Tile** is used by the CVGL (Section 3.2) to find the *geospatial pose*. The **DEM Tile** is used by the *output module* (Section 7.1) to perform high-accuracy **Ray-DEM Intersection**, solving the final output accuracy problem. + + ### **3.2 Hybrid Semantic-Visual Localization** + + The "temporal gap" (evidenced by burn scars in Images 1-9) and "outdated maps" (AC-5) makes a purely visual CVGL system unreliable.5 The GAB solves this using a robust, two-stage *hybrid* matching pipeline. + + 1. **Stage 1: Coarse Visual Retrieval (Siamese CNN).** A lightweight Siamese CNN 14 is used to find the *approximate* location of the Image_N_LR *within* the large, newly-fetched satellite tile. This acts as a "candidate generator." + 2. **Stage 2: Fine-Grained Semantic-Visual Fusion.** For the top candidates, the GAB performs a *dual-channel alignment*. + * **Visual Channel (Unreliable):** It runs SuperPoint+LightGlue on high-resolution *patches* (from Image_N_HR) against the satellite tile. 
This match may be *weak* due to temporal gaps.5 + * **Semantic Channel (Reliable):** It extracts *temporally-invariant* semantic features (e.g., road-vectors, field-boundaries, tree-cluster-polygons, lake shorelines) from *both* the UAV image (using a segmentation model) and the satellite/OpenStreetMap data.5 + * **Fusion:** A RANSAC-based optimizer finds the 6-DoF pose that *best aligns* this *hybrid* set of features. + + This hybrid approach is robust to the exact failure mode seen in the images. When matching Image 3 (burn scars), the *visual* LightGlue match will be poor. However, the *semantic* features (the dirt road, the tree line) are *unchanged*. The optimizer will find a high-confidence pose by *trusting the semantic alignment* over the poor visual alignment, thereby succeeding despite the "outdated map" (AC-5). + + ### **3.3 Solution to Viewpoint Gap: Synthetic Oblique View Training** + + This component is critical for handling "sharp turns" (AC-4). The camera *will* be oblique, not nadir, during turns. + + * **Problem:** The GAB's Stage 1 Siamese CNN 14 will be matching an *oblique* UAV view to a *nadir* satellite tile. This "viewpoint gap" will cause a match failure.14 + * **Mechanism (Synthetic Data Generation):** The network must be trained for *viewpoint invariance*.28 + 1. Using the on-demand DEMs (fetched in 3.1) and satellite tiles, the system can *synthetically render* the satellite imagery from *any* roll, pitch, and altitude. + 2. The Siamese network is trained on (Nadir_Tile, Synthetic_Oblique_Tile) pairs.14 + * **Result:** This process teaches the network to match the *underlying ground features*, not the *perspective distortion*. It ensures the GAB can relocalize the UAV *precisely* when it is needed most: during a sharp, banking turn (AC-4) when VO tracking has been lost. + + ## **4.0 Core Component: The Trajectory Optimization Hub (TOH)** + + This component is the system's central "brain." 
It runs continuously, fusing all measurements (high-frequency/unscaled V-SLAM, low-frequency/metric-scale GAB anchors) into a single, globally consistent trajectory. + + ### **4.1 Incremental Sim(3) Pose-Graph Optimization** + + The "planar ground" SA-VO (Finding 1) is removed. This component is its replacement. The system must *discover* the global scale, not *assume* it. + + * **Selected Strategy:** An incremental pose-graph optimizer using **Ceres Solver**.15 + * **The Sim(3) Insight:** The V-SLAM front-end produces *unscaled* 6-DoF ($SE(3)$) relative poses. The GAB produces *metric-scale* 6-DoF ($SE(3)$) *absolute* poses. These cannot be directly combined. The graph must be optimized in **Sim(3) (7-DoF)**, which adds a *single global scale factor $s$* as an optimizable variable. + * **Mechanism (Ceres Solver):** + 1. **Nodes:** Each keyframe pose (7-DoF: translation $X, Y, Z$; a unit quaternion $Q_w, Q_x, Q_y, Q_z$ for rotation, contributing 3 DoF under the unit-norm constraint; and scale $s$). + 2. **Edge 1 (V-SLAM):** A relative pose constraint between Keyframe_i and Keyframe_j. The error is computed in Sim(3). + 3. **Edge 2 (GAB):** An *absolute* pose constraint on Keyframe_k. This constraint *fixes* Keyframe_k's pose to the *metric* GPS coordinate and *fixes its scale $s$ to 1.0*. + * **Bootstrapping Scale:** The TOH graph "bootstraps" the scale.32 The GAB's $s=1.0$ anchor creates "tension" in the graph. The Ceres optimizer 15 resolves this tension by finding the *one* global scale $s$ for all V-SLAM nodes that minimizes the total error, effectively "stretching" the entire unscaled trajectory to fit the metric anchors. This is robust to *any* terrain.34 + + #### **Table 2: Analysis of Trajectory Optimization Strategies** + + | Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | + | :---- | :---- | :---- | :---- | :---- | + | **Incremental SLAM (Pose-Graph Optimization)** (Ceres Solver 15, g2o 35, GTSAM) | - **Real-time / Online:** Provides immediate pose estimates (AC-7).
- **Supports Refinement:** Explicitly designed to refine past poses when new "loop closure" (GAB) data arrives (AC-8).13 - **Robust:** Can handle outliers via robust kernels.15 | - Initial estimate is *unscaled* until a GAB anchor arrives. - Can drift *if* not anchored (though V-SLAM minimizes this). | - A graph optimization library (Ceres). - A robust cost function. | **Excellent (Selected).** This is the *only* architecture that satisfies all user requirements for real-time streaming and asynchronous refinement. | + | **Batch Structure from Motion (Global Bundle Adjustment)** (COLMAP, Agisoft Metashape) | - **Globally Optimal Accuracy:** Produces the most accurate possible 3D reconstruction and trajectory. | - **Offline:** Cannot run in real-time or stream results. - High computational cost (minutes to hours). - Fails AC-7 and AC-8 completely. | - All images must be available before processing starts. - High RAM and CPU. | **Good (as an *Optional* Post-Processing Step).** Unsuitable as the primary online system, but could be offered as an optional, high-accuracy "Finalize Trajectory" batch process. | + + ### **4.2 Automatic Outlier Rejection (AC-3, AC-5)** + + The system must handle 350m outliers (AC-3) and \<10% bad GAB matches (AC-5). + + * **Mechanism (Robust Loss Functions):** A standard least-squares optimizer (like Ceres 15) would be catastrophically corrupted by a 350m error. The solution is to wrap *all* constraints in a **Robust Loss Function (e.g., HuberLoss, CauchyLoss)**.15 + * **Result:** A robust loss function mathematically *down-weights* the influence of constraints with large errors. When it "sees" the 350m error (AC-3), it effectively acknowledges the measurement but *refuses* to pull the entire 3000-image trajectory to fit this one "insane" data point. It automatically and gracefully *ignores* the outlier, optimizing the 99.9% of "sane" measurements. This is the modern, robust solution to AC-3 and AC-5. 
+ + ## **5.0 High-Performance Compute & Deployment** + + The system must run on an RTX 2060 (AC-7) and process 6.2K images. These are opposing constraints. + + ### **5.1 Multi-Scale, Patch-Based Processing Pipeline** + + Running deep learning models (SuperPoint, LightGlue) on a full 6.2K (26-Megapixel) image will cause a CUDA Out-of-Memory (OOM) error and be impossibly slow. + + * **Mechanism (Coarse-to-Fine):** + 1. **For V-SLAM (Real-time, \<5s):** The V-SLAM front-end (Section 2.0) runs *only* on the Image_N_LR (e.g., 1536x1024) copy. This is fast enough to meet the AC-7 budget. + 2. **For GAB (High-Accuracy, Async):** The GAB (Section 3.0) uses the full-resolution Image_N_HR *selectively* to meet the 20m accuracy (AC-2). + * It first runs its coarse Siamese CNN 27 on the Image_N_LR. + * It then runs the SuperPoint detector on the *full 6.2K* image to find the *most confident* feature keypoints. + * It then extracts small, 256x256 *patches* from the *full-resolution* image, centered on these keypoints. + * It matches *these small, full-resolution patches* against the high-res satellite tile. + * **Result:** This hybrid method provides the fine-grained matching accuracy of the 6.2K image (needed for AC-2) without the catastrophic OOM errors or performance penalties. + + ### **5.2 Mandatory Deployment: NVIDIA TensorRT Acceleration** + + PyTorch is a research framework. For production, its inference speed is insufficient. + + * **Requirement:** The key neural networks (SuperPoint, LightGlue, Siamese CNN) *must* be converted from PyTorch into a highly-optimized **NVIDIA TensorRT engine**. + * **Research Validation:** 23 demonstrates this process for LightGlue, achieving "2x-4x speed gains over compiled PyTorch." 22 and 21 provide open-source repositories for SuperPoint+LightGlue conversion to ONNX and TensorRT. + * **Result:** This is not an "optional" optimization. It is a *mandatory* deployment step. 
This conversion (which applies layer fusion, graph optimization, and FP16 precision) is what makes achieving the \<5s (AC-7) performance *possible* on the specified RTX 2060 hardware.36 + + ## **6.0 System Robustness: Failure Mode Escalation Logic** + + This logic defines the system's behavior during real-world failures, ensuring it meets criteria AC-3, AC-4, AC-6, and AC-9. + + ### **6.1 Stage 1: Normal Operation (Tracking)** + + * **Condition:** V-SLAM front-end (Section 2.0) is healthy. + * **Logic:** + 1. V-SLAM successfully tracks Image_N_LR against its local keyframe map. + 2. A new Relative_Unscaled_Pose is sent to the TOH. + 3. TOH sends Pose_N_Est (unscaled) to the user (\<5s). + 4. If Image_N is selected as a new keyframe, the GAB (Section 3.0) is *queued* to find an Absolute_Metric_Anchor for it, which will trigger a Pose_N_Refined update later. + + ### **6.2 Stage 2: Transient VO Failure (Outlier Rejection)** + + * **Condition:** Image_N is unusable (e.g., severe blur, sun-glare, 350m outlier per AC-3). + * **Logic (Frame Skipping):** + 1. V-SLAM front-end fails to track Image_N_LR against the local map. + 2. The system *discards* Image_N (marking it as a rejected outlier, AC-5). + 3. When Image_N+1 arrives, the V-SLAM front-end attempts to track it against the *same* local keyframe map (from Image_N-1). + 4. **If successful:** Tracking resumes. Image_N is officially an outlier. The system "correctly continues the work" (AC-3 met). + 5. **If fails:** The system repeats for Image_N+2, N+3. If this fails for \~5 consecutive frames, it escalates to Stage 3. + + ### **6.3 Stage 3: Persistent VO Failure (Relocalization)** + + * **Condition:** Tracking is lost for multiple frames. This is the "sharp turn" (AC-4) or "low overlap" (AC-4) scenario. + * **Logic (Keyframe-Based Relocalization):** + 1. The V-SLAM front-end declares "Tracking Lost." + 2. **Critically:** It does *not* create a "new map chunk." + 3. Instead, it enters **Relocalization Mode**. 
For every new Image_N+k, it extracts features (SuperPoint) and queries the *entire* existing database of past keyframes for a match. + * **Resolution:** The UAV completes its sharp turn. Image_N+5 now has high overlap with Image_N-10 (from *before* the turn). + 1. The relocalization query finds a strong match. + 2. The V-SLAM front-end computes the 6-DoF pose of Image_N+5 relative to the *existing map*. + 3. Tracking is *resumed* seamlessly. The system "correctly continues the work" (AC-4 met). This is vastly more robust than the previous "map-merging" logic. + + ### **6.4 Stage 4: Catastrophic Failure (User Intervention)** + + * **Condition:** The system is in Stage 3 (Lost), but *also*, the **GAB (Section 3.0) has failed** to find *any* global anchors for a prolonged period (e.g., 20% of the route). This is the "absolutely incapable" scenario (AC-6), (e.g., heavy fog *and* over a featureless ocean). + * **Logic:** + 1. The system has an *unscaled* trajectory, and *zero* idea where it is in the world. + 2. The TOH triggers the AC-6 flag. + * **Resolution (User-Aided Prior):** + 1. The UI prompts the user: "Tracking lost. Please provide a coarse location for the *current* image." + 2. The user clicks *one point* on a map. + 3. This [Lat, Lon] is *not* taken as ground truth. It is fed to the **GAB (Section 3.1)** as a *strong prior* for its on-demand API query. + 4. This narrows the GAB's search area from "all of Ukraine" to "a 5km radius." This *guarantees* the GAB's Dual-Retrieval (Section 3.1) will fetch the *correct* satellite and DEM tiles, allowing the Hybrid Matcher (Section 3.2) to find a high-confidence Absolute_Metric_Anchor, which in turn re-scales (Section 4.1) and relocalizes the entire trajectory. + + ## **7.0 Output Generation and Validation Strategy** + + This section details how the final user-facing outputs are generated, specifically solving the "planar ground" output flaw, and how the system's compliance with all 10 ACs will be validated. 
+ + ### **7.1 High-Accuracy Object Geolocalization via Ray-DEM Intersection** + + The "Ray-Plane Intersection" method is inaccurate for non-planar terrain 37 and is replaced with a high-accuracy ray-tracing method. This is the correct method for geolocating an object on the *non-planar* terrain visible in Images 1-9. + + * **Inputs:** + 1. User clicks pixel coordinate $(u,v)$ on Image_N. + 2. System retrieves the *final, refined, metric* 7-DoF pose $P = (R, T, s)$ for Image_N from the TOH. + 3. The system uses the known camera intrinsic matrix $K$. + 4. System retrieves the specific **30m DEM tile** 8 that was fetched by the GAB (Section 3.1) for this region of the map. This DEM is converted into a 3D terrain mesh. + * **Algorithm (Ray-DEM Intersection):** + 1. **Un-project Pixel:** The 2D pixel $(u,v)$ is un-projected into a 3D ray *direction* vector $d_{cam}$ in the camera's local coordinate system: $d_{cam} = K^{-1} \cdot [u, v, 1]^T$. + 2. **Transform Ray:** This ray direction $d_{cam}$ and origin (0,0,0) are transformed into the *global, metric* coordinate system using the pose $P$. This yields a ray originating at $T$ and traveling in direction $R \cdot d_{cam}$. + 3. **Intersect:** The system performs a numerical *ray-mesh intersection* 39 to find the 3D point $(X, Y, Z)$ where this global ray *intersects the 3D terrain mesh* of the DEM. + 4. **Result:** This 3D intersection point $(X, Y, Z)$ is the *metric* world coordinate of the object *on the actual terrain*. + 5. **Convert:** This $(X, Y, Z)$ world coordinate is converted to a [Latitude, Longitude, Altitude] GPS coordinate. + + This method correctly accounts for terrain. A pixel aimed at the top of a hill will intersect the DEM at a high Z-value. A pixel aimed at the ravine (Image 1) will intersect at a low Z-value. This is the *only* method that can reliably meet the 20m accuracy (AC-2) for object localization. + + ### **7.2 Rigorous Validation Methodology** + + A comprehensive test plan is required.
The foundation is a **Ground-Truth Test Harness** using the provided coordinates.csv.42 + + * **Test Harness:** + 1. **Ground-Truth Data:** The file coordinates.csv 42 provides ground-truth [Lat, Lon] for 60 images (e.g., AD000001.jpg...AD000060.jpg). + 2. **Test Datasets:** + * Test_Baseline_60 42: The 60 images and their coordinates. + * Test_Outlier_350m (AC-3): Test_Baseline_60 with a single, unrelated image inserted at frame 30. + * Test_Sharp_Turn_5pct (AC-4): A sequence where frames 20-24 are manually deleted, simulating a \<5% overlap jump. + * **Test Cases:** + * **Test_Accuracy (AC-1, AC-2, AC-5, AC-9):** + * **Run:** Execute GEORTEX-R on Test_Baseline_60, providing AD000001.jpg's coordinate (48.275292, 37.385220) as the Start Coordinate 42 + * **Script:** A validation script will compute the Haversine distance error between the *system's refined GPS output* for each image (2-60) and the *ground-truth GPS* from coordinates.csv. + * **ASSERT** (count(errors \< 50m) / 60) >= 0.80 **(AC-1 Met)** + * **ASSERT** (count(errors \< 20m) / 60) >= 0.60 **(AC-2 Met)** + * **ASSERT** (count(un-localized_images) / 60) \< 0.10 **(AC-5 Met)** + * **ASSERT** (count(localized_images) / 60) > 0.95 **(AC-9 Met)** + * **Test_MRE (AC-10):** + * **Run:** After Test_Baseline_60 completes. + * **ASSERT** TOH.final_Mean_Reprojection_Error \< 1.0 **(AC-10 Met)** + * **Test_Performance (AC-7, AC-8):** + * **Run:** Execute on a 1500-image sequence on the minimum-spec RTX 2060. + * **Log:** Log timestamps for "Image In" -> "Initial Pose Out". + * **ASSERT** average_time \< 5.0s **(AC-7 Met)** + * **Log:** Log the output stream. + * **ASSERT** >80% of images receive *two* poses: an "Initial" and a "Refined" **(AC-8 Met)** + * **Test_Robustness (AC-3, AC-4):** + * **Run:** Execute Test_Outlier_350m. + * **ASSERT** System logs "Stage 2: Discarding Outlier" and the final trajectory error for Image_31 is \< 50m **(AC-3 Met)**. + * **Run:** Execute Test_Sharp_Turn_5pct. 
+ * **ASSERT** System logs "Stage 3: Tracking Lost" and "Relocalization Succeeded," and the final trajectory is complete and accurate **(AC-4 Met)**. + + + +Identify all potential weak points and problems. Address them and find out ways to solve them. Based on your findings, form a new solution draft in the same format. + +If your finding requires a complete reorganization of the flow and different components, state it. +Put all the findings regarding what was weak and poor at the beginning of the report. Put here all new findings, what was updated, replaced, or removed from the previous solution. + +Then form a new solution design without referencing the previous system. Remove Poor and Very Poor component choices from the component analysis tables, but leave Good and Excellent ones. +In the updated report, do not put "new" marks, do not compare to the previous solution draft, just make a new solution as if from scratch \ No newline at end of file diff --git a/docs/00_problem/acceptance_criteria.md b/docs/00_problem/acceptance_criteria.md index 1efe78d..28a6fdf 100644 --- a/docs/00_problem/acceptance_criteria.md +++ b/docs/00_problem/acceptance_criteria.md @@ -1,9 +1,19 @@ -- System should find out GPS of centers of 80% of the photos from the flight within error no more than 50 meters in comparison to the real GPS -- System should find out GPS of centers of 60% of the photos from the flight within error no more than 20 meters in comparison to the real GPS -- System should correctly continue the work even in a presence of up to 350 meters outlier photo between 2 consecutive photos en route. This could happen due to tilt of the plane. -- System should correctly continue the work even during sharp turns, where the next photo doesn't overlap at all, or overlaps in less than 5%. 
Next photo should be in less than 150m drift and angle less than 50% -- Number of outliers during the satellite provider images ground check should be less than 10% -- In case of absolute incapable of the system to determine next, second next, and third next images gps, by any methods, (these 20% of the route), then it should ask user for an input for the next image, so that user can specify location -- Less than 2 seconds for processing one image -- Image Registration Rate > 95%. System can find enough matching features to confidently calculate the camera's 6-DoF pose (position and orientation) and "stitch" that image into the final trajectory -- Mean Reprojection Error (MRE) < 1.0 pixels. The distance, in pixels, between the original pixel location of the object and the re-projected pixel location. \ No newline at end of file +- The system should find out the GPS of centers of 80% of the photos from the flight within an error of no more than 50 meters in comparison to the real GPS + +- The system should find out the GPS of centers of 60% of the photos from the flight within an error of no more than 20 meters in comparison to the real GPS + +- The system should correctly continue the work even in the presence of up to 350 meters of an outlier photo between 2 consecutive pictures en route. This could happen due to tilt of the plane. + +- System should correctly continue the work even during sharp turns, where the next photo doesn't overlap at all, or overlaps in less than 5%. The next photo should be in less than 200m drift and at an angle of less than 70% + +- System should try to operate when UAV made a sharp turn, and all the next photos has no common points with previous route. In that situation system should try to figure out location of the new piece of the route and connect it to the previous route. 
Also, there could be more than 2 of these separate chunks, so this strategy should be at the core of the system + +- In case the system is absolutely incapable of determining the next, second next, and third next images' GPS by any means (these 20% of the route), then it should ask the user for input for the next image, so that the user can specify the location + +- Less than 5 seconds for processing one image + +- Results of image processing should appear immediately to the user, so that the user shouldn't wait for the whole route to complete in order to analyze first results. Also, the system could refine existing calculated results and send refined results again to the user + +- Image Registration Rate > 95%. The system can find enough matching features to confidently calculate the camera's 6-DoF pose (position and orientation) and "stitch" that image into the final trajectory + +- Mean Reprojection Error (MRE) < 1.0 pixels. The distance, in pixels, between the original pixel location of the object and the re-projected pixel location. diff --git a/docs/00_problem/problem_description.md index d36b140..3667140 100644 --- a/docs/00_problem/problem_description.md +++ b/docs/00_problem/problem_description.md @@ -1,3 +1,3 @@ -We have a lot of images taken from wing-type UAV by camera with at least FullHD resolution. Resolution of each photo could be up to 6200*4100 for the whole flight, but for other flight could be FullHD, f.e. -Photos are taken and named consecutively within 100 meters distance between each other. -We know only starting GPS coordinates. We need to determine coordinates of centers of each image. And also coordinates of the center of any object on these photos. We can use external satellite provider for ground check the existing photos \ No newline at end of file +We have a lot of images taken from a wing-type UAV using a camera with at least Full HD resolution.
Resolution of each photo could be up to 6200*4100 for the whole flight, but for other flights, it could be FullHD +Photos are taken and named consecutively within 100 meters of each other. +We know only the starting GPS coordinates. We need to determine the GPS of the centers of each image. And also the coordinates of the center of any object in these photos. We can use an external satellite provider for ground checks on the existing photos \ No newline at end of file diff --git a/docs/00_problem/restrictions.md b/docs/00_problem/restrictions.md index ef99359..e1d27a8 100644 --- a/docs/00_problem/restrictions.md +++ b/docs/00_problem/restrictions.md @@ -1,10 +1,11 @@ - Photos are taken by only airplane type UAVs. - Photos are taken by the camera pointing downwards and fixed, but it is not autostabilized. - - The flying range is restricted by eastern and southern part of Ukraine (To the left of Dnipro river) - - The image resolution could be from FullHd to 6252*4168 - - Altitude is prefefined and no more than 1km + - The flying range is restricted by the eastern and southern parts of Ukraine (To the left of the Dnipro River) + - The image resolution could be from FullHD to 6252*4168. Camera parameters are known: focal length, sensor width, resolution and so on. + - Altitude is predefined and no more than 1km. The height of the terrain can be neglected. 
- There is NO data from IMU - Flights are done mostly in sunny weather - - We can use satellite providers, but we're limited right now to Google Maps, which could be possibly outdated for some regions - - Number of photos could be up to 3000, usually in 500-1500 range - - During the flight UAV can make sharp turns, so that it is possible that next photo is absolutely different from the previous one (no same objects), but it is rather exception than the rule \ No newline at end of file + - We can use satellite providers, but we're limited right now to Google Maps, which could be outdated for some regions + - Number of photos could be up to 3000, usually in the 500-1500 range + - During the flight, UAVs can make sharp turns, so that the next photo may be absolutely different from the previous one (no same objects), but it is rather an exception than the rule + - Processing is done on a stationary computer or laptop with NVidia GPU at least RTX2060, better 3070. (For the UAV solution Jetson Orin Nano would be used, but that is out of scope.) \ No newline at end of file diff --git a/docs/01_solution/01_solution_draft.md b/docs/01_solution/01_solution_draft.md new file mode 100644 index 0000000..e4cc1e9 --- /dev/null +++ b/docs/01_solution/01_solution_draft.md @@ -0,0 +1,288 @@ +# **GEo-Referenced Trajectory and Object Localization System (GEORTOLS): A Hybrid SLAM Architecture** + +## **1. Executive Summary** + +This report outlines the technical design for a robust, real-time geolocalization system. The objective is to determine the precise GPS coordinates for a sequence of high-resolution images (up to 6252x4168) captured by a fixed-wing, non-stabilized Unmanned Aerial Vehicle (UAV) [User Query]. The system must operate under severe constraints, including the absence of any IMU data, a predefined altitude of no more than 1km, and knowledge of only the starting GPS coordinate [User Query]. 
The system is required to handle significant in-flight challenges, such as sharp turns with minimal image overlap (<5%), frame-to-frame outliers of up to 350 meters, and operation over low-texture terrain as seen in the provided sample images [User Query, Image 1, Image 7]. + +The proposed solution is a **Hybrid Visual-Geolocalization SLAM (VG-SLAM)** architecture. This system is designed to meet the demanding acceptance criteria, including a sub-5-second initial processing time per image, streaming output with asynchronous refinement, and high-accuracy GPS localization (60% of photos within 20m error, 80% within 50m error) [User Query]. + +This hybrid architecture is necessitated by the problem's core constraints. The lack of an IMU makes a purely monocular Visual Odometry (VO) system susceptible to catastrophic scale drift.1 Therefore, the system integrates two cooperative sub-systems: + +1. A **Visual Odometry (VO) Front-End:** This component uses state-of-the-art deep-learning feature matchers (SuperPoint + SuperGlue/LightGlue) to provide fast, real-time *relative* pose estimates. This approach is selected for its proven robustness in low-texture environments where traditional features fail.4 This component delivers the initial, sub-5-second pose estimate. +2. A **Cross-View Geolocalization (CVGL) Module:** This component provides *absolute*, drift-free GPS pose estimates by matching UAV images against the available satellite provider (Google Maps).7 It functions as the system's "global loop closure" mechanism, correcting the VO's scale drift and, critically, relocalizing the UAV after tracking is lost during sharp turns or outlier frames [User Query]. + +These two systems run in parallel. A **Back-End Pose-Graph Optimizer** fuses their respective measurements—high-frequency relative poses from VO and high-confidence absolute poses from CVGL—into a single, globally consistent, and incrementally refined trajectory. 
This architecture directly satisfies the requirements for immediate, streaming results and subsequent asynchronous refinement [User Query]. + +## **2. Product Solution Description and Component Interaction** + +### **Product Solution Description** + +The proposed system, "GEo-Referenced Trajectory and Object Localization System (GEORTOLS)," is a real-time, streaming-capable software solution. It is designed for deployment on a stationary computer or laptop equipped with an NVIDIA GPU (RTX 2060 or better) [User Query]. + +* **Inputs:** + 1. A sequence of consecutively named monocular images (FullHD to 6252x4168). + 2. The absolute GPS coordinate (Latitude, Longitude) of the *first* image in the sequence. + 3. A pre-calibrated camera intrinsic matrix. + 4. Access to the Google Maps satellite imagery API. +* **Outputs:** + 1. A real-time, streaming feed of estimated GPS coordinates (Latitude, Longitude, Altitude) and 6-DoF poses (including Roll, Pitch, Yaw) for the center of each image. + 2. Asynchronous refinement messages for previously computed poses as the back-end optimizer improves the global trajectory. + 3. A service to provide the absolute GPS coordinate for any user-selected pixel coordinate (u,v) within any geolocated image. + +### **Component Interaction Diagram** + +The system is architected as four asynchronous, parallel-processing components to meet the stringent real-time and refinement requirements. + +1. **Image Ingestion & Pre-processing:** This module acts as the entry point. It receives the new, high-resolution image (Image N). It immediately creates scaled-down, lower-resolution (e.g., 1024x768) copies of the image for real-time processing by the VO and CVGL modules, while retaining the full-resolution original for object-level GPS lookups. +2. **Visual Odometry (VO) Front-End:** This module's sole task is high-speed, frame-to-frame relative pose estimation. It maintains a short-term "sliding window" of features, matching Image N to Image N-1. 
It uses GPU-accelerated deep-learning models (SuperPoint + SuperGlue) to find feature matches and calculates the 6-DoF relative transform. This result is immediately sent to the Back-End. +3. **Cross-View Geolocalization (CVGL) Module:** This is a heavier, slower, asynchronous module. It takes the pre-processed Image N and queries the Google Maps database to find an *absolute* GPS pose. This involves a two-stage retrieval-and-match process. When a high-confidence match is found, its absolute pose is sent to the Back-End as a "global-pose constraint." +4. **Trajectory Optimization Back-End:** This is the system's central "brain," managing the complete pose graph.10 It receives two types of data: + * *High-frequency, low-confidence relative poses* from the VO Front-End. + * Low-frequency, high-confidence absolute poses from the CVGL Module. + It continuously fuses these constraints in a pose-graph optimization framework (e.g., g2o or Ceres Solver). When the VO Front-End provides a new relative pose, it is quickly added to the graph to produce the "Initial Pose" (<5s). When the CVGL Module provides a new absolute pose, it triggers a more comprehensive re-optimization of the entire graph, correcting drift and broadcasting "Refined Poses" to the user.11 + +## **3. Core Architectural Framework: Hybrid Visual-Geolocalization SLAM (VG-SLAM)** + +### **Rationale for the Hybrid Approach** + +The core constraints of this problem—monocular, IMU-less flight over potentially long distances (up to 3000 images at \~100m intervals equates to a 300km flight) [User Query]—render simple solutions unviable. + +A **VO-Only** system is guaranteed to fail. Monocular Visual Odometry (and SLAM) suffers from an inherent, unobservable ambiguity: the *scale* of the world.1 Because there is no IMU to provide an accelerometer-based scale reference or a gravity vector 12, the system has no way to know if it moved 1 meter or 10 meters. 
This leads to compounding scale drift, where the entire trajectory will grow or shrink over time.3 Over a 300km flight, the resulting positional error would be measured in kilometers, not the 20-50 meters required [User Query]. + +A **CVGL-Only** system is also unviable. Cross-View Geolocalization (CVGL) matches the UAV image to a satellite map to find an absolute pose.7 While this is drift-free, it is a large-scale image retrieval problem. Querying the entire map of Ukraine for a match for every single frame is computationally impossible within the <5 second time limit.13 Furthermore, this approach is brittle; if the Google Maps data is outdated (a specific user restriction) [User Query], the CVGL match will fail, and the system would have no pose estimate at all. + +Therefore, the **Hybrid VG-SLAM** architecture is the only robust solution. + +* The **VO Front-End** provides the fast, high-frequency relative motion. It works even if the satellite map is outdated, as it tracks features in the *real*, current world. +* The **CVGL Module** acts as the *only* mechanism for scale correction and absolute georeferencing. It provides periodic, drift-free "anchors" to the real-world GPS coordinates. +* The **Back-End Optimizer** fuses these two data streams. The CVGL poses function as "global loop closures" in the SLAM pose graph. They correct the scale drift accumulated by the VO and, critically, serve to relocalize the system after a "kidnapping" event, such as the specified sharp turns or 350m outliers [User Query]. + +### **Data Flow for Streaming and Refinement** + +This architecture is explicitly designed to meet the <5s initial output and asynchronous refinement criteria [User Query]. The data flow for a single image (Image N) is as follows: + +* **T \= 0.0s:** Image N (6200x4100) is received by the **Ingestion Module**. +* **T \= 0.2s:** Image N is pre-processed (scaled to 1024px) and passed to the VO and CVGL modules. 
+* **T \= 1.0s:** The **VO Front-End** completes GPU-accelerated matching (SuperPoint+SuperGlue) of Image N -> Image N-1. It computes the Relative_Pose(N-1 -> N). +* **T \= 1.1s:** The **Back-End Optimizer** receives this Relative_Pose. It appends this pose to the graph relative to the last known pose of N-1. +* **T \= 1.2s:** The Back-End broadcasts the **Initial Pose_N_Est** to the user interface. (**<5s criterion met**). +* **(Parallel Thread) T \= 1.5s:** The **CVGL Module** (on a separate thread) begins its two-stage search for Image N against the Google Maps database. +* **(Parallel Thread) T \= 6.0s:** The CVGL Module successfully finds a high-confidence Absolute_Pose_N_Abs from the satellite match. +* **T \= 6.1s:** The **Back-End Optimizer** receives this new, high-confidence absolute constraint for Image N. +* **T \= 6.2s:** The Back-End triggers a graph re-optimization. This new "anchor" corrects any scale or positional drift for Image N and all surrounding poses in the graph. +* **T \= 6.3s:** The Back-End broadcasts a **Pose_N_Refined** (and Pose_N-1_Refined, Pose_N-2_Refined, etc.) to the user interface. (**Refinement criterion met**). + +## **4. Component Analysis: Front-End (Visual Odometry and Relocalization)** + +The task of the VO Front-End is to rapidly and robustly estimate the 6-DoF relative motion between consecutive frames. This component's success is paramount for the high-frequency tracking required to meet the <5s criterion. + +The primary challenge is the nature of the imagery. The specified operational area and sample images (e.g., Image 1, Image 7) show vast, low-texture agricultural fields [User Query]. 
These environments are a known failure case for traditional, gradient-based feature extractors like SIFT or ORB, which rely on high-gradient corners and cannot find stable features in "weak texture areas".5 Furthermore, the non-stabilized camera [User Query] will introduce significant rotational motion and viewpoint change, breaking the assumptions of many simple trackers.16 + +Deep-learning (DL) based feature extractors and matchers have been developed specifically to overcome these "challenging visual conditions".5 Models like SuperPoint, SuperGlue, and LoFTR are trained to find more robust and repeatable features, even in low-texture scenes.4 + +### **Table 1: Analysis of State-of-the-Art Feature Extraction and Matching Techniques** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **SIFT + BFMatcher/FLANN** (OpenCV) | - Scale and rotation invariant. - High-quality, robust matches. - Well-studied and mature.15 | - Computationally slow (CPU-based). - Poor performance in low-texture or weakly-textured areas.14 - Patented (though expired). | - High-contrast, well-defined features. | **Poor.** Too slow for the <5s target and will fail to find features in the low-texture agricultural landscapes shown in sample images. | +| **ORB + BFMatcher** (OpenCV) | - Extremely fast and lightweight. - Standard for real-time SLAM (e.g., ORB-SLAM).21 - Rotation invariant. | - *Not* scale invariant (uses a pyramid). - Performs very poorly in low-texture scenes.5 - Unstable in high-blur scenarios. | - CPU, lightweight. - High-gradient corners. | **Very Poor.** While fast, it fails on the *robustness* requirement. It is designed for textured, indoor/urban scenes, not sparse, natural terrain. 
| +| **SuperPoint + SuperGlue** (PyTorch, C++/TensorRT) | - SOTA robustness in low-texture, high-blur, and challenging conditions.4 - End-to-end learning for detection and matching.24 - Multiple open-source SLAM integrations exist (e.g., SuperSLAM).25 | - Requires a powerful GPU for real-time performance. - Sparse feature-based (not dense). | - NVIDIA GPU (RTX 2060+). - PyTorch (research) or TensorRT (deployment).26 | **Excellent.** This approach is *designed* for the exact "challenging conditions" of this problem. It provides SOTA robustness in low-texture scenes.4 The user's hardware (RTX 2060+) meets the requirements. | +| **LoFTR** (PyTorch) | - Detector-free dense matching.14 - Extremely robust to viewpoint and texture challenges.14 - Excellent performance on natural terrain and low-overlap images.19 | - High computational and VRAM cost. - Can cause CUDA Out-of-Memory (OOM) errors on very high-resolution images.30 - Slower than sparse-feature methods. | - High-end NVIDIA GPU. - PyTorch. | **Good, but Risky.** While its robustness is excellent, its dense, Transformer-based nature makes it vulnerable to OOM errors on the 6252x4168 images.30 The sparse SuperPoint approach is a safer, more-scalable choice for the VO front-end. 
| + +### **Selected Approach (VO Front-End): SuperPoint + SuperGlue/LightGlue** + +The selected approach is a VO front-end based on **SuperPoint** for feature extraction and **SuperGlue** (or its faster successor, **LightGlue**) for matching.18 + +* **Robustness:** This combination is proven to provide superior robustness and accuracy in sparse-texture scenes, extracting more and higher-quality matches than ORB.4 +* **Performance:** It is designed for GPU acceleration and is used in SOTA real-time SLAM systems, demonstrating its feasibility within the <5s target on an RTX 2060.25 +* **Scalability:** As a sparse-feature method, it avoids the memory-scaling issues of dense matchers like LoFTR when faced with the user's maximum 6252x4168 resolution.30 The image can be downscaled for real-time VO, and SuperPoint will still find stable features. + +## **5. Component Analysis: Back-End (Trajectory Optimization and Refinement)** + +The task of the Back-End is to fuse all incoming measurements (high-frequency/low-accuracy relative VO poses, low-frequency/high-accuracy absolute CVGL poses) into a single, globally consistent trajectory. This component's design is dictated by the user's real-time streaming and refinement requirements [User Query]. + +A critical architectural choice must be made between a traditional, batch **Structure from Motion (SfM)** pipeline and a real-time **SLAM (Simultaneous Localization and Mapping)** pipeline. + +* **Batch SfM:** (e.g., COLMAP).32 This approach is an offline process. It collects all 1500-3000 images, performs feature matching, and then runs a large, non-real-time "Bundle Adjustment" (BA) to solve for all camera poses and 3D points simultaneously.35 While this produces the most accurate possible result, it can take hours to compute. It *cannot* meet the <5s/image or "immediate results" criteria. +* **Real-time SLAM:** (e.g., ORB-SLAM3).28 This approach is *online* and *incremental*. 
It maintains a "pose graph" of the trajectory.10 It provides an immediate pose estimate based on the VO front-end. When a new, high-quality measurement arrives (like a loop closure 37, or in our case, a CVGL fix), it triggers a fast re-optimization of the graph, publishing a *refined* result.11 + +The user's requirements for "results...appear immediately" and "system could refine existing calculated results" [User Query] are a textbook description of a real-time SLAM back-end. + +### **Table 2: Analysis of Trajectory Optimization Strategies** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **Incremental SLAM (Pose-Graph Optimization)** (g2o, Ceres Solver, GTSAM) | - **Real-time / Online:** Provides immediate pose estimates. - **Supports Refinement:** Explicitly designed to refine past poses when new "loop closure" (CVGL) data arrives.10 - Meets the <5s and streaming criteria. | - Initial estimate is less accurate than a full batch process. - Susceptible to drift *until* a loop closure (CVGL fix) is made. | - A graph optimization library (g2o, Ceres). - A robust cost function to reject outliers. | **Excellent.** This is the *only* architecture that satisfies the user's real-time streaming and asynchronous refinement constraints. | +| **Batch Structure from Motion (Global Bundle Adjustment)** (COLMAP, Agisoft Metashape) | - **Globally Optimal Accuracy:** Produces the most accurate possible 3D reconstruction and trajectory.35 - Can import custom DL matches.38 | - **Offline:** Cannot run in real-time or stream results. - High computational cost (minutes to hours). - Fails all timing and streaming criteria. | - All images must be available before processing starts. - High RAM and CPU. | **Unsuitable (for the *online* system).** This approach is ideal for an *optional, post-flight, high-accuracy* refinement, but it cannot be the primary system. 
| + +### **Selected Approach (Back-End): Incremental Pose-Graph Optimization (g2o/Ceres)** + +The system's back-end will be built as an **Incremental Pose-Graph Optimizer** using a library like **g2o** or **Ceres Solver**. This is the only way to meet the real-time streaming and refinement constraints [User Query]. + +The graph will contain: + +* **Nodes:** The 6-DoF pose of each camera frame. +* **Edges (Constraints):** + 1. **Odometry Edges:** Relative 6-DoF transforms from the VO Front-End (SuperPoint+SuperGlue). These are high-frequency but have accumulating drift/scale error. + 2. **Georeferencing Edges:** Absolute 6-DoF poses from the CVGL Module. These are low-frequency but are drift-free and provide the absolute scale. + 3. **Start-Point Edge:** A high-confidence absolute pose for Image 1, fixed to the user-provided start GPS. + +This architecture allows the system to provide an immediate estimate (from odometry) and then drastically improve its accuracy (correcting scale and drift) whenever a new georeferencing edge is added. + +## **6. Component Analysis: Global-Pose Correction (Georeferencing Module)** + +This module is the most critical component for meeting the accuracy requirements. Its task is to provide absolute GPS pose estimates by matching the UAV's nadir-pointing-but-non-stabilized images to the Google Maps satellite provider [User Query]. This is the only component that can correct the monocular scale drift. + +This task is known as **Cross-View Geolocalization (CVGL)**.7 It is extremely challenging due to the "domain gap" 44 between the two image sources: + +1. **Viewpoint:** The UAV is at low altitude (<1km) and non-nadir (due to fixed-wing tilt) 45, while the satellite is at a very high altitude and is perfectly nadir. +2. **Appearance:** The images come from different sensors, with different lighting (shadows), and at different times. 
The Google Maps data may be "outdated" [User Query], showing different seasons, vegetation, or man-made structures.47 + +A simple, brute-force feature match is computationally impossible. The solution is a **hierarchical, two-stage approach** that mimics SOTA research 7: + +* **Stage 1: Coarse Retrieval.** We cannot run expensive matching against the entire map. Instead, we treat this as an image retrieval problem. We use a Deep Learning model (e.g., a Siamese or Dual CNN trained on this task 50) to generate a compact "embedding vector" (a digital signature) for the UAV image. In an offline step, we pre-compute embeddings for *all* satellite map tiles in the operational area. The UAV image's embedding is then used to perform a very fast (e.g., FAISS library) similarity search against the satellite database, returning the Top-K most likely-matching satellite tiles. +* **Stage 2: Fine-Grained Pose.** *Only* for these Top-K candidates do we perform the heavy-duty feature matching. We use our selected **SuperPoint+SuperGlue** matcher 53 to find precise correspondences between the UAV image and the K satellite tiles. If a high-confidence geometric match (e.g., >50 inliers) is found, we can compute the precise 6-DoF pose of the UAV relative to that tile, thus yielding an absolute GPS coordinate. + +### **Table 3: Analysis of State-of-the-Art Cross-View Geolocalization (CVGL) Techniques** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **Coarse Retrieval (Siamese/Dual CNNs)** (PyTorch, ResNet18) | - Extremely fast for retrieval (database lookup). - Learns features robust to seasonal and appearance changes.50 - Narrows search space from millions to a few. | - Does *not* provide a precise 6-DoF pose, only a "best match" tile. - Requires training on a dataset of matched UAV-satellite pairs. 
| - Pre-trained model (e.g., on ResNet18).52 - Pre-computed satellite embedding database. | **Essential (as Stage 1).** This is the only computationally feasible way to "find" the UAV on the map. | +| **Fine-Grained Feature Matching** (SuperPoint + SuperGlue) | - Provides a highly-accurate 6-DoF pose estimate.53 - Re-uses the same robust matcher from the VO Front-End.54 | - Too slow to run on the entire map. - *Requires* a good initial guess (from Stage 1) to be effective. | - NVIDIA GPU. - Top-K candidate tiles from Stage 1. | **Essential (as Stage 2).** This is the component that actually computes the precise GPS pose from the coarse candidates. | +| **End-to-End DL Models (Transformers)** (PFED, ReCOT, etc.) | - SOTA accuracy in recent benchmarks.13 - Can be highly efficient (e.g., PFED).13 - Can perform retrieval and pose estimation in one model. | - Often research-grade, not robustly open-sourced. - May be complex to train and deploy. - Less modular and harder to debug than the two-stage approach. | - Specific, complex model architectures.13 - Large-scale training datasets. | **Not Recommended (for initial build).** While powerful, these are less practical for a version 1 build. The two-stage approach is more modular, debuggable, and uses components already required by the VO system. | + +### **Selected Approach (CVGL Module): Hierarchical Retrieval + Matching** + +The CVGL module will be implemented as a two-stage hierarchical system: + +1. **Stage 1 (Coarse):** A **Siamese CNN** 52 (or similar model) generates an embedding for the UAV image. This embedding is used to retrieve the Top-5 most similar satellite tiles from a pre-computed database. +2. **Stage 2 (Fine):** The **SuperPoint+SuperGlue** matcher 53 is run between the UAV image and these 5 tiles. The match with the highest inlier count and lowest reprojection error is used to calculate the absolute 6-DoF pose, which is then sent to the Back-End optimizer. + +## **7. 
Addressing Critical Acceptance Criteria and Failure Modes** + +This hybrid architecture's logic is designed to handle the most difficult acceptance criteria [User Query] through a robust, multi-stage escalation process. + +### **Stage 1: Initial State (Normal Operation)** + +* **Condition:** VO(N-1 -> N) succeeds. +* **System Logic:** The **VO Front-End** provides the high-frequency relative pose. This is added to the graph, and the **Initial Pose** is sent to the user (<5s). +* **Resolution:** The **CVGL Module** runs asynchronously to provide a Refined Pose later, which corrects for scale drift. + +### **Stage 2: Transient Failure / Outlier Handling (AC-3)** + +* **Condition:** VO(N-1 -> N) fails (e.g., >350m jump, severe motion blur, low overlap) [User Query]. This triggers an immediate, high-priority CVGL(N) query. +* **System Logic:** + 1. If CVGL(N) *succeeds*, the system has conflicting data: a failed VO link and a successful CVGL pose. The **Back-End Optimizer** uses a robust kernel to reject the high-error VO link as an outlier and accepts the CVGL pose.56 The trajectory "jumps" to the correct location, and VO resumes from Image N+1. + 2. If CVGL(N) *also fails* (e.g., due to cloud cover or outdated map), the system assumes Image N is a single bad frame (an outlier). +* **Resolution (Frame Skipping):** The system buffers Image N and, upon receiving Image N+1, the **VO Front-End** attempts to "bridge the gap" by matching VO(N-1 -> N+1). + * **If successful,** a pose for N+1 is found. Image N is marked as a rejected outlier, and the system continues. + * **If VO(N-1 -> N+1) fails,** it repeats for VO(N-1 -> N+2). + * If this "bridging" fails for 3 consecutive frames, the system concludes it is not a transient outlier but a persistent tracking loss. This escalates to Stage 3. 
+ +### **Stage 3: Persistent Tracking Loss / Sharp Turn Handling (AC-4)** + +* **Condition:** VO tracking is lost, and the "frame-skipping" in Stage 2 fails (e.g., a "sharp turn" with no overlap) [User Query]. +* **System Logic (Multi-Map "Chunking"):** The **Back-End Optimizer** declares a "Tracking Lost" state and creates a *new, independent map* ("Chunk 2"). + * The **VO Front-End** is re-initialized and begins populating this new chunk, tracking VO(N+3 -> N+4), VO(N+4 -> N+5), etc. This new chunk is internally consistent but has no absolute GPS position (it is "floating"). +* **Resolution (Asynchronous Relocalization):** + 1. The **CVGL Module** now runs asynchronously on all frames in this new "Chunk 2". + 2. Crucially, it uses the last known GPS coordinate from "Chunk 1" as a *search prior*, narrowing the satellite map search area to the vicinity. + 3. The system continues to build Chunk 2 until the CVGL module successfully finds a high-confidence Absolute_Pose for *any* frame in that chunk (e.g., for Image N+20). + 4. Once this single GPS "anchor" is found, the **Back-End Optimizer** performs a full graph optimization. It calculates the 7-DoF transformation (3D position, 3D rotation, and **scale**) to align all of Chunk 2 and merge it with Chunk 1. + 5. This "chunking" method robustly handles the "correctly continue the work" criterion by allowing the system to keep tracking locally even while globally lost, confident it can merge the maps later. + +### **Stage 4: Catastrophic Failure / User Intervention (AC-6)** + +* **Condition:** The system has entered Stage 3 and is building "Chunk 2," but the **CVGL Module** has *also* failed for a prolonged period (e.g., 20% of the route, or 50+ consecutive frames) [User Query]. This is a "worst-case" scenario where the UAV is in an area with no VO features (e.g., over a lake) *and* no CVGL features (e.g., heavy clouds or outdated maps). +* **System Logic:** The system is "absolutely incapable" of determining its pose. 
+* **Resolution (User Input):** The system triggers the "ask the user for input" event. A UI prompt will show the last known good image (from Chunk 1) on the map and the new, "lost" image (e.g., N+50). It will ask the user to "Click on the map to provide a coarse location." This user-provided GPS point is then fed to the CVGL module as a *strong prior*, drastically narrowing the search space and enabling it to re-acquire a lock. + +## **8. Implementation and Output Generation** + +### **Real-time Workflow (<5s Initial, Async Refinement)** + +A concrete implementation plan for processing Image N: + +1. **T=0.0s:** Image[N] (6200px) received. +2. **T=0.1s:** Image pre-processed: Scaled to 1024px for VO/CVGL. Full-res original stored. +3. **T=0.5s:** **VO Front-End** (GPU): SuperPoint features extracted for 1024px image. +4. **T=1.0s:** **VO Front-End** (GPU): SuperGlue matches 1024px Image[N] -> 1024px Image[N-1]. Relative_Pose (6-DoF) estimated via RANSAC/PnP. +5. **T=1.1s:** **Back-End:** Relative_Pose added to graph. Optimizer updates trajectory. +6. **T=1.2s:** **OUTPUT:** Initial Pose_N_Est (GPS) sent to user. **(<5s criterion met)**. +7. **T=1.3s:** **CVGL Module (Async Task)** (GPU): Siamese/Dual CNN generates embedding for 1024px Image[N]. +8. **T=1.5s:** **CVGL Module (Async Task):** Coarse retrieval (FAISS lookup) returns Top-5 satellite tile candidates. +9. **T=4.0s:** **CVGL Module (Async Task)** (GPU): Fine-grained matching. SuperPoint+SuperGlue runs 5 times (Image[N] vs. 5 satellite tiles). +10. **T=4.5s:** **CVGL Module (Async Task):** A high-confidence match is found. Absolute_Pose_N_Abs (6-DoF) is computed. +11. **T=4.6s:** **Back-End:** High-confidence Absolute_Pose_N_Abs added to pose graph. Graph re-optimization is triggered. +12. **T=4.8s:** **OUTPUT:** Pose_N_Refined (GPS) sent to user. **(Refinement criterion met)**. 
+ +### **Determining Object-Level GPS (from Pixel Coordinate)** + +The requirement to find the "coordinates of the center of any object in these photos" [User Query] is met by projecting a pixel to its 3D world coordinate. This requires the (u,v) pixel, the camera's 6-DoF pose, and the camera's intrinsic matrix (K). + +Two methods will be implemented to support the streaming/refinement architecture: + +1. **Method 1 (Immediate, <5s): Flat-Earth Projection.** + * When the user clicks pixel (u,v) on Image[N], the system uses the *Initial Pose_N_Est*. + * It assumes the ground is a flat plane at a known elevation, giving a fixed camera height above ground (e.g., 900m above the ground plane when flying at 1km altitude over terrain at 100m elevation) [User Query]. + * It computes the 3D ray from the camera center through (u,v) using the intrinsic matrix (K). + * It calculates the 3D intersection point of this ray with the flat ground plane. + * This 3D world point is converted to a GPS coordinate and sent to the user. This is very fast but less accurate in non-flat terrain. +2. **Method 2 (Refined, Post-BA): Structure-from-Motion Projection.** + * The Back-End's pose-graph optimization, as a byproduct, will create a sparse 3D point cloud of the world (i.e., the "SfM" part of SLAM).35 + * When the user clicks (u,v), the system uses the *Pose_N_Refined*. + * It raycasts from the camera center through (u,v) and finds the 3D intersection point with the *actual 3D point cloud* generated by the system. + * This 3D point's coordinate (X,Y,Z) is converted to GPS. This is far more accurate as it accounts for real-world topography (hills, ditches) captured in the 3D map. + +## **9. Testing and Validation Strategy** + +A rigorous testing strategy is required to validate all 10 acceptance criteria. The foundation of this strategy is the creation of a **Ground-Truth Test Dataset**. This will involve flying several test routes and manually creating a "checkpoint" (CP) file, similar to the provided coordinates.csv 58, using a high-precision RTK/PPK GPS. 
This provides the "real GPS" for validation.59 + +### **Accuracy Validation Methodology (AC-1, AC-2, AC-5, AC-8, AC-9)** + +These tests validate the system's accuracy and completion metrics.59 + +1. A test flight of 1000 images with high-precision ground-truth CPs is prepared. +2. The system is run given only the first GPS coordinate. +3. A test script compares the system's *final refined GPS output* for each image against its *ground-truth CP*. The Haversine distance (error in meters) is calculated for all 1000 images. +4. This yields a list of 1000 error values. +5. **Test_Accuracy_50m (AC-1):** ASSERT (count(errors < 50m) / 1000) >= 0.80 +6. **Test_Accuracy_20m (AC-2):** ASSERT (count(errors < 20m) / 1000) >= 0.60 +7. **Test_Outlier_Rate (AC-5):** ASSERT (count(un-localized_images) / 1000) < 0.10 +8. **Test_Image_Registration_Rate (AC-8):** ASSERT (count(localized_images) / 1000) > 0.95 +9. **Test_Mean_Reprojection_Error (AC-9):** ASSERT (Back-End.final_MRE) < 1.0 +10. **Test_RMSE:** The overall Root Mean Square Error (RMSE) of the entire trajectory will be calculated as a primary performance benchmark.59 + +### **Integration and Functional Tests (AC-3, AC-4, AC-6)** + +These tests validate the system's logic and robustness to failure modes.62 + +* Test_Low_Overlap_Relocalization (AC-4): + * **Setup:** Create a test sequence of 50 images. From this, manually delete images 20-24 (simulating 5 lost frames during a sharp turn).63 + * **Test:** Run the system on this "broken" sequence. + * **Pass/Fail:** The system must report "Tracking Lost" at frame 20, initiate a new "chunk," and then "Tracking Re-acquired" and "Maps Merged" when the CVGL module successfully localizes frame 25 (or a subsequent frame). The final trajectory error for frame 25 must be < 50m. +* Test_350m_Outlier_Rejection (AC-3): + * **Setup:** Create a test sequence. At image 30, insert a "rogue" image (Image 30b) known to be 350m away. 
+ * **Test:** Run the system on this sequence (..., 29, 30, 30b, 31,...). + * **Pass/Fail:** The system must correctly identify Image 30b as an outlier (RANSAC failure 56), reject it (or jump to its CVGL-verified pose), and "correctly continue the work" by successfully tracking Image 31 from Image 30 (using the frame-skipping logic). The trajectory must not be corrupted. +* Test_User_Intervention_Prompt (AC-6): + * **Setup:** Create a test sequence with 50 consecutive "bad" frames (e.g., pure sky, lens cap) to ensure that both the transient-failure (frame-skipping) and chunking recovery mechanisms are attempted and exhausted. + * **Test:** Run the system. + * **Pass/Fail:** The system must enter a "LOST" state, attempt and fail to relocalize via CVGL for 50 frames, and then correctly trigger the "ask for user input" event. + +### **Non-Functional Tests (AC-7, AC-8, Hardware)** + +These tests validate performance and resource requirements.66 + +* Test_Performance_Per_Image (AC-7): + * **Setup:** Run the 1000-image test set on the minimum-spec RTX 2060. + * **Test:** Measure the time from "Image In" to "Initial Pose Out" for every frame. + * **Pass/Fail:** ASSERT average_time < 5.0s. +* Test_Streaming_Refinement (AC-8): + * **Setup:** Run the 1000-image test set. + * **Test:** A logger must verify that *two* poses are received for >80% of images: an "Initial" pose (T < 5s) and a "Refined" pose (T > 5s, after CVGL). + * **Pass/Fail:** The refinement mechanism is functioning correctly. +* Test_Scalability_Large_Route (Constraints): + * **Setup:** Run the system on a full 3000-image dataset. + * **Test:** Monitor system RAM, VRAM, and processing time per frame over the entire run. + * **Pass/Fail:** The system must complete the run without memory leaks, and the processing time per image must not degrade significantly as the pose graph grows. 
diff --git a/docs/01_solution/01_solution_draft_claude.md b/docs/01_solution/01_solution_draft_claude.md deleted file mode 100644 index 6221cf0..0000000 --- a/docs/01_solution/01_solution_draft_claude.md +++ /dev/null @@ -1,559 +0,0 @@ -# UAV Aerial Image Geolocation System - Solution Draft - -## 1. Product Solution Description - -### Overview -The system is a **hybrid Visual Odometry + Cross-View Matching pipeline** for GPS-denied aerial image geolocation. It combines: -- **Incremental Visual Odometry (VO)** for relative pose estimation between consecutive frames -- **Periodic Satellite Map Registration** to correct accumulated drift -- **Structure from Motion (SfM)** for trajectory refinement -- **Deep Learning-based Cross-View Matching** for absolute geolocation - -### Core Components - -#### 1.1 Visual Odometry Pipeline -Modern visual odometry approaches for UAVs use downward-facing cameras to track motion by analyzing changes in feature positions between consecutive frames, with correction methods using satellite imagery to reduce accumulated error. - -**Key Features:** -- Monocular camera with planar ground assumption -- Feature tracking using modern deep learning approaches -- Scale recovery using altitude information (≤1km) -- Drift correction via satellite image matching - -#### 1.2 Cross-View Matching Engine -Cross-view geolocation matches aerial UAV images with georeferenced satellite images through coarse-to-fine matching stages, using deep learning networks to handle scale and illumination differences. - -**Workflow:** -1. **Coarse Matching**: Global descriptor extraction (NetVLAD) to find candidate regions -2. **Fine Matching**: Local feature matching within candidates -3. 
**Pose Estimation**: Homography/EPnP+RANSAC for geographic pose - -#### 1.3 Structure from Motion (SfM) -Structure from Motion uses multiple overlapping images to reconstruct 3D structure and camera poses, automatically performing camera calibration and requiring only 60% vertical overlap between images. - -**Implementation:** -- Bundle adjustment for trajectory optimization -- Incremental reconstruction for online processing -- Multi-view stereo for terrain modeling (optional) - -## 2. Architecture Approach - -### 2.1 System Architecture - -``` -┌─────────────────────────────────────────────────────────────┐ -│ Input Layer │ -│ - Sequential UAV Images (500-3000) │ -│ - Starting GPS Coordinates │ -│ - Flight Metadata (altitude, camera params) │ -└──────────────────┬──────────────────────────────────────────┘ - │ -┌──────────────────▼──────────────────────────────────────────┐ -│ Feature Extraction Module │ -│ ┌──────────────────────────────────────────────────────┐ │ -│ │ Primary: SuperPoint + LightGlue (GPU) │ │ -│ │ Fallback: SIFT + FLANN (CPU) │ │ -│ │ Target: 1024-2048 keypoints/image │ │ -│ └──────────────────────────────────────────────────────┘ │ -└──────────────────┬──────────────────────────────────────────┘ - │ -┌──────────────────▼──────────────────────────────────────────┐ -│ Sequential Processing Pipeline │ -│ │ -│ ┌────────────────────────────────────────┐ │ -│ │ 1. Visual Odometry Tracker │ │ -│ │ - Frame-to-frame matching │ │ -│ │ - Relative pose estimation │ │ -│ │ - Scale recovery (altitude) │ │ -│ │ - Outlier detection (350m check) │ │ -│ └──────────────┬─────────────────────────┘ │ -│ │ │ -│ ┌──────────────▼─────────────────────────┐ │ -│ │ 2. Incremental SfM (COLMAP-based) │ │ -│ │ - Bundle adjustment every N frames │ │ -│ │ - Track management │ │ -│ │ - Camera pose refinement │ │ -│ └──────────────┬─────────────────────────┘ │ -│ │ │ -│ ┌──────────────▼─────────────────────────┐ │ -│ │ 3. 
Satellite Registration Module │ │ -│ │ - Triggered every 10-20 frames │ │ -│ │ - Cross-view matching │ │ -│ │ - Drift correction │ │ -│ │ - GPS coordinate assignment │ │ -│ └──────────────┬─────────────────────────┘ │ -└─────────────────┼─────────────────────────────────────────┘ - │ -┌─────────────────▼─────────────────────────────────────────┐ -│ Fallback & Quality Control │ -│ - Sharp turn detection (overlap <5%) │ -│ - User intervention request (<20% failure cases) │ -│ - Quality metrics logging (MRE, registration rate) │ -└─────────────────┬─────────────────────────────────────────┘ - │ -┌─────────────────▼─────────────────────────────────────────┐ -│ Output Layer │ -│ - GPS coordinates for each image center │ -│ - 6-DoF camera poses │ -│ - Confidence scores │ -│ - Sparse 3D point cloud │ -└────────────────────────────────────────────────────────────┘ -``` - -### 2.2 Technical Implementation - -#### Feature Extraction & Matching -LightGlue provides efficient local feature matching with adaptive inference, processing at 150 FPS for 1024 keypoints and outperforming SuperGlue in both speed and accuracy, making it suitable for real-time applications. - -**Primary Stack:** -- **Feature Detector**: SuperPoint (256-D descriptors, rotation invariant) -- **Feature Matcher**: LightGlue (adaptive inference, early termination) -- **Alternative**: DISK + LightGlue for better outdoor performance - -**Configuration:** -```python -# SuperPoint + LightGlue configuration -extractor = SuperPoint(max_num_keypoints=1024) -matcher = LightGlue( - features='superpoint', - depth_confidence=0.9, - width_confidence=0.95, - flash_attention=True # 4-10x speedup -) -``` - -#### Visual Odometry Component -Visual odometry for high-altitude flights often assumes locally flat ground and solves motion through planar homography between ground images, with the scale determined by vehicle elevation. - -**Method:** -1. Extract features from consecutive frames (i, i+1) -2. 
Match features using LightGlue -3. Apply RANSAC for outlier rejection -4. Compute essential matrix -5. Recover relative pose (R, t) -6. Scale using altitude: `scale = altitude / focal_length` -7. Update trajectory - -**Outlier Handling:** -- Distance check: reject if displacement >350m between consecutive frames -- Overlap check: require >5% feature overlap or trigger satellite matching -- Angle threshold: <50° rotation between frames - -#### Cross-View Satellite Matching -Cross-view geolocation uses transformers with self-attention and cross-attention mechanisms to match drone images with satellite imagery, employing coarse-to-fine strategies with global descriptors like NetVLAD. - -**Architecture:** -``` -Offline Preparation: -1. Download Google Maps tiles for flight region -2. Build spatial quad-tree index -3. Extract NetVLAD global descriptors (4096-D) -4. Store in efficient retrieval database - -Online Processing (every 10-20 frames): -1. Extract global descriptor from current aerial image -2. Retrieve top-K candidates (K=5-10) using L2 distance -3. Fine matching using local features (SuperPoint+LightGlue) -4. Homography estimation with RANSAC -5. GPS coordinate calculation -6. Apply correction to trajectory -``` - -#### Bundle Adjustment -COLMAP provides incremental Structure-from-Motion with automatic camera calibration and bundle adjustment, reconstructing 3D structure and camera poses from overlapping images. 
- -**Strategy:** -- **Local BA**: Every 20 frames (maintain <2s processing time) -- **Global BA**: After every 100 frames or satellite correction -- **Fixed Parameters**: Altitude constraint, camera intrinsics (if known) -- **Optimization**: Ceres Solver with Levenberg-Marquardt - -### 2.3 Meeting Acceptance Criteria - -| Criterion | Implementation Strategy | -|-----------|------------------------| -| 80% within 50m accuracy | VO + Satellite correction every 10-20 frames | -| 60% within 20m accuracy | Fine-tuned cross-view matching + bundle adjustment | -| Handle 350m outliers | RANSAC outlier rejection + distance threshold | -| Handle sharp turns (<5% overlap) | Trigger satellite matching, skip VO | -| <10% satellite outliers | Confidence scoring + verification matches | -| User fallback (20% cases) | Automatic detection + GUI for manual GPS input | -| <2 seconds per image | GPU acceleration, adaptive LightGlue, parallel processing | -| >95% registration rate | Robust feature matching + multiple fallback strategies | -| MRE <1.0 pixels | Iterative bundle adjustment + outlier filtering | - -### 2.4 Technology Stack - -**Core Libraries:** -- **COLMAP**: SfM and bundle adjustment -- **Kornia/PyTorch**: Deep learning feature extraction/matching -- **OpenCV**: Image processing and classical CV -- **NumPy/SciPy**: Numerical computations -- **GDAL**: Geospatial data handling - -**Recommended Hardware:** -- **CPU**: 8+ cores (Intel i7/AMD Ryzen 7) -- **GPU**: NVIDIA RTX 3080 or better (12GB+ VRAM) -- **RAM**: 32GB minimum -- **Storage**: SSD for fast I/O - -## 3. Testing Strategy - -### 3.1 Functional Testing - -#### 3.1.1 Feature Extraction & Matching Tests -**Objective**: Verify robust feature detection and matching - -**Test Cases:** -1. **Varied Illumination** - - Sunny conditions (baseline) - - Overcast conditions - - Shadow-heavy areas - - Different times of day - -2. 
**Terrain Variations** - - Urban areas (buildings, roads) - - Rural areas (fields, forests) - - Mixed terrain - - Water bodies - -3. **Image Quality** - - FullHD (1920×1080) - - 4K (3840×2160) - - Maximum resolution (6252×4168) - - Simulated motion blur - -**Metrics:** -- Number of keypoints detected per image -- Matching ratio (inliers/total matches) -- Repeatability score -- Processing time per image - -**Tools:** -- Custom Python test suite -- Benchmark datasets (MegaDepth, HPatches) - -#### 3.1.2 Visual Odometry Tests -**Objective**: Validate trajectory estimation accuracy - -**Test Cases:** -1. **Normal Flight Path** - - Straight line flight (100m spacing) - - Gradual turns (>20% overlap) - - Consistent altitude - -2. **Challenging Scenarios** - - Sharp turns (trigger satellite matching) - - Variable altitude (if applicable) - - Low-texture areas (fields) - - Repetitive structures (urban grid) - -3. **Outlier Handling** - - Inject 350m displacement - - Non-overlapping consecutive frames - - Verify recovery mechanism - -**Metrics:** -- Relative pose error (rotation and translation) -- Trajectory drift (compared to ground truth) -- Recovery time after outlier -- Scale estimation accuracy - -#### 3.1.3 Cross-View Matching Tests -**Objective**: Ensure accurate satellite registration - -**Test Cases:** -1. **Scale Variations** - - Different altitudes (500m, 750m, 1000m) - - Various GSD (Ground Sample Distance) - -2. **Environmental Changes** - - Temporal differences (satellite data age) - - Seasonal variations - - Construction/development changes - -3. **Geographic Regions** - - Test on multiple locations in Eastern/Southern Ukraine - - Urban vs rural performance - - Different Google Maps update frequencies - -**Metrics:** -- Localization accuracy (meters) -- Retrieval success rate (top-K candidates) -- False positive rate -- Processing time per registration - -#### 3.1.4 Integration Tests -**Objective**: Validate end-to-end pipeline - -**Test Cases:** -1. 
**Complete Flight Sequences** - - Process 500-image dataset - - Process 1500-image dataset - - Process 3000-image dataset - -2. **User Fallback Mechanism** - - Simulate failure cases - - Test manual GPS input interface - - Verify trajectory continuation - -3. **Sharp Turn Recovery** - - Multiple consecutive sharp turns - - Recovery after extended non-overlap - -**Metrics:** -- Overall GPS accuracy (80% within 50m, 60% within 20m) -- Total processing time -- User intervention frequency -- System stability (memory usage, crashes) - -### 3.2 Non-Functional Testing - -#### 3.2.1 Performance Testing -**Objective**: Meet <2 seconds per image requirement - -**Test Scenarios:** -1. **Processing Speed** - - Measure per-image processing time - - Identify bottlenecks (profiling) - - Test with different hardware configurations - -2. **Scalability** - - 500 images - - 1500 images - - 3000 images - - Monitor memory usage and CPU/GPU utilization - -3. **Optimization** - - GPU vs CPU performance - - Batch processing efficiency - - Parallel processing gains - -**Tools:** -- Python cProfile -- NVIDIA Nsight -- Memory profilers - -**Target Metrics:** -- Average: <1.5 seconds per image -- 95th percentile: <2.0 seconds per image -- Peak memory: <16GB RAM - -#### 3.2.2 Accuracy Testing -**Objective**: Validate GPS accuracy requirements - -**Methodology:** -1. **Ground Truth Collection** - - Use high-accuracy GNSS/RTK measurements - - Collect control points throughout flight path - - Minimum 50 ground truth points per test flight - -2. **Error Analysis** - - Calculate 2D position error for each image - - Generate error distribution histograms - - Identify systematic errors - -3. 
**Statistical Validation** - - Verify 80% within 50m threshold - - Verify 60% within 20m threshold - - Calculate RMSE, mean, and median errors - -**Test Flights:** -- Minimum 10 different flights -- Various conditions (time of day, terrain) -- Different regions in operational area - -#### 3.2.3 Robustness Testing -**Objective**: Ensure system reliability under adverse conditions - -**Test Cases:** -1. **Image Registration Rate** - - Target: >95% successful registration - - Test with challenging image sequences - - Analyze failure modes - -2. **Mean Reprojection Error** - - Target: <1.0 pixels - - Test bundle adjustment convergence - - Verify 3D point quality - -3. **Outlier Detection** - - Inject various outlier types - - Measure detection rate - - Verify no false negatives (missed outliers) - -4. **Satellite Map Quality** - - Test with outdated satellite imagery - - Regions with limited coverage - - Urban development changes - -#### 3.2.4 Stress Testing -**Objective**: Test system limits and failure modes - -**Scenarios:** -1. **Extreme Conditions** - - Maximum 3000 images - - Highest resolution (6252×4168) - - Extended flight duration - -2. **Resource Constraints** - - Limited GPU memory - - CPU-only processing - - Concurrent processing tasks - -3. 
**Edge Cases** - - All images in same location (no motion) - - Completely featureless terrain - - Extreme weather effects (if data available) - -### 3.3 Test Data Requirements - -#### 3.3.1 Synthetic Data -**Purpose**: Controlled testing environment - -**Generation:** -- Simulate flights using game engines (Unreal Engine/Unity) -- Generate ground truth poses -- Vary parameters (altitude, speed, terrain) -- Add realistic noise and artifacts - -#### 3.3.2 Real-World Data -**Collection Requirements:** -- 10+ flights with ground truth GPS -- Diverse terrains (urban, rural, mixed) -- Different times of day -- Various weather conditions (within restrictions) -- Coverage across operational area - -**Annotation:** -- Manual verification of GPS coordinates -- Quality ratings for each image -- Terrain type classification -- Known challenging sections - -### 3.4 Continuous Testing Strategy - -#### 3.4.1 Unit Tests -- Feature extraction modules -- Matching algorithms -- Coordinate transformations -- Utility functions -- >80% code coverage target - -#### 3.4.2 Integration Tests -- Component interactions -- Data flow validation -- Error handling -- API consistency - -#### 3.4.3 Regression Tests -- Performance benchmarks -- Accuracy baselines -- Automated on each code change -- Prevent degradation - -#### 3.4.4 Test Automation -**CI/CD Pipeline:** -```yaml -Pipeline: - 1. Code commit - 2. Unit tests (pytest) - 3. Integration tests - 4. Performance benchmarks - 5. Generate test report - 6. 
Deploy if all pass -``` - -**Tools:** -- pytest for Python testing -- GitHub Actions / GitLab CI -- Docker for environment consistency -- Custom validation scripts - -### 3.5 Test Metrics & Success Criteria - -| Metric | Target | Test Method | -|--------|--------|-------------| -| GPS Accuracy (50m) | 80% | Real flight validation | -| GPS Accuracy (20m) | 60% | Real flight validation | -| Processing Speed | <2s/image | Performance profiling | -| Registration Rate | >95% | Feature matching tests | -| MRE | <1.0 pixels | Bundle adjustment analysis | -| Outlier Detection | >99% | Synthetic outlier injection | -| User Intervention | <20% | Complete flight processing | -| System Uptime | >99% | Stress testing | - -### 3.6 Test Documentation - -**Required Documentation:** -1. **Test Plan**: Comprehensive testing strategy -2. **Test Cases**: Detailed test scenarios and steps -3. **Test Data**: Description and location of datasets -4. **Test Results**: Logs, metrics, and analysis -5. **Bug Reports**: Issue tracking and resolution -6. **Performance Reports**: Benchmarking results -7. **User Acceptance Testing**: Validation with stakeholders - -### 3.7 Best Practices - -1. **Iterative Testing**: Test early and often throughout development -2. **Realistic Data**: Use real flight data as much as possible -3. **Version Control**: Track test data and results -4. **Reproducibility**: Ensure tests can be replicated -5. **Automation**: Automate repetitive tests -6. **Monitoring**: Continuous performance tracking -7. **Feedback Loop**: Incorporate test results into development - -## 4. 
Implementation Roadmap - -### Phase 1: Core Development (Weeks 1-4) -- Feature extraction pipeline (SuperPoint/LightGlue) -- Visual odometry implementation -- Basic bundle adjustment integration - -### Phase 2: Cross-View Matching (Weeks 5-8) -- Satellite tile download and indexing -- NetVLAD descriptor extraction -- Coarse-to-fine matching pipeline - -### Phase 3: Integration & Optimization (Weeks 9-12) -- End-to-end pipeline integration -- Performance optimization (GPU, parallelization) -- User fallback interface - -### Phase 4: Testing & Validation (Weeks 13-16) -- Comprehensive testing (all test cases) -- Real-world validation flights -- Performance tuning - -### Phase 5: Deployment (Weeks 17-18) -- Documentation -- Deployment setup -- Training materials - -## 5. Risk Mitigation - -| Risk | Mitigation | -|------|------------| -| Google Maps outdated | Multiple satellite sources, manual verification | -| GPU unavailable | CPU fallback with SIFT | -| Sharp turns | Automatic satellite matching trigger | -| Featureless terrain | Reduced keypoint threshold, larger search radius | -| Processing time > 2s | Adaptive LightGlue, parallel processing | -| Poor lighting | Image enhancement preprocessing | - -## 6. References & Resources - -**Key Papers:** -- SuperPoint: Self-Supervised Interest Point Detection and Description (DeTone et al., 2018) -- LightGlue: Local Feature Matching at Light Speed (Lindenberger et al., 2023) -- CVM-Net: Cross-View Matching Network (Hu et al., 2018) -- COLMAP: Structure-from-Motion Revisited (Schönberger et al., 2016) - -**Software & Libraries:** -- COLMAP: https://colmap.github.io/ -- Kornia: https://kornia.readthedocs.io/ -- Hierarchical Localization: https://github.com/cvg/Hierarchical-Localization -- LightGlue: https://github.com/cvg/LightGlue - -This solution provides a robust, scalable approach that meets all acceptance criteria while leveraging state-of-the-art computer vision and deep learning techniques. 
\ No newline at end of file diff --git a/docs/01_solution/01_solution_draft_google.md b/docs/01_solution/01_solution_draft_google.md deleted file mode 100644 index c1726c7..0000000 --- a/docs/01_solution/01_solution_draft_google.md +++ /dev/null @@ -1,271 +0,0 @@ -# **Analysis and Proposed Architecture for a Hybrid Visual-Geodetic Localization System** - -## **Part 1: Product Solution Description: The GOLS (Geolocational Odometry & Localization System)** - -### **1.1. Executive Summary: System Concept and Mission** - -This report details the technical architecture for the "Geolocational Odometry & Localization System" (GOLS), a high-precision, offline processing software suite designed to solve a complex georeferencing problem. The system's primary mission is to ingest a chronologically-ordered sequence of high-resolution aerial images (e.g., AD000001.jpg to AD001500.jpg) captured from a fixed-wing Unmanned Aerial Vehicle (UAV) and, using *only* the known GPS coordinate of the first image 1, reconstruct the complete and precise geodetic location of the entire flight. - -The system's outputs are twofold: - -1. A high-fidelity, georeferenced 6-Degrees-of-Freedom (6-DoF) pose (comprising Latitude, Longitude, Altitude, Roll, Pitch, and Yaw) for every valid image in the input sequence. -2. An on-demand query function to determine the precise WGS84 GPS coordinates (latitude, longitude, altitude) of any object, identified by its pixel coordinates (u, v), within any of these successfully georeferenced images. - -GOLS is architected as a hybrid, multi-modal system. It is designed to overcome the two fundamental and coupled challenges inherent to this problem: - -1. **Scale Ambiguity:** Monocular Visual Odometry (VO) or Simultaneous Localization and Mapping (SLAM) systems are incapable of determining the true scale of the world from a single camera feed.2 The 100-meter distance between photos is a guideline, but cannot be used as a rigid constraint to solve this ambiguity. 
-2. **Cumulative Drift:** All relative-positioning systems accumulate small errors over time, causing the estimated trajectory to "drift" from the true geodetic path.4 - -To solve these, the GOLS architecture fuses high-frequency, relative motion estimates (derived from frame-to-frame image analysis) with low-frequency, *absolute* geodetic "anchor points" derived from matching the UAV's imagery against an external satellite map provider (Google Maps).5 - -### **1.2. Core Technical Principle: A Hybrid Fusion Approach** - -The system's core philosophy is founded on the understanding that no single, monolithic algorithm can meet the severe operational constraints and stringent acceptance criteria. The problem as defined sits at the intersection of three challenging computer vision domains. - -* A standard **Visual SLAM** system (e.g., ORB-SLAM3 8) would fail. The target environment (Eastern and Southern Ukraine) and the provided sample images (Images 1-9) are dominated by natural, low-texture terrain such as fields, shrubbery, and dirt roads. Feature-based SLAM systems are notoriously unreliable in such "textureless" areas.10 -* A standard **Structure from Motion (SfM)** pipeline (e.g., COLMAP 13) would also fail. The constraints explicitly state "sharp turns" with less than 5% image-to-image overlap and potential "350m outlier" photos. Traditional SfM approaches require significant image overlap and will fail to register images across these gaps.14 - -Therefore, GOLS is designed as a modular, graph-based optimization framework.15 It separates the problem into three parallel "front-end" modules that generate "constraints" (i.e., measurements) and one "back-end" module that fuses all measurements into a single, globally consistent solution. - -1. **Module 1: Visual Odometry (VO) Front-End:** This module computes high-frequency, *relative* frame-to-frame motion (e.g., "frame 2 is 98.2m forward and 1.8° right of frame 1"). 
This provides the dense "shape" of the trajectory but is unscaled and prone to drift. -2. **Module 2: Wide-Baseline SfM Front-End:** This module computes low-frequency, *non-sequential relative* matches (e.g., "frame 50 contains the same building as frame 5"). Its sole purpose is to bridge large gaps in the trajectory caused by sharp turns (Acceptance Criterion 4) or sensor outliers (Acceptance Criterion 3). -3. **Module 3: Cross-View Georeferencing (CVG) Front-End:** This module computes low-frequency, *absolute* pose estimates (e.g., "frame 100 is at 48.27° N, 37.38° E, at 1km altitude"). It does this by matching UAV images to the georeferenced satellite map.5 This module provides the *absolute scale* and the *GPS anchors* necessary to eliminate drift and meet the \<20m accuracy criteria. -4. **Module 4: Back-End Global Optimizer:** This module fuses all constraints from the other three modules into a pose graph and solves it, finding the 6-DoF pose for every image that best-satisfies all (often conflicting) measurements. - -### **1.3. Addressing Key Problem Constraints & Acceptance Criteria** - -This hybrid architecture is specifically designed to meet each acceptance criterion: - -* **Criteria 1 & 2 (80% \< 50m, 60% \< 20m error):** Solved by the **CVG Front-End (Module 3)**. By providing sparse, absolute GPS fixes, this module "anchors" the pose graph. The back-end optimization (Module 4) propagates this absolute information across the entire trajectory, correcting the scale and eliminating the cumulative drift.4 -* **Criteria 3 & 4 (350m outlier, \<5% overlap on turns):** Solved by the **Wide-Baseline SfM Front-End (Module 2)**. This module will use state-of-the-art (SOTA) deep-learning-based feature matchers (analyzed in 2.1.2) that are designed for "wide-baseline" or "low-overlap" scenarios.18 These matchers can find correspondences where traditional methods fail, allowing the system to bridge these gaps. 
-* **Non-Stabilized Camera Constraint:** The fixed, non-stabilized camera on a fixed-wing platform will induce severe roll/pitch and motion blur. This is handled by two components: (1) A specialized VO front-end (Module 1) that is photometrically robust 20, and (2) aggressive RANSAC (RANdom SAmple Consensus) 21 outlier rejection at every matching stage to discard false correspondences caused by motion blur or extreme perspective distortion. -* **Criterion 7 (\< 2s/image):** Solved by a multi-threaded, asynchronous architecture and **GPU acceleration**.22 The performance criterion is interpreted as *average throughput*, not *latency*. The fast VO front-end (Module 1) provides an initial pose, while the computationally expensive Modules 2, 3, and 4 run on a GPU (e.g., using CUDA 24) in parallel, asynchronously refining the global solution. -* **Criteria 8 & 9 (Reg. Rate > 95%, MRE \< 1.0px):** These criteria are achieved by the combination of the front-ends and back-end. The >95% **Registration Rate** 25 is achieved by the *high-recall* front-ends (Module 1 + 2). The \<1.0 pixel **Mean Reprojection Error** 26 is the explicit optimization target of the **Back-End Global Optimizer (Module 4)**, which performs a global Bundle Adjustment (BA) to minimize this exact metric.27 - -## **Part 2: System Architecture and State-of-the-Art Analysis** - -### **2.1. SOTA Foundational Analysis: Selecting the Core Algorithmic Components** - -The GOLS architecture is a composition of SOTA components, each selected to solve a specific part of the problem. - -#### **2.1.1. Front-End Strategy 1 (VO): Feature-Based (ORB-SLAM) vs. Direct (DSO)** - -The primary relative VO front-end (Module 1) must be robust to the UAV's fast motion and the environment's visual characteristics. - -* **Feature-Based Methods (e.g., ORB-SLAM):** Systems like ORB-SLAM3 8 are highly successful general-purpose SLAM systems. 
They operate by detecting sparse, repeatable features (ORB) and matching them between frames. However, their primary weakness is a reliance on "good features." Research and practical application show that ORB-SLAM suffers from "tracking loss in textureless areas".10 The provided sample images (e.g., Image 1, 2, 8, 9) are dominated by homogeneous fields and sparse vegetation—a worst-case scenario for feature-based methods. -* **Direct Methods (e.g., DSO):** Systems like Direct Sparse Odometry (DSO) 20 or LSD-SLAM 31 operate on a different principle. They "discard the feature extractor and directly utilize the pixel intensity" 32 by minimizing the *photometric error* (difference in pixel brightness) between frames. -* **Analysis and Decision:** For this specific problem, a **Direct** method is superior. - 1. DSO "does not depend on keypoint detectors or descriptors".30 It "can naturally sample pixels from across all image regions that have intensity gradient, including edges or smooth intensity variations".20 - 2. This makes it *ideal* for the target environment. The edge of a dirt track (Image 4), the boundary between two fields (Image 1), or the shadow of a bush (Image 2) provide strong, usable gradients for DSO, whereas they contain no "corners" for ORB. - 3. Furthermore, the non-stabilized camera will cause rapid changes in auto-exposure and vignetting. DSO's formulation "integrates a full photometric calibration, accounting for exposure time, lens vignetting, and non-linear response functions".20 ORB-SLAM's assumption of uniform motion and lighting is more easily violated.34 - -Therefore, the GOLS **Module 1: Relative Odometry Front-End** will be based on **Direct Sparse Odometry (DSO)**. - -#### **2.1.2. Front-End Strategy 2 (Matching): Traditional (SIFT) vs. Deep Learning (SuperGlue)** - -Modules 2 (Wide-Baseline SfM) and 3 (CVG) both rely on a "wide-baseline" feature matcher—an algorithm that can match two images with very different viewpoints. 
- -* **The Challenge:** The system must match (1) UAV-to-UAV across a "sharp turn" with \<5% overlap (AC 4) and (2) UAV-to-Satellite, which is an *extreme* cross-view matching problem with differences in scale, sensor, illumination, and time (due to outdated maps).35 -* **Traditional Matchers (e.g., SIFT, ORB):** Classic methods like SIFT 36 are robust to scale and rotation, but they fail decisively in cross-view scenarios.37 They produce very few valid matches, which are then almost entirely eliminated by RANSAC filtering.35 -* **Deep Learning Matchers (e.g., SuperGlue):** The SOTA solution is a combination of the **SuperPoint** feature detector 38 and the **SuperGlue** matcher.19 SuperGlue is not a simple matcher; it is a Graph Neural Network (GNN) that performs "context aggregation, matching, and filtering" simultaneously.19 It "establishes global context perception" 38 by learning the geometric priors of 3D-to-2D projection.39 -* **Analysis and Decision:** - 1. For both the wide-baseline UAV-to-UAV case and the extreme cross-view UAV-to-satellite case, **SuperPoint + SuperGlue** is the selected technology. - 2. Multiple studies confirm that SuperPoint+SuperGlue significantly outperforms traditional methods (SIFT/ORB) and other learning-based methods (LoFTR) for UAV-to-map and UAV-to-satellite registration.35 - 3. Its ability to match based on *global context* rather than just local patch appearance makes it robust to the sparse-texture scenes 41 and extreme viewpoint changes that define this problem. - -Therefore, GOLS **Module 2 (Wide-Baseline SfM)** and **Module 3 (CVG)** will be built upon the **SuperPoint + SuperGlue** matching pipeline. - -#### **2.1.3. Back-End Strategy: Monolithic (COLMAP) vs. Hybrid (CVD-SfM) vs. Graph Fusion (GTSAM)** - -The back-end (Module 4) must fuse the *heterogeneous* constraints from Modules 1, 2, and 3\. 
- -* **Monolithic SfM (e.g., COLMAP):** A classic incremental SfM pipeline.14 As noted, "Traditional SfM pipelines (COLMAP, OpenMVG) struggle with large viewpoint differences, failing to match enough cross-view features".13 This approach will fail. -* **Hybrid SfM (e.g., CVD-SfM):** A very recent SOTA approach (IROS 2025) is CVD-SfM, a "Cross-View Deep Front-end Structure-from-Motion System".14 This system is *designed* to integrate cross-view (e.g., satellite) priors and deep features into a unified SfM pipeline 45 and is shown to achieve higher registration "coverage" than COLMAP on sparse, multi-altitude datasets.13 -* **Graph Fusion (e.g., g2o / GTSAM):** This approach, common in robotics, separates front-end measurement from back-end optimization.46 It uses a library like g2o (General Graph Optimization) 47 or GTSAM (Georgia Tech Smoothing and Mapping) 49 to build a *factor graph*.50 -* **Analysis and Decision:** - 1. While CVD-SfM 44 is highly relevant, it is a monolithic system designed for a specific problem. - 2. Our problem requires fusing *three different types* of constraints: (1) fast, unscaled *photometric* constraints from DSO, (2) sparse, *feature-based relative* constraints from SuperGlue (UAV-to-UAV), and (3) sparse, *feature-based absolute* constraints from SuperGlue (UAV-to-Satellite). - 3. A general-purpose optimization framework is more flexible and robust. **GTSAM** 49 is a SOTA C++ library based on factor graphs, which are explicitly designed for multi-sensor fusion.4 - 4. We can define custom "factors" for each of our three constraint types.53 - 5. Crucially, GTSAM includes the **iSAM2** solver 55, an *incremental* smoothing and mapping algorithm. This allows the back-end graph to be efficiently updated as new (and slow-to-compute) satellite "anchor" constraints arrive from Module 3, without re-solving the entire 3000-image graph from scratch. This incremental nature is essential for meeting the \< 2s/image performance criterion (AC 7). 
- -Therefore, the GOLS **Module 4: Back-End Global Optimizer** will be implemented using the **GTSAM** factor graph library. - -### **2.2. Proposed GOLS Architecture: A Multi-Front-End, Factor-Graph System** - -The GOLS system is an asynchronous, multi-threaded application built on the four selected modules. - -#### **2.2.1. Module 1: The Relative Odometry Front-End (DSO-VO)** - -* **Purpose:** To generate a high-frequency, low-drift, but *unscaled* estimate of the UAV's trajectory. -* **Algorithm:** Direct Sparse Odometry (DSO).20 -* **Workflow:** - 1. This module runs in a high-priority thread, processing images sequentially (N, N+1). - 2. It minimizes the photometric error between the frames to estimate the relative 6-DoF pose transformation T(N, N+1). - 3. This module runs fastest, providing the initial "dead-reckoning" path. The *scale* of this path is initially unknown (or bootstrapped by Module 3). -* **Output:** A high-frequency stream of Odometry Factors (binary constraints between Pose_N and Pose_N+1) 49 is published to the Back-End (Module 4). - -#### **2.2.2. Module 2: The Wide-Baseline & Outlier Front-End (SG-SfM)** - -* **Purpose:** To find non-sequential "loop closures" or "shortcuts" to correct for drift and re-establish tracking after sharp turns (AC 4) or outliers (AC 3). -* **Algorithm:** SuperPoint 38 + SuperGlue.19 -* **Workflow:** - 1. This module runs asynchronously in a lower-priority GPU thread. - 2. It does *not* compare N to N+1. Instead, it compares Image N to a "sliding window" of non-adjacent frames (e.g., N-50...N-10 and N+10...N+50). - 3. **Handling AC 3 (350m Outlier):** If Image N+1 is an outlier (e.g., a sudden tilt causes a 350m ground shift), Module 1 will fail to match N to N+1. This module, however, will continue to search and will successfully match N to N+2 (or N+3, etc.), *bridging* the outlier. The outlier frame N+1 becomes an un-registered "island" in the pose graph, which is permissible under AC 6 (the 20% allowance). 
- 4. **Handling AC 4 (Sharp Turn):** Similarly, if a sharp turn breaks Module 1's tracking due to \<5% overlap, this module will find a wide-baseline match, creating a constraint T(N, N+k) that bridges the turn. -* **Output:** A low-frequency stream of Loop Closure Factors (binary constraints between Pose_N and Pose_N+k) 54 is published to the Back-End. - -#### **2.2.3. Module 3: The Absolute Georeferencing Front-End (CVG)** - -* **Purpose:** To provide *absolute scale* and *global GPS coordinates* to anchor the entire graph and eliminate drift, thereby meeting AC 1 and AC 2. -* **Algorithm:** SuperPoint + SuperGlue 42 + RANSAC 21 + Google Maps API. -* **Workflow:** - 1. **Initialization:** This module runs first. It takes AD000001.jpg and its known GPS coordinate.1 It fetches the corresponding satellite tile from Google Maps. It performs a SuperPoint+SuperGlue match.41 From this match, it calculates the 6-DoF pose of the first frame and, most importantly, the initial **scale** (Ground Sampling Distance, GSD, in meters/pixel). This GSD is used to provide an initial scale to the DSO module (Module 1). - 2. **Asynchronous Anchoring:** This module runs in a background GPU thread, activating on a sparse subset of images (e.g., every 25th frame, or when Module 4 reports high pose uncertainty). - 3. For a target Image N, it uses the *current best estimate* of its pose (from the GTSAM graph) to fetch the relevant satellite tile. - 4. It performs a SuperGlue UAV-to-satellite match.42 - 5. **Handling Outdated Maps:** The constraint "Google Maps...could be...outdated" 56 is a major challenge. This architecture is robust to it. Because SuperGlue 38 is a *feature-based* matcher, it matches persistent geometric features (road intersections, building corners 35, field boundaries) that are stable over time. 
It is not confused by temporal, non-geometric changes (e.g., different seasons, presence/absence of cars, new/destroyed small structures) that would foil a dense or semantic matcher (like CVM-Net 57). - 6. **Outlier Rejection (AC 5):** The match is validated with a robust RANSAC.21 If the number of inlier matches is too low or the reprojection error is too high (indicating a failed or poor match, e.g., due to clouds or extreme map changes), the match is *discarded*.58 This ensures the "Number of outliers during...ground check" is \< 10%. -* **Output:** A low-frequency stream of Absolute Pose Factors (unary, or "GPS" constraints) 54 is published to the Back-End. - -#### **2.2.4. Module 4: The Back-End Global Optimizer (GTSAM-PGO)** - -* **Purpose:** To fuse all constraints from Modules 1, 2, and 3 into a single, globally-consistent 6-DoF trajectory that is scaled and georeferenced. -* **Algorithm:** GTSAM (Georgia Tech Smoothing and Mapping) 49 using an iSAM2 (incremental Smoothing and Mapping) solver.55 -* **Workflow:** - 1. The system maintains a factor graph 51 where *nodes* are the unknown 6-DoF poses of each camera and *edges* (factors) are the constraints (measurements) from the front-ends. - 2. **Factor Types:** - * **Unary Factor:** A high-precision prior on Pose_1 (from the 1 input). - * **Unary Factors:** The sparse, lower-precision Absolute Pose Factors from Module 3\.54 - * **Binary Factors:** The dense, high-precision, but unscaled Odometry Factors from Module 1\.53 - * **Binary Factors:** The sparse, high-precision Loop Closure Factors from Module 2\.54 - 3. **Optimization:** The iSAM2 solver 55 runs continuously, finding the set of 6-DoF poses that *minimizes the error* across all factors simultaneously (a non-linear least-squares problem). This optimization process: - * *Fixes Scale:* Uses the absolute measurements from Module 3 to find the single "scale" parameter that best fits the unscaled DSO measurements from Module 1\. 
- * *Corrects Drift:* Uses the "loop closures" from Module 2 and "GPS anchors" from Module 3 to correct the cumulative drift from Module 1\. -* **Output:** The final, optimized 6-DoF pose for all registered images. This final optimized structure is designed to meet the **MRE \< 1.0 pixels** criterion (AC 9).28 - -### **2.3. Sub-System: Object-Level Geolocation (Photogrammetric Ray-Casting)** - -The second primary user requirement is to find the GPS coordinates of "any object" (pixel) in an image. This is a standard photogrammetric procedure 59 that is *only* solvable *after* Module 4 has produced a high-fidelity 6-DoF pose for the image. - -#### **2.3.1. Prerequisite 1: Camera Intrinsic Calibration** - -The system must be provided with the camera's *intrinsic parameters* (focal length, principal point (cx, cy), and distortion coefficients k1, k2...). These must be pre-calibrated (e.g., using a checkerboard) and provided as an input file.28 - -#### **2.3.2. Prerequisite 2: Digital Elevation Model (DEM) Acquisition** - -To find where a ray from the camera hits the ground, a 3D model of the ground itself is required. - -* **SOTA Analysis:** Several free, global DEMs are available, including SRTM 60 and Copernicus DEM.60 -* **Decision:** The system will use the **Copernicus GLO-30 DEM**.63 -* **Rationale:** - 1. **Type:** Copernicus is a **Digital Surface Model (DSM)**, meaning it "represents the surface of the Earth including buildings, infrastructure and vegetation".63 This is *critically important*. The sample images (e.g., Image 5, 6, 7) clearly show buildings and trees. If a user clicks on a building rooftop, a DSM will return the correct (high) altitude. A Digital *Terrain* Model (DTM) like SRTM would return the altitude of the "bare earth" *under* the building, which would be incorrect. - 2. **Accuracy:** Copernicus GLO-30 (data 2011-2015) is a more modern and higher-fidelity dataset than the older SRTM (data 2000). 
It has "minimized data voids" and "improved the vertical accuracy".65 -* **Implementation:** The GOLS system will automatically download the required Copernicus GLO-30 tiles (in GeoTIFF format) for the flight's bounding box from an open-data source (e.g., the Copernicus S3 bucket 60). - -#### **2.3.3. Geolocation via Photogrammetric Ray-Casting** - -* **Algorithm:** This process is known as ray-casting.69 - 1. **Input:** Image AD0000X.jpg, pixel coordinate (u, v). - 2. **Load:** The optimized 6-DoF pose Pose_X (from Module 4) and the camera's intrinsic parameters. - 3. **Load:** The Copernicus GLO-30 DEM, loaded as a 3D mesh (e.g., using rasterio 71). - 4. **Un-project:** Using the camera intrinsics, convert the 2D pixel (u, v) into a 3D ray vector R in the camera's local coordinate system. - 5. **Transform:** Use the 6-DoF pose Pose_X to transform the ray's origin O (the camera's 3D position) and vector R into the global (WGS84) coordinate system. - 6. **Intersect:** Compute the 3D intersection point P(lat, lon, alt) where the global ray (O, R) intersects the 3D mesh of the DEM.72 - 7. **Output:** P(lat, lon, alt) is the precise GPS coordinate of the object at pixel (u, v). -* **Libraries:** This sub-system will be implemented in Python, using the rasterio library 71 for DEM I/O and a library like trimesh 75 or pyembree 73 for high-speed ray-mesh intersection calculations. - -### **2.4. Performance and Usability Architecture** - -#### **2.4.1. Hardware Acceleration (Criterion 7: \< 2s/image)** - -The \< 2s/image criterion is for *average throughput*. A 1500-image flight must complete in \< 3000 seconds (50 minutes). This is aggressive. - -* **Bottlenecks:** The deep learning front-ends (Module 2 & 3: SuperPoint/SuperGlue) 19 and the graph optimization (Module 4) are the bottlenecks. -* **Solution:** These modules *must* be GPU-accelerated.22 - 1. **Module 1 (DSO):** Natively fast, CPU-bound. - 2. 
**Module 2/3 (SuperGlue):** Natively designed for GPU execution.19 Running SuperGlue on a CPU would take tens of seconds *per match*, catastrophically failing the performance requirement. - 3. **Module 4 (GTSAM):** Can be compiled with CUDA support for GPU-accelerated solver steps. -* **Implementation:** The system will be built on a pub/sub framework (e.g., ROS 2, or a custom C++/Python framework) to manage the asynchronous-threaded architecture. A high-end NVIDIA GPU (e.g., RTX 30-series or 40-series) is a hard requirement for this system. NVIDIA's Isaac ROS suite provides GPU-accelerated VSLAM packages 24 that can serve as a reference for this implementation. - -#### **2.4.2. User-in-the-Loop Failsafe (Criterion 6)** - -The system must handle "absolute incapable" scenarios (e.g., flying over a large, textureless, featureless body of water) where all modules fail for 3+ consecutive images. - -* **Workflow:** - 1. **Monitor:** The Back-End (Module 4) monitors the queue of unregistered frames. - 2. **Trigger:** If 3 consecutive frames (e.g., N, N+1, N+2) fail all front-end checks (DSO tracking lost, SuperGlue-UAV fails, SuperGlue-Sat fails), the system pauses processing. - 3. **GUI Prompt:** A GUI is presented to the user. - * *Left Pane:* Shows the last known *good* image (e.g., N-1) with its estimated position on the satellite map. - * *Right Pane:* Shows the first *failed* image (e.g., N). - * *Map Pane:* An interactive Google Maps interface centered on the last known good location. - 4. **User Action:** The user must manually find a recognizable landmark in Image N (e.g., "that distinct river bend") and click its corresponding location on the map. - 5. **Recovery:** The user's click generates a new, low-precision Absolute Pose Factor. This new "anchor" is inserted into the GTSAM graph 49, which re-optimizes and attempts to re-start the entire processing pipeline from frame N. 
- -## **Part 3: Testing Strategy and Validation Plan** - -A comprehensive test suite is required to validate every acceptance criterion. The provided coordinates.csv file 1 will serve as the "ground truth" trajectory for validation.78 - -**Core Validation Metrics:** - -* **Absolute Trajectory Error (ATE):** The geodetic (Haversine) distance, in meters, between the system's estimated pose (lat, lon) for an image and the ground truth (lat, lon) from coordinates.csv.34 This is the primary metric for positional accuracy. -* **Mean Reprojection Error (MRE):** The average pixel distance between an observed 2D feature and its corresponding 3D map point re-projected back into the camera.26 This is the primary metric for the internal 3D consistency of the reconstruction.28 -* **Image Registration Rate (IRR):** The percentage of images for which the system successfully computes a 6-DoF pose.25 - -### **Table 3.1: Acceptance Criteria Test Matrix** - -| Criterion ID | Criterion Description | Test Case | Test Metric | Pass/Fail Threshold | -| :---- | :---- | :---- | :---- | :---- | -| **AC-1** | 80% of photos \< 50m error | **TC-1**: Baseline Accuracy | COUNT(ATE \< 50m) / TOTAL_IMAGES | > 0.80 | -| **AC-2** | 60% of photos \< 20m error | **TC-1**: Baseline Accuracy | COUNT(ATE \< 20m) / TOTAL_IMAGES | > 0.60 | -| **AC-3** | Handle 350m outlier | **TC-2**: Outlier Robustness | ATE for post-outlier frames | ATE remains within AC-1/2 spec. | -| **AC-4** | Handle sharp turns (\<5% overlap) | **TC-3**: Low-Overlap Robustness | IRR for frame *after* the turn | > 95% (must register) | -| **AC-5** | Satellite check outliers \< 10% | **TC-4**: Back-End Residual Analysis | COUNT(Bad_Factors) / COUNT(Sat_Factors) | \< 0.10 | -| **AC-6** | User-in-the-Loop Failsafe | **TC-5**: Failsafe Trigger | System state | GUI prompt must appear. 
| -| **AC-7** | \< 2 seconds for processing one image | **TC-6**: Performance Benchmark | (Total Wall Time) / TOTAL_IMAGES | \< 2.0 seconds | -| **AC-8** | Image Registration Rate > 95% | **TC-1**: Baseline Accuracy | IRR \= (Registered_Images / TOTAL_IMAGES) | > 0.95 | -| **AC-9** | Mean Reprojection Error \< 1.0 pixels | **TC-1**: Baseline Accuracy | MRE from Back-End (Module 4) | \< 1.0 pixels | - -### **3.1. Test Case 1 (TC-1): Baseline Positional Accuracy (Validates AC-1, AC-2, AC-8, AC-9)** - -* **Procedure:** Process the full sample image set (e.g., AD000001...AD000060) and the coordinates.csv ground truth.1 The system is given *only* the GPS for AD000001.jpg. -* **Metrics & Validation:** - 1. The system's output trajectory is aligned with the ground truth trajectory 1 using a Sim(3) transformation (to account for initial scale/orientation alignment). - 2. **(AC-8)** Calculate the Image Registration Rate (IRR). Pass if IRR > 0.95.25 - 3. **(AC-9)** Extract the final MRE from the GTSAM back-end.26 Pass if MRE \< 1.0 pixels.28 - 4. **(AC-1, AC-2)** Calculate the ATE (in meters) for every successfully registered frame. Calculate the 80th and 60th percentiles. Pass if 80th percentile is \< 50.0m and 60th percentile is \< 20.0m. - -### **3.2. Test Case 2 (TC-2): 350m Outlier Robustness (Validates AC-3)** - -* **Procedure:** Create a synthetic dataset. From the sample set, use AD000001...AD000030. Insert a single, unrelated image (e.g., from a different flight, or AD000060) as frame AD000031_outlier. Append the real AD000031...AD000060 (renamed). -* **Validation:** - 1. The system must process the full set. - 2. The IRR (from TC-1) for *valid* frames must remain > 95%. The outlier frame AD000031_outlier should be correctly rejected. - 3. The ATE for frames AD000031 onward must not be significantly degraded and must remain within the AC-1/2 specification. This validates that Module 2 (SG-SfM) successfully "bridged" the outlier by matching AD000030 to AD000031. 
- -### **3.3. Test Case 3 (TC-3): Sharp Turn / Low Overlap Robustness (Validates AC-4)** - -* **Procedure:** Create a synthetic dataset by removing frames to simulate a sharp turn. Based on the coordinates in 1, the spatial and angular gap between AD000031 and AD000035 will be large, and overlap will be minimal. -* **Validation:** - 1. The system must successfully register frame AD000035. - 2. The ATE for the entire trajectory must remain within AC-1/2 specification. This directly tests the wide-baseline matching of Module 2 (SG-SfM).19 - -### **3.4. Test Case 4 (TC-4): Satellite Ground Check Fidelity (Validates AC-5)** - -* **Procedure:** During the execution of TC-1, log the state of every Absolute Pose Factor generated by Module 3 (CVG). -* **Metric:** An "outlier factor" is defined as a satellite-match constraint whose final, optimized error *residual* in the GTSAM graph exceeds a fixed chi-squared 95% significance threshold (an absolute criterion, which keeps the \< 10% pass threshold below from being circular). A high residual means the optimizer is "fighting" this constraint, indicating a bad match (e.g., RANSAC failure 58 or a poor-quality match on an outdated map 54). -* **Validation:** Pass if the total count of these outlier factors is \< 10% of the total number of satellite-match attempts. - -### **3.5. Test Case 5 (TC-5): Failsafe Mechanism Trigger (Validates AC-6)** - -* **Procedure:** Create a synthetic dataset. Insert 4 consecutive "bad" frames (e.g., solid black images, solid white images, or images from a different continent) into the middle of the sample set (e.g., at AD000030). -* **Validation:** - 1. The system must detect 3 consecutive registration failures (30, 31, 32). - 2. Pass if the system automatically pauses processing and successfully displays the "User-in-the-Loop" GUI described in section 2.4.2. - -### **3.6. Test Case 6 (TC-6): System Performance (Validates AC-7)** - -* **Procedure:** On the specified target hardware (including required NVIDIA GPU), time the *total wall-clock execution* of TC-1 using the full 1500-image dataset.
-* **Metric:** (Total Wall Clock Time in seconds) / (Total Number of Images). -* **Validation:** Pass if the average processing time per image is \< 2.0 seconds. \ No newline at end of file diff --git a/docs/01_solution/01_solution_draft_perplexity.md b/docs/01_solution/01_solution_draft_perplexity.md deleted file mode 100644 index 0af0779..0000000 --- a/docs/01_solution/01_solution_draft_perplexity.md +++ /dev/null @@ -1,1125 +0,0 @@ -# UAV Aerial Image Geolocalization System: Solution Draft - -## Executive Summary - -This document presents a comprehensive solution for determining GPS coordinates of aerial image centers and objects within images captured by fixed-wing UAVs flying at altitudes up to 1km over eastern/southern Ukraine. The system leverages structure-from-motion (SfM), visual odometry, and satellite image cross-referencing to achieve sub-50-meter accuracy for 80% of images while maintaining registration rates above 95%. - ---- - -## 1. Problem Analysis - -### 1.1 Key Constraints & Challenges -- **No onboard GPS/GNSS receiver** (system must infer coordinates) -- **Fixed downward-pointing camera** (non-stabilized, subject to aircraft pitch/roll) -- **Up to 3000 images per flight** at 100m nominal spacing (variable due to aircraft dynamics) -- **Altitude ≤ 1km** with resolution up to 6252×4168 pixels -- **Sharp turns possible** causing image overlaps <5% or complete loss -- **Outliers possible**: 350m drift between consecutive images (aircraft tilt) -- **Time constraint**: <2 seconds processing per image -- **Real-world requirement**: Google Maps validation with <10% outliers - -### 1.2 Reference Dataset Analysis -The provided 29 sample images show: -- **Flight distance**: ~2.26 km ground path -- **Image spacing**: 66-202m (mean 119m), indicating ~100-200m altitude -- **Coverage area**: ~1.1 km × 1.6 km -- **Geographic region**: Eastern Ukraine (east of Dnipro, Kherson/Zaporozhye area) -- **Terrain**: Mix of agricultural fields and scattered vegetation - 
-### 1.3 Acceptance Criteria Summary -| Criterion | Target | -|-----------|--------| -| 80% of images within 50m error | Required | -| 60% of images within 20m error | Required | -| Handle 350m outlier drift | Graceful degradation | -| Image Registration Rate | >95% | -| Mean Reprojection Error | <1.0 pixels | -| Processing time/image | <2 seconds | -| Outlier rate (satellite check) | <10% | -| User interaction fallback | For unresolvable 20% | - ---- - -## 2. State-of-the-Art Solutions - -### 2.1 Current Industry Standards - -#### **A. OpenDroneMap (ODM)** -- **Strengths**: Open-source, parallelizable, proven at scale (2500+ images) -- **Pipeline**: OpenSfM (feature matching/tracking) → OpenMVS (dense reconstruction) → GDAL (georeferencing) -- **Weaknesses**: Requires GCPs for absolute georeferencing; computational cost (recommends 128GB RAM); doesn't handle GPS-denied scenarios without external anchors -- **Typical accuracy**: Meter-level without GCPs; cm-level with GCPs - -#### **B. COLMAP** -- **Strengths**: Incremental SfM with robust bundle adjustment; excellent reprojection error (typically <0.5px) -- **Application**: Academic gold standard; proven on large multi-view datasets -- **Limitations**: Requires good initial seed pair; can fail with low overlap; computational cost for online processing -- **Relevance**: Core algorithm suitable as backbone for this application - -#### **C. AliceVision/Meshroom** -- **Strengths**: Modular photogrammetry framework; feature-rich; GPU-accelerated -- **Features**: Robust feature matching, multi-view stereo, camera tracking -- **Challenge**: Designed for batch processing, not real-time streaming - -#### **D. ORB-SLAM3** -- **Strengths**: Real-time monocular SLAM; handles rolling-shutter distortions; extremely fast -- **Relevant to**: Aerial video streams; can operate at frame rates -- **Limitation**: No absolute georeferencing without external anchors; drifts over long sequences - -#### **E. 
GPS-Denied Visual Localization (GNSS-Denied Methods)** -- **Deep Learning Approaches**: CLIP-based satellite-aerial image matching achieving 39m location error, 15.9° heading error at 100m altitude -- **Hierarchical Methods**: Coarse semantic matching + fine-grained feature refinement; tolerates oblique views -- **Advantage**: Works with satellite imagery as reference - -### 2.2 Feature Detector/Descriptor Comparison - -| Algorithm | Detection Speed | Matching Speed | Features | Robustness | Best For | -|-----------|-----------------|-----------------|----------|-----------|----------| -| **SIFT** | Slow | Medium | Scattered | Excellent | Reference, small scale | -| **AKAZE** | Fast | Fast | Moderate | Very Good | Real-time, scale variance | -| **ORB** | Very Fast | Very Fast | High | Good | Real-time, embedded systems | -| **SuperPoint** | Medium | Fast | Learned | Excellent | Modern DL pipelines | - -**Recommendation**: Hybrid approach using AKAZE for speed + SuperPoint for robustness in difficult scenes - ---- - -## 3. 
Proposed Architecture Solution - -### 3.1 High-Level System Design - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ UAV IMAGE STREAM │ -│ (Sequential, ≤100m spacing, 100-200m alt) │ -└──────────────────────────┬──────────────────────────────────────┘ - │ - ┌──────────────────┴──────────────────┐ - │ │ - ▼ ▼ -┌──────────────────────────┐ ┌──────────────────────────┐ -│ FEATURE EXTRACTION │ │ INITIALIZATION MODULE │ -│ ──────────────────── │ │ ────────────────── │ -│ • AKAZE keypoint detect │ │ • Assume starting GPS │ -│ • Multi-scale pyramids │ │ • Initial camera params │ -│ • Descriptor computation│ │ • Seed pair selection │ -└──────────────┬───────────┘ └──────────────┬───────────┘ - │ │ - │ ┌────────────────────────┘ - │ │ - ▼ ▼ - ┌──────────────────────────┐ - │ SEQUENTIAL MATCHING │ - │ ──────────────────── │ - │ • N-to-N+1 matching │ - │ • Epipolar constraint │ - │ • RANSAC outlier reject │ - │ • Essential matrix est. │ - └──────────────┬───────────┘ - │ - ┌────────┴────────┐ - │ │ - YES ▼ ▼ NO/DIFFICULT - ┌──────────────┐ ┌──────────────┐ - │ COMPUTE POSE │ │ FALLBACK: │ - │ ────────────│ │ • Try N→N+2 │ - │ • 8-pt alg │ │ • Try global │ - │ • Triangulate • Try satellite │ - │ • BA update │ │ • Ask user │ - └──────┬───────┘ └──────┬───────┘ - │ │ - └────────┬────────┘ - │ - ▼ - ┌──────────────────────────────┐ - │ BUNDLE ADJUSTMENT (Local) │ - │ ────────────────────────── │ - │ • Windowed optimization │ - │ • Levenberg-Marquardt │ - │ • Refine poses + 3D points │ - │ • Covariance estimation │ - └──────────────┬───────────────┘ - │ - ▼ - ┌──────────────────────────────┐ - │ GEOREFERENCING │ - │ ──────────────────────── │ - │ • Satellite image matching │ - │ • GCP integration (if avail)│ - │ • WGS84 transformation │ - │ • Accuracy assessment │ - └──────────────┬───────────────┘ - │ - ▼ - ┌──────────────────────────────┐ - │ OUTPUT & VALIDATION │ - │ ──────────────────────── │ - │ • Image center GPS coords │ - │ • Object/feature 
coords │ - │ • Confidence intervals │ - │ • Outlier flagging │ - │ • Google Maps cross-check │ - └──────────────────────────────┘ -``` - -### 3.2 Core Algorithmic Components - -#### **3.2.1 Initialization Phase** -**Input**: Starting GPS coordinate (or estimated from first visible landmarks) - -**Process**: -1. Load first image, extract AKAZE features at multiple scales -2. Establish camera intrinsic parameters: - - If known: use factory calibration or pre-computed values - - If unknown: assume standard pinhole model with principal point at image center - - Estimate focal length from image resolution: ~2.5-3.0 × image width (typical aerial lens) -3. Define initial local coordinate system: - - Origin at starting GPS coordinate - - Z-axis up, XY horizontal - - Project all future calculations to WGS84 at end - -**Output**: Camera matrix K, initial camera pose (R₀, t₀) - -#### **3.2.2 Sequential Image-to-Image Matching** -**Algorithm**: Incremental SfM with temporal ordering constraint - -``` -For image N in sequence: - 1. Extract AKAZE features from image N - 2. Match features with image N-1 using KNN with Lowe's ratio test - 3. RANSAC with 8-point essential matrix estimation: - - Iterate: sample 8 point correspondences - - Solve: SVD-based essential matrix E computation - - Score: inlier count (epipolar constraint |p'ᵀEp| < ε) - - Keep: best E with >50 inliers - 4. If registration fails (inliers <50 or insufficient quality): - - Attempt N to N+2 matching (skip frame) - - If still failing: request user input or flag as uncertain - 5. Decompose E to camera pose (R, t) with triangulation validation - 6. Triangulate 3D points from matched features - 7. Perform local windowed bundle adjustment (last 5 images) - 8. 
Compute image center GPS via local-to-global transformation -``` - -**Key Parameters**: -- AKAZE threshold: adaptive based on image quality -- Matching distance ratio: 0.7 (Lowe's test) -- RANSAC inlier threshold: 1.0 pixels -- Minimum inliers for success: 50 points -- Maximum reprojection error in BA: 1.5 pixels - -#### **3.2.3 Pose Estimation & Triangulation** -**5-Point Algorithm** (Stewenius et al.): -- Minimal solver for 5 point correspondences -- Returns up to 4 solutions for essential matrix -- Selects solution with maximum triangulated points in front of cameras -- Sample size: 5 correspondences per hypothesis (vs. 8 for the 8-point algorithm), so RANSAC needs far fewer iterations at a given outlier ratio - -**Triangulation**: -- Linear triangulation using DLT (Direct Linear Transform) -- For each matched feature pair: solve 4×4 system via SVD -- Filter: reject points with: - - Reprojection error > 1.5 pixels - - Behind either camera - - Altitude inconsistent with flight dynamics - -#### **3.2.4 Bundle Adjustment (Windowed)** -**Formulation**: -``` -minimize Σ ||p_i^(img) - π(X_i, P_cam)||² + λ·||ΔP_cam||² - -where: -- p_i^(img): observed pixel position -- X_i: 3D point coordinate -- P_cam: camera pose parameters -- π(): projection function -- λ: regularization weight -``` - -**Algorithm**: Sparse Levenberg-Marquardt with Schur complement -- Window size: 5-10 consecutive images (trade-off between accuracy and speed) -- Iteration limit: 10 (convergence typically in 3-5) -- Damping: adaptive μ (starts at 10⁻⁶) -- Covariance computation: from information matrix inverse - -**Complexity**: O(w³) where w = window size → ~0.3s for w=10 on modern CPU - -#### **3.2.5 Georeferencing Module** -**Challenge**: Converting local 3D structure to WGS84 coordinates - -**Approach 1 - Satellite Image Matching** (Primary): -1. Query Google Maps Static API for area around estimated location -2. Scale downloaded satellite imagery to match expected ground resolution -3. Extract ORB/SIFT features from satellite image -4. 
Match features between UAV nadir image and satellite image -5. Compute homography transformation (if sufficient overlap) -6. Estimate camera center GPS from homography -7. Validate: check consistency with neighboring images - -**Approach 2 - GCP Integration** (When available): -1. If user provides 4+ manually-identified GCPs in images with known coords: - - Use GCPs to establish local-to-global transformation - - 6-DOF rigid transformation (4 GCPs minimum) - - Refine with all available GCPs using least-squares -2. Transform all local coordinates via this transformation - -**Approach 3 - IMU/INS Integration** (If available): -1. If UAV provides gyro/accelerometer data: - - Integrate IMU measurements to constrain camera orientation - - Use IMU to detect anomalies (sharp turns, tilt) - - Fuse with visual odometry using Extended Kalman Filter (EKF) - - Improves robustness during low-texture sequences - -**Uncertainty Quantification**: -- Covariance matrix σ² from bundle adjustment -- Project uncertainty to GPS coordinates via Jacobian -- Compute 95% confidence ellipse for each image center -- Typical values: σ ≈ 20-50m initially, improves with satellite anchor - -#### **3.2.6 Fallback & Outlier Detection** -**Outlier Detection Strategy**: -1. **Local consistency check**: - - Compute velocity between consecutive images - - Flag if velocity changes >50% between successive intervals - - Expected velocity: ~10-15 m/s ground speed -2. **Satellite validation**: - - After full flight processing: retrieve satellite imagery - - Compare UAV image against satellite image at claimed coordinates - - Compute cross-correlation; flag if <0.3 -3. **Loop closure detection**: - - If imagery from later in flight matches earlier imagery: flag potential error - - Use place recognition (ORB vocabulary tree) to detect revisits -4. 
**User feedback loop**: - - Display flagged uncertain frames to operator - - Allow manual refinement for <20% of images - - Re-optimize trajectory using corrected anchor points - -**Graceful Degradation** (350m outlier scenario): -- Detect outlier via velocity threshold -- Attempt skip-frame matching (N to N+2, N+3) -- If fails, insert "uncertainty zone" marker -- Continue from next successfully matched pair -- Later satellite validation will flag this region for manual review - ---- - -## 4. Architecture: Detailed Module Specifications - -### 4.1 System Components - -#### **Component 1: Image Preprocessor** -``` -Input: Raw JPEG/PNG from UAV -Output: Normalized, undistorted image ready for feature extraction - -Operations: -├─ Load image (max 6252×4168) -├─ Apply lens distortion correction (if calibration available) -├─ Normalize histogram (CLAHE for uniform feature detection) -├─ Optional: Downsample for <2s latency (e.g., 3000×2000 if >4000×3000) -├─ Compute image metadata (filename, timestamp) -└─ Cache for access by subsequent modules -``` - -#### **Component 2: Feature Detector** -``` -Input: Preprocessed image -Output: Keypoints + descriptors - -Algorithm: AKAZE with multi-scale pyramids -├─ Pyramid levels: 4-6 (scale factor 1.2) -├─ FAST corner threshold: adaptive (target 500-1000 keypoints) -├─ BRIEF descriptor: rotation-aware, 256 bits -├─ Feature filtering: -│ ├─ Remove features in low-texture regions (variance <10) -│ ├─ Enforce min separation (8px) to avoid clustering -│ └─ Sort by keypoint strength (use top 2000) -└─ Output: vector, Mat descriptors (Nx256 uint8) -``` - -#### **Component 3: Feature Matcher** -``` -Input: Features from Image N-1, Features from Image N -Output: Vector of matched point pairs (inliers only) - -Algorithm: KNN matching with Lowe's ratio test + RANSAC -├─ BruteForceMatcher (Hamming distance for AKAZE) -├─ KNN search: k=2 -├─ Lowe's ratio test: d1/d2 < 0.7 -├─ RANSAC 5-point algorithm: -│ ├─ Iterations: min(4000, 10000 - 
100*inlier_count) -│ ├─ Inlier threshold: 1.0 pixels -│ ├─ Minimum inliers: 50 (lower to 30 for skip-frame matching) -│ └─ Success: inlier_ratio > 0.4 -├─ Triangulation validation (reject behind camera) -└─ Output: vector, Mat points3D (Mx3) -``` - -#### **Component 4: Pose Solver** -``` -Input: Essential matrix E from RANSAC, matched points -Output: Rotation matrix R, translation vector t - -Algorithm: E decomposition -├─ SVD decomposition of E -├─ Extract 4 candidate (R, t) pairs -├─ Triangulate points for each candidate -├─ Select candidate with max points in front of both cameras -├─ Recover scale using calibration (altitude constraint) -├─ Output: 4x4 transformation matrix T = [R t; 0 1] -``` - -#### **Component 5: Triangulator** -``` -Input: Keypoints from image 1, image 2; poses P1, P2; calib K -Output: 3D point positions, mask of valid points - -Algorithm: Linear triangulation (DLT) -├─ For each point correspondence (p1, p2): -│ ├─ Build 4×4 matrix from epipolar lines -│ ├─ SVD → solve for 3D point X -│ ├─ Validate: |p1 - π(X,P1)| < 1.5px AND |p2 - π(X,P2)| < 1.5px -│ ├─ Validate: X_z > 50m (min safe altitude above ground) -│ └─ Validate: X_z < 1500m (max altitude constraint) -└─ Output: Mat points3D (Mx3 float32), Mat validMask (Mx1 uchar) -``` - -#### **Component 6: Bundle Adjuster** -``` -Input: Poses [P0...Pn], 3D points [X0...Xm], observations -Output: Refined poses, 3D points, covariance matrices - -Algorithm: Sparse Levenberg-Marquardt with windowing -├─ Window size: 5 images (or fewer at flight start) -├─ Optimization variables: -│ ├─ Camera poses: 6 DOF per image (Rodrigues rotation + translation) -│ └─ 3D points: 3 coordinates per point -├─ Residuals: reprojection error in both images -├─ Iterations: max 10 (typically converges in 3-5) -├─ Covariance: -│ ├─ Compute Hessian inverse (information matrix) -│ ├─ Extract diagonal for per-parameter variances -│ └─ Per-image uncertainty: sqrt(diag(Cov[t])) -└─ Output: refined poses, points, Mat covariance 
(per image) -``` - -#### **Component 7: Satellite Georeferencer** -``` -Input: Current image, estimated center GPS (rough), local trajectory -Output: Refined GPS coordinates, confidence score - -Algorithm: Satellite image matching -├─ Query Google Maps API: -│ ├─ Coordinates: estimated_gps ± 200m -│ ├─ Resolution: match UAV image resolution (1-2m GSD) -│ └─ Zoom level: 18-20 -├─ Image preprocessing: -│ ├─ Scale satellite image to ~same resolution as UAV image -│ ├─ Convert to grayscale -│ └─ Equalize histogram -├─ Feature matching: -│ ├─ Extract ORB features from both images -│ ├─ Match with BruteForceMatcher -│ ├─ Apply RANSAC homography (min 10 inliers) -│ └─ Compute inlier ratio -├─ Homography analysis: -│ ├─ If inlier_ratio > 0.2: -│ │ ├─ Extract 4 corners from UAV image via inverse homography -│ │ ├─ Map to satellite image coordinates -│ │ ├─ Compute implied GPS shift -│ │ └─ Apply shift to current pose estimate -│ └─ else: keep local estimate, flag as uncertain -├─ Confidence scoring: -│ ├─ score = inlier_ratio × mutual_information_normalized -│ └─ Threshold: score > 0.3 for "high confidence" -└─ Output: refined_gps, confidence (0.0-1.0), residual_px -``` - -#### **Component 8: Outlier Detector** -``` -Input: Trajectory sequence [GPS_0, GPS_1, ..., GPS_n] -Output: Outlier flags, re-processed trajectory - -Algorithm: Multi-stage detection -├─ Stage 1 - Velocity anomaly: -│ ├─ Compute inter-image distances: d_i = |GPS_i - GPS_{i-1}| -│ ├─ Compute velocity: v_i = d_i / Δt (Δt typically 0.5-2s) -│ ├─ Expected: 10-20 m/s for typical UAV -│ ├─ Flag if: v_i > 30 m/s OR v_i < 1 m/s -│ └─ Acceleration anomaly: |v_i - v_{i-1}| > 15 m/s -├─ Stage 2 - Satellite consistency: -│ ├─ For each flagged image: -│ │ ├─ Retrieve satellite image at claimed GPS -│ │ ├─ Compute cross-correlation with UAV image -│ │ └─ If corr < 0.25: mark as outlier -│ └─ Reprocess outlier image: -│ ├─ Try skip-frame matching (to N±2, N±3) -│ ├─ Try global place recognition -│ └─ Request user input 
if all fail -├─ Stage 3 - Loop closure: -│ ├─ Check if image matches any earlier image (Hamming dist <50) -│ └─ If match detected: assess if consistent with trajectory -└─ Output: flags, corrected_trajectory, uncertain_regions -``` - -#### **Component 9: User Interface Module** -``` -Input: Flight trajectory, flagged uncertain regions -Output: User corrections, refined trajectory - -Features: -├─ Web interface or desktop app -├─ Map display (Google Maps embedded): -│ ├─ Show computed trajectory -│ ├─ Overlay satellite imagery -│ ├─ Highlight uncertain regions (red) -│ ├─ Show confidence intervals (error ellipses) -│ └─ Display reprojection errors -├─ Image preview: -│ ├─ Click trajectory point to view corresponding image -│ ├─ Show matched keypoints and epipolar lines -│ ├─ Display feature matching quality metrics -│ └─ Show neighboring images in sequence -├─ Manual correction: -│ ├─ Drag trajectory point to correct location (via map click) -│ ├─ Mark GCPs manually (click point in image, enter GPS) -│ ├─ Re-run optimization with corrected anchors -│ └─ Export corrected trajectory as GeoJSON/CSV -└─ Reporting: - ├─ Summary statistics (% within 50m, 20m, etc.) 
- ├─ Outlier report with reasons - ├─ Satellite validation results - └─ Export georeferenced image list with coordinates -``` - -### 4.2 Data Flow & Processing Pipeline - -**Phase 1: Offline Initialization** (before flight or post-download) -``` -Input: Full set of N images, starting GPS coordinate -├─ Load all images into memory/fast storage (SSD) -├─ Detect features in all images (parallelizable: N CPU threads) -├─ Store features on disk for quick access -└─ Estimate camera calibration (if not known) -Time: ~1-3 minutes for 1000 images on 16-core CPU -``` - -**Phase 2: Sequential Processing** (online or batch) -``` -For i = 1 to N-1: -├─ Load images[i] and images[i+1] -├─ Match features -├─ RANSAC pose estimation -├─ Triangulate 3D points -├─ Local bundle adjustment (last 5 frames) -├─ Satellite georeferencing -├─ Store: GPS[i+1], confidence[i+1], covariance[i+1] -└─ [< 2 seconds per iteration] -Time: 2N seconds = ~30-60 minutes for 1000 images -``` - -**Phase 3: Post-Processing** (after full trajectory) -``` -├─ Global bundle adjustment (optional: full flight with key-frame selection) -├─ Loop closure optimization (if detected) -├─ Outlier detection and flagging -├─ Satellite validation (batch retrieve imagery, compare) -├─ Export results with metadata -└─ Generate report with accuracy metrics -Time: ~5-20 minutes -``` - -**Phase 4: Manual Review & Correction** (if needed) -``` -├─ User reviews flagged uncertain regions -├─ Manually corrects up to 20% of trajectory as needed -├─ Re-optimizes with corrected anchors -└─ Final export -Time: 10-60 minutes depending on complexity -``` - ---- - -## 5. Testing Strategy -## 2. 
Detailed Test Categories - -### 2.1 Unit Tests (Level 1) - -#### UT-1: Feature Extraction (AKAZE) -``` -Purpose: Verify keypoint detection and descriptor computation -Test Data: Synthetic images with known features (checkerboard patterns) -Test Cases: - ├─ UT-1.1: Basic feature detection - │ Input: 1024×768 synthetic image with checkerboard - │ Expected: ≥500 keypoints detected - │ Pass: count ≥ 500 - │ - ├─ UT-1.2: Scale invariance - │ Input: Same scene at 2x scale - │ Expected: Keypoints at proportional positions - │ Pass: correlation of positions > 0.9 - │ - ├─ UT-1.3: Rotation robustness - │ Input: Image rotated ±30° - │ Expected: Descriptors match original + rotated - │ Pass: match rate > 80% - │ - ├─ UT-1.4: Multi-scale handling - │ Input: Image with features at multiple scales - │ Expected: Features detected at all scales (pyramid) - │ Pass: ratio of scales [1:1.2:1.44:...] verified - │ - └─ UT-1.5: Performance constraint - Input: FullHD image (1920×1080) - Expected: <500ms feature extraction - Pass: 95th percentile < 500ms -``` - -#### UT-2: Feature Matching -``` -Purpose: Verify robust feature correspondence -Test Data: Pairs of synthetic/real images with known correspondence -Test Cases: - ├─ UT-2.1: Basic matching - │ Input: Two images from synthetic scene (90% overlap) - │ Expected: ≥95% of ground-truth features matched - │ Pass: match_rate ≥ 0.95 - │ - ├─ UT-2.2: Outlier rejection (Lowe's ratio test) - │ Input: Synthetic pair + 50% false features - │ Expected: False matches rejected - │ Pass: false_match_rate < 0.1 - │ - ├─ UT-2.3: Low overlap scenario - │ Input: Two images with 20% overlap - │ Expected: Still matches ≥20 points - │ Pass: min_matches ≥ 20 - │ - └─ UT-2.4: Performance - Input: FullHD images, 1000 features each - Expected: <300ms matching time - Pass: 95th percentile < 300ms -``` - -#### UT-3: Essential Matrix Estimation -``` -Purpose: Verify 5-point/8-point algorithms for camera geometry -Test Data: Synthetic correspondences with known 
relative pose -Test Cases: - ├─ UT-3.1: 8-point algorithm - │ Input: 8+ point correspondences - │ Expected: Essential matrix E with rank 2 - │ Pass: min_singular_value(E) < 1e-6 - │ - ├─ UT-3.2: 5-point algorithm - │ Input: 5 point correspondences - │ Expected: Up to 4 solutions generated - │ Pass: num_solutions ∈ [1, 4] - │ - ├─ UT-3.3: RANSAC convergence - │ Input: 100 correspondences, 30% outliers - │ Expected: Essential matrix recovery despite outliers - │ Pass: inlier_ratio ≥ 0.6 - │ - └─ UT-3.4: Chirality constraint - Input: Multiple (R,t) solutions from decomposition - Expected: Only solution with points in front of cameras selected - Pass: selected_solution verified via triangulation -``` - -#### UT-4: Triangulation (DLT) -``` -Purpose: Verify 3D point reconstruction from image correspondences -Test Data: Synthetic scenes with known 3D geometry -Test Cases: - ├─ UT-4.1: Accuracy - │ Input: Noise-free point correspondences - │ Expected: Reconstructed X matches ground truth - │ Pass: RMSE < 0.1cm on 1m scene - │ - ├─ UT-4.2: Outlier handling - │ Input: 10 valid + 2 invalid correspondences - │ Expected: Invalid points detected (behind camera/far) - │ Pass: valid_mask accuracy > 95% - │ - ├─ UT-4.3: Altitude constraint - │ Input: Points with z < 50m (below aircraft) - │ Expected: Points rejected - │ Pass: altitude_filter works correctly - │ - └─ UT-4.4: Batch performance - Input: 500 point triangulations - Expected: <100ms total - Pass: 95th percentile < 100ms -``` - -#### UT-5: Bundle Adjustment -``` -Purpose: Verify pose and 3D point optimization -Test Data: Synthetic multi-view scenes -Test Cases: - ├─ UT-5.1: Convergence - │ Input: 5 frames with noisy initial poses - │ Expected: Residual decreases monotonically - │ Pass: final_residual < 0.001 * initial_residual - │ - ├─ UT-5.2: Covariance computation - │ Input: Optimized poses and points - │ Expected: Covariance matrix positive-definite - │ Pass: all_eigenvalues > 0 - │ - ├─ UT-5.3: Window size effect - │ 
Input: Same problem with window sizes [3, 5, 10] - │ Expected: Larger windows → better residuals - │ Pass: residual_5 < residual_3, residual_10 < residual_5 - │ - └─ UT-5.4: Performance scaling - Input: Window size [5, 10, 15, 20] - Expected: Time ~= O(w^3) - Pass: quadratic fit accurate (R² > 0.95) -``` - ---- - -### 2.2 Integration Tests (Level 2) - -#### IT-1: Sequential Pipeline -``` -Purpose: Verify image-to-image processing chain -Test Data: Real aerial image sequences (5-20 images) -Test Cases: - ├─ IT-1.1: Feature flow - │ Features extracted from img₁ → tracked to img₂ → matched - │ Expected: Consistent tracking across images - │ Pass: ≥70% features tracked end-to-end - │ - ├─ IT-1.2: Pose chain consistency - │ Poses P₁, P₂, P₃ computed sequentially - │ Expected: P₃ = P₂ ∘ P₂₋₁ (composition consistency) - │ Pass: pose_error < 0.1° rotation, 5cm translation - │ - ├─ IT-1.3: Trajectory smoothness - │ Velocity computed between poses - │ Expected: Smooth velocity profile (no jumps) - │ Pass: velocity_std_dev < 20% mean_velocity - │ - └─ IT-1.4: Memory usage - Process 100-image sequence - Expected: Constant memory (windowed processing) - Pass: peak_memory < 2GB -``` - -#### IT-2: Satellite Georeferencing -``` -Purpose: Verify local-to-global coordinate transformation -Test Data: Synthetic/real images with known satellite reference -Test Cases: - ├─ IT-2.1: Feature matching with satellite - │ Input: Aerial image + satellite reference - │ Expected: ≥10 matched features between viewpoints - │ Pass: match_count ≥ 10 - │ - ├─ IT-2.2: Homography estimation - │ Matched features → homography matrix - │ Expected: Valid transformation (3×3 matrix) - │ Pass: det(H) ≠ 0, condition_number < 100 - │ - ├─ IT-2.3: GPS transformation accuracy - │ Apply homography to image corners - │ Expected: Computed GPS ≈ known reference GPS - │ Pass: error < 100m (on test data) - │ - └─ IT-2.4: Confidence scoring - Compute inlier_ratio and MI (mutual information) - Expected: score = 
inlier_ratio × MI ∈ [0, 1] - Pass: high_confidence for obvious matches -``` - -#### IT-3: Outlier Detection Chain -``` -Purpose: Verify multi-stage outlier detection -Test Data: Synthetic trajectory with injected outliers -Test Cases: - ├─ IT-3.1: Velocity anomaly detection - │ Inject 350m jump at frame N - │ Expected: Detected as outlier - │ Pass: outlier_flag = True - │ - ├─ IT-3.2: Recovery mechanism - │ After outlier detection - │ Expected: System attempts skip-frame matching (N→N+2) - │ Pass: recovery_successful = True - │ - ├─ IT-3.3: False positive rate - │ Normal sequence with small perturbations - │ Expected: <5% false outlier flagging - │ Pass: false_positive_rate < 0.05 - │ - └─ IT-3.4: Consistency across stages - Multiple detection stages should agree - Pass: agreement_score > 0.8 -``` - ---- - -### 2.3 System Tests (Level 3) - -#### ST-1: Accuracy Criteria -``` -Purpose: Verify system meets ±50m and ±20m accuracy targets -Test Data: Real aerial image sequences with ground-truth GPS -Test Cases: - ├─ ST-1.1: 50m accuracy target - │ Input: 500-image flight - │ Compute: % images within 50m of ground truth - │ Expected: ≥80% - │ Pass: accuracy_50m ≥ 0.80 - │ - ├─ ST-1.2: 20m accuracy target - │ Same flight data - │ Expected: ≥60% within 20m - │ Pass: accuracy_20m ≥ 0.60 - │ - ├─ ST-1.3: Mean absolute error - │ Compute: MAE over all images - │ Expected: <40m typical - │ Pass: MAE < 50m - │ - └─ ST-1.4: Error distribution - Expected: Error approximately Gaussian - Pass: K-S test p-value > 0.05 -``` - -#### ST-2: Registration Rate -``` -Purpose: Verify ≥95% of images successfully registered -Test Data: Real flights with various conditions -Test Cases: - ├─ ST-2.1: Baseline registration - │ Good overlap, clear features - │ Expected: >98% registration rate - │ Pass: registration_rate ≥ 0.98 - │ - ├─ ST-2.2: Challenging conditions - │ Low texture, variable lighting - │ Expected: ≥95% registration rate - │ Pass: registration_rate ≥ 0.95 - │ - ├─ ST-2.3: Sharp 
turns scenario - │ Images with <10% overlap - │ Expected: Fallback mechanisms trigger, ≥90% success - │ Pass: fallback_success_rate ≥ 0.90 - │ - └─ ST-2.4: Consecutive failures - Track max consecutive unregistered images - Expected: <3 consecutive failures - Pass: max_consecutive_failures ≤ 3 -``` - -#### ST-3: Reprojection Error -``` -Purpose: Verify <1.0 pixel mean reprojection error -Test Data: Real flight data after bundle adjustment -Test Cases: - ├─ ST-3.1: Mean reprojection error - │ After BA optimization - │ Expected: <1.0 pixel - │ Pass: mean_reproj_error < 1.0 - │ - ├─ ST-3.2: Error distribution - │ Histogram of per-point errors - │ Expected: Tightly concentrated <2 pixels - │ Pass: 95th_percentile < 2.0 px - │ - ├─ ST-3.3: Per-frame consistency - │ Error should not vary dramatically - │ Expected: Consistent across frames - │ Pass: frame_error_std_dev < 0.3 px - │ - └─ ST-3.4: Outlier points - Very large reprojection errors - Expected: <1% of points with error >3 px - Pass: outlier_rate < 0.01 -``` - -#### ST-4: Processing Speed -``` -Purpose: Verify <2 seconds per image -Test Data: Full flight sequences on target hardware -Test Cases: - ├─ ST-4.1: Average latency - │ Mean processing time per image - │ Expected: <2 seconds - │ Pass: mean_latency < 2.0 sec - │ - ├─ ST-4.2: 95th percentile latency - │ Worst-case images (complex scenes) - │ Expected: <2.5 seconds - │ Pass: p95_latency < 2.5 sec - │ - ├─ ST-4.3: Component breakdown - │ Feature extraction: <0.5s - │ Matching: <0.3s - │ RANSAC: <0.2s - │ BA: <0.8s - │ Satellite: <0.3s - │ Pass: Each component within budget - │ - └─ ST-4.4: Scaling with problem size - Memory usage, CPU usage vs. 
image resolution - Expected: Linear scaling - Pass: O(n) complexity verified -``` - -#### ST-5: Robustness - Outlier Handling -``` -Purpose: Verify graceful handling of 350m outlier drifts -Test Data: Synthetic/real data with injected outliers -Test Cases: - ├─ ST-5.1: Single 350m outlier - │ Inject outlier at frame N - │ Expected: Detected, trajectory continues - │ Pass: system_continues = True - │ - ├─ ST-5.2: Multiple outliers - │ 3-5 outliers scattered in sequence - │ Expected: All detected, recovery attempted - │ Pass: detection_rate ≥ 0.8 - │ - ├─ ST-5.3: False positive rate - │ Normal trajectory, no outliers - │ Expected: <5% false flagging - │ Pass: false_positive_rate < 0.05 - │ - └─ ST-5.4: Recovery latency - Time to recover after outlier - Expected: ≤3 frames - Pass: recovery_latency ≤ 3 frames -``` - -#### ST-6: Robustness - Sharp Turns -``` -Purpose: Verify handling of <5% image overlap scenarios -Test Data: Synthetic sequences with sharp angles -Test Cases: - ├─ ST-6.1: 5% overlap matching - │ Two images with 5% overlap - │ Expected: Minimal matches or skip-frame - │ Pass: system_handles_gracefully = True - │ - ├─ ST-6.2: Skip-frame fallback - │ Direct N→N+1 fails, tries N→N+2 - │ Expected: Succeeds with N→N+2 - │ Pass: skip_frame_success_rate ≥ 0.8 - │ - ├─ ST-6.3: 90° turn handling - │ Images at near-orthogonal angles - │ Expected: Degeneracy detected, logged - │ Pass: degeneracy_detection = True - │ - └─ ST-6.4: Trajectory consistency - Consecutive turns: check velocity smoothness - Expected: No velocity jumps > 50% - Pass: velocity_consistency verified -``` - ---- - -### 2.4 Field Acceptance Tests (Level 4) - -#### FAT-1: Real UAV Flight Trial #1 (Baseline) -``` -Scenario: Nominal flight over agricultural field -┌────────────────────────────────────────┐ -│ Conditions: │ -│ • Clear weather, good sunlight │ -│ • Flat terrain, sparse trees │ -│ • 300m altitude, 50m/s speed │ -│ • 800 images, ~15 min flight │ 
-└────────────────────────────────────────┘ - -Pass Criteria: - ✓ Accuracy: ≥80% within 50m - ✓ Accuracy: ≥60% within 20m - ✓ Registration rate: ≥95% - ✓ Processing time: <2s/image - ✓ Satellite validation: <10% outliers - ✓ Reprojection error: <1.0px mean - -Success Metrics: - • MAE (mean absolute error): <40m - • RMS error: <45m - • Max error: <200m - • Trajectory coherence: smooth (no jumps) -``` - -#### FAT-2: Real UAV Flight Trial #2 (Challenging) -``` -Scenario: Flight with more complex terrain -┌────────────────────────────────────────┐ -│ Conditions: │ -│ • Mixed urban/agricultural │ -│ • Buildings, vegetation, water bodies │ -│ • Variable altitude (250-400m) │ -│ • Includes 1-2 sharp turns │ -│ • 1200 images, ~25 min flight │ -└────────────────────────────────────────┘ - -Pass Criteria: - ✓ Accuracy: ≥75% within 50m (relaxed from 80%) - ✓ Accuracy: ≥50% within 20m (relaxed from 60%) - ✓ Registration rate: ≥92% (relaxed from 95%) - ✓ Processing time: <2.5s/image avg - ✓ Outliers detected: <15% (relaxed from 10%) - -Fallback Validation: - ✓ User corrected <20% of uncertain images - ✓ After correction, accuracy meets FAT-1 targets -``` - -#### FAT-3: Real UAV Flight Trial #3 (Edge Case) -``` -Scenario: Low-texture flight (challenging for features) -┌────────────────────────────────────────┐ -│ Conditions: │ -│ • Sandy/desert terrain or water │ -│ • Minimal features │ -│ • Overcast/variable lighting │ -│ • 500-600 images, ~12 min flight │ -└────────────────────────────────────────┘ - -Pass Criteria: - ✓ System continues (no crash): YES - ✓ Graceful degradation: Flags uncertainty - ✓ User can correct and improve: YES - ✓ Satellite anchor helps recovery: YES - -Success Metrics: - • >80% of images tagged "uncertain" - • After user correction: meets standard targets - • Demonstrates fallback mechanisms working -``` - ---- - -## 3. 
Test Environment Setup - -### Hardware Requirements -``` -CPU: 16+ cores (Intel Xeon / AMD Ryzen) -RAM: 64GB minimum (32GB acceptable for <1500 images) -Storage: 1TB SSD (for raw images + processing) -GPU: Optional (CUDA 11.8+ for 5-10x acceleration) -Network: For satellite API queries (can be cached) -``` - -### Software Requirements -``` -OS: Ubuntu 20.04 LTS or macOS 12+ -Build: CMake 3.20+, GCC 9+ or Clang 11+ -Dependencies: OpenCV 4.8+, Eigen 3.4+, GDAL 3.0+ -Testing: GoogleTest, Pytest -CI/CD: GitHub Actions or Jenkins -``` - -### Test Data Management -``` -Synthetic Data: Generated via Blender (checked into repo) -Real Data: External dataset storage (S3/local SSD) -Ground Truth: Maintained in CSV format with metadata -Versioning: Git-LFS for binary image data -``` - ---- - -## 4. Test Execution Plan - -### Phase 1: Unit Testing (Weeks 1-6) -``` -Sprint 1-2: UT-1 (Feature detection) - 2 week -Sprint 3-4: UT-2 (Feature matching) - 2 weeks -Sprint 5-6: UT-3, UT-4, UT-5 (Geometry) - 2 weeks - -Continuous: Run full unit test suite every commit -Coverage target: >90% code coverage -``` - -### Phase 2: Integration Testing (Weeks 7-12) -``` -Sprint 7-9: IT-1 (Sequential pipeline) - 3 weeks -Sprint 10-11: IT-2, IT-3 (Georef, Outliers) - 2 weeks -Sprint 12: System integration - 1 week - -Continuous: Integration tests run nightly -``` - -### Phase 3: System Testing (Weeks 13-18) -``` -Sprint 13-14: ST-1, ST-2 (Accuracy, Registration) - 2 weeks -Sprint 15-16: ST-3, ST-4 (Error, Speed) - 2 weeks -Sprint 17-18: ST-5, ST-6 (Robustness) - 2 weeks - -Load testing: 1000-3000 image sequences -Stress testing: Edge cases, memory limits -``` - -### Phase 4: Field Acceptance (Weeks 19-30) -``` -Week 19-22: FAT-1 (Baseline trial) - • Coordinate 1-2 baseline flights - • Validate system on real data - • Adjust parameters as needed - -Week 23-26: FAT-2 (Challenging trial) - • More complex scenarios - • Test fallback mechanisms - • Refine user interface - -Week 27-30: FAT-3 (Edge case 
trial) - • Low-texture scenarios - • Validate robustness - • Final adjustments - -Post-trial: Generate comprehensive report -``` - ---- - -## 5. Acceptance Criteria Summary - -| Criterion | Target | Test | Pass/Fail | -|-----------|--------|------|-----------| -| **Accuracy@50m** | ≥80% | FAT-1 | ≥80% pass | -| **Accuracy@20m** | ≥60% | FAT-1 | ≥60% pass | -| **Registration Rate** | ≥95% | ST-2 | ≥95% pass | -| **Reprojection Error** | <1.0px mean | ST-3 | <1.0px pass | -| **Processing Speed** | <2.0s/image | ST-4 | p95<2.5s pass | -| **Robustness (350m outlier)** | Handled | ST-5 | Continue pass | -| **Sharp turns (<5% overlap)** | Handled | ST-6 | Skip-frame pass | -| **Satellite validation** | <10% outliers | FAT-1-3 | <10% pass | - ---- - -## 6. Success Metrics - -**Green Light Criteria** (Ready for production): -- ✅ All unit tests pass (100%) -- ✅ All integration tests pass (100%) -- ✅ All system tests pass (100%) -- ✅ FAT-1 and FAT-2 pass acceptance criteria -- ✅ FAT-3 shows graceful degradation -- ✅ <10% code defects discovered in field trials -- ✅ Performance meets SLA consistently - -**Yellow Light Criteria** (Conditional deployment): -- ⚠ 85-89% of acceptance criteria met -- ⚠ Minor issues in edge cases -- ⚠ Requires workaround documentation -- ⚠ Re-test after fixes - -**Red Light Criteria** (Do not deploy): -- ❌ <85% of acceptance criteria met -- ❌ Critical failures in core functionality -- ❌ Safety/security concerns -- ❌ Cannot meet latency or accuracy targets diff --git a/docs/01_solution/02_solution_draft.md b/docs/01_solution/02_solution_draft.md new file mode 100644 index 0000000..9d593c4 --- /dev/null +++ b/docs/01_solution/02_solution_draft.md @@ -0,0 +1,284 @@ +# **GEORTOLS-SA UAV Image Geolocalization in IMU-Denied Environments** + +The GEORTOLS-SA system is an asynchronous, four-component software solution designed for deployment on an NVIDIA RTX 2060+ GPU. 
It is architected from the ground up to handle the specific challenges of IMU-denied, scale-aware localization and real-time streaming output. + +### **Product Solution Description** + +* **Inputs:** + 1. A sequence of consecutively named images (FullHD to 6252x4168). + 2. The absolute GPS coordinate (Latitude, Longitude) for the first image (Image 0). + 3. A pre-calibrated camera intrinsic matrix ($K$). + 4. The predefined, absolute metric altitude of the UAV ($H$, e.g., 900 meters). + 5. API access to the Google Maps satellite provider. +* **Outputs (Streaming):** + 1. **Initial Pose (T \< 5s):** A high-confidence, *metric-scale* estimate ($Pose\_N\_Est$) of the image's 6-DoF pose and GPS coordinate. This is sent to the user immediately upon calculation (AC-7, AC-8). + 2. **Refined Pose (T > 5s):** A globally-optimized pose ($Pose\_N\_Refined$) sent asynchronously as the back-end optimizer fuses data from the CVGL module (AC-8). + +### **Component Interaction Diagram and Data Flow** + +The system is architected as four parallel-processing components to meet the stringent real-time and refinement requirements. + +1. **Image Ingestion & Pre-processing:** This module receives the new, high-resolution Image_N. It immediately creates two copies: + * Image_N_LR (Low-Resolution, e.g., 1536x1024): This copy is immediately dispatched to the SA-VO Front-End for real-time processing. + * Image_N_HR (High-Resolution, 6.2K): This copy is stored and made available to the CVGL Module for its asynchronous, high-accuracy matching pipeline. +2. **Scale-Aware VO (SA-VO) Front-End (High-Frequency Thread):** This component's sole task is high-speed, *metric-scale* relative pose estimation. It matches Image_N_LR to Image_N-1_LR, computes the 6-DoF relative transform, and critically, uses the "known altitude" ($H$) constraint to recover the absolute scale (detailed in Section 3.0). It sends this high-confidence Relative_Metric_Pose to the Back-End. +3. 
**Cross-View Geolocalization (CVGL) Module (Low-Frequency, Asynchronous Thread):** This is a heavier, slower module. It takes Image_N (both LR and HR) and queries the Google Maps database to find an *absolute GPS pose*. When a high-confidence match is found, its Absolute_GPS_Pose is sent to the Back-End as a global "anchor" constraint. +4. **Trajectory Optimization Back-End (Central Hub):** This component manages the complete flight trajectory as a pose graph.10 It continuously fuses two distinct, high-quality data streams: + * **On receiving Relative_Metric_Pose (T \< 5s):** It appends this pose to the graph, calculates the Pose_N_Est, and **sends this initial result to the user (AC-7, AC-8 met)**. + * **On receiving Absolute_GPS_Pose (T > 5s):** It adds this as a high-confidence "global anchor" constraint 12, triggers a full graph re-optimization to correct any minor biases, and **sends the Pose_N_Refined to the user (AC-8 refinement met)**. + +### + +### **VO "Trust Model" of GEORTOLS-SA** + +In GEORTOLS-SA, the trust model: + +* The **SA-VO Front-End** is now *highly trusted* for its local, frame-to-frame *metric* accuracy. +* The **CVGL Module** remains *highly trusted* for its *global* (GPS) accuracy. + +Both components are operating in the same scale-aware, metric space. The Back-End's job is no longer to fix a broken, drifting VO. Instead, it performs a robust fusion of two independent, high-quality metric measurements.12 + +This model is self-correcting. If the user's predefined altitude $H$ is slightly incorrect (e.g., entered as 900m but is truly 880m), the SA-VO front-end will be *consistently* off by a small percentage. The periodic, high-confidence CVGL "anchors" will create a consistent, low-level "tension" in the pose graph. The graph optimizer (e.g., Ceres Solver) 3 will resolve this tension by slightly "pulling" the SA-VO poses to fit the global anchors, effectively *learning* and correcting for the altitude bias. 
This robust fusion is the key to meeting the 20-meter and 50-meter accuracy targets (AC-1, AC-2). + +## **3.0 Core Component: The Scale-Aware Visual Odometry (SA-VO) Front-End** + +This component is the new, critical engine of the system. Its sole task is to compute the *metric-scale* 6-DoF relative motion between consecutive frames, thereby eliminating scale drift at its source. + +### **3.1 Rationale and Mechanism for Per-Frame Scale Recovery** + +The SA-VO front-end implements a geometric algorithm to recover the absolute scale $s$ for *every* frame-to-frame transition. This algorithm directly leverages the query's "known altitude" ($H$) and "planar ground" constraints.5 + +The SA-VO algorithm for processing Image_N (relative to Image_N-1) is as follows: + +1. **Feature Matching:** Extract and match robust features between Image_N and Image_N-1 using the selected feature matcher (see Section 3.2). This yields a set of corresponding 2D pixel coordinates. +2. **Essential Matrix:** Use RANSAC (Random Sample Consensus) and the camera intrinsic matrix $K$ to compute the Essential Matrix $E$ from the "inlier" correspondences.2 +3. **Pose Decomposition:** Decompose $E$ to find the relative Rotation $R$ and the *unscaled* translation vector $t$, where the magnitude $||t||$ is fixed to 1.2 +4. **Triangulation:** Triangulate the 3D-world points $X$ for all inlier features using the unscaled pose $[R \mid t]$.15 These 3D points ($X_i$) are now in a local, *unscaled* coordinate system (i.e., we know the *shape* of the point cloud, but not its *size*). +5. **Ground Plane Fitting:** The query states "terrain height can be neglected," meaning we assume a planar ground. A *second* RANSAC pass is performed, this time fitting a 3D plane to the set of triangulated 3D points $X$. The inliers to this RANSAC are identified as the ground points $X_g$.5 This method is highly robust as it does not rely on a single point, but on the consensus of all visible ground features.16 +6.
**Unscaled Height ($h$):** From the fitted plane equation $n^T X + d = 0$, the parameter $d$ represents the perpendicular distance from the camera (at the coordinate system's origin) to the computed ground plane. This is our *unscaled* height $h$. +7. **Scale Computation:** We now have two values: the *real, metric* altitude $H$ (e.g., 900m) provided by the user, and our *computed, unscaled* altitude $h$. The absolute scale $s$ for this frame is the ratio of these two values: $s = H / h$. +8. **Metric Pose:** The final, metric-scale relative pose is $[R \mid T]$, where the metric translation $T = s * t$. This high-confidence, scale-aware pose is sent to the Back-End. + +### **3.2 Feature Matching Sub-System Analysis** + +The success of the SA-VO algorithm depends *entirely* on the quality of the initial feature matches, especially in the low-texture agricultural terrain specified in the query. The system requires a matcher that is both robust (for sparse textures) and extremely fast (for AC-7). + +The initial draft's choice of SuperGlue 17 is a strong, proven baseline. However, its successor, LightGlue 18, offers a critical, non-obvious advantage: **adaptivity**. + +The UAV flight is specified as *mostly* straight, with high overlap. Sharp turns (AC-4) are "rather an exception." This means \~95% of our image pairs are "easy" to match, while 5% are "hard." + +* SuperGlue uses a fixed-depth Graph Neural Network (GNN), spending the *same* (large) amount of compute on an "easy" pair as a "hard" pair.19 This is inefficient. +* LightGlue is *adaptive*.19 For an easy, high-overlap pair, it can exit early (e.g., at layer 3/9), returning a high-confidence match in a fraction of the time.
For a "hard" low-overlap pair, it will use its full depth to get the best possible result.19 + +By using LightGlue, the system saves *enormous* amounts of computational budget on the 95% of "easy" frames, ensuring it *always* meets the \<5s budget (AC-7) and reserving that compute for the harder CVGL tasks. LightGlue is a "plug-and-play replacement" 19 that is faster, more accurate, and easier to train.19 + +### **Table 1: Analysis of State-of-the-Art Feature Matchers (For SA-VO Front-End)** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **SuperPoint + SuperGlue** 17 | - SOTA robustness in low-texture, high-blur conditions. - GNN reasons about 3D scene context. - Proven in real-time SLAM systems.22 | - Computationally heavy (fixed-depth GNN). - Slower than LightGlue.19 - Training is complex.19 | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT.25 | **Good.** A solid, baseline choice. Meets robustness needs but will heavily tax the \<5s time budget (AC-7). | +| **SuperPoint + LightGlue** 18 | - **Adaptive Depth:** Faster on "easy" pairs, more accurate on "hard" pairs.19 - **Faster & Lighter:** Outperforms SuperGlue on speed and accuracy.19 - **Easier to Train:** Simpler architecture and loss.19 - Direct plug-and-play replacement for SuperGlue. | - Newer, less long-term-SLAM-proven than SuperGlue (though rapidly being adopted). | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT.28 | **Excellent (Selected).** The adaptive nature is *perfect* for this problem. It saves compute on the 95% of easy (straight) frames, preserving the budget for the 5% of hard (turn) frames, maximizing our ability to meet AC-7. | + +### **3.3 Selected Approach (SA-VO): SuperPoint + LightGlue** + +The SA-VO front-end will be built using: + +* **Detector:** **SuperPoint** 24 to detect sparse, robust features on the Image_N_LR. 
+* **Matcher:** **LightGlue** 18 to match features from Image_N_LR to Image_N-1_LR. + +This combination provides the SOTA robustness required for low-texture fields, while LightGlue's adaptive performance 19 is the key to meeting the \<5s (AC-7) real-time requirement. + +## **4.0 Global Anchoring: The Cross-View Geolocalization (CVGL) Module** + +With the SA-VO front-end handling metric scale, the CVGL module's task is refined. Its purpose is no longer to *correct scale*, but to provide *absolute global "anchor" poses*. This corrects for any accumulated bias (e.g., if the $h$ prior is off by 5m) and, critically, *relocalizes* the system after a persistent tracking loss (AC-4). + +### **4.1 Hierarchical Retrieval-and-Match Pipeline** + +This module runs asynchronously and is computationally heavy. A brute-force search against the entire Google Maps database is impossible. A two-stage hierarchical pipeline is required: + +1. **Stage 1: Coarse Retrieval.** This is treated as an image retrieval problem.29 + * A **Siamese CNN** 30 (or similar Dual-CNN architecture) is used to generate a compact "embedding vector" (a digital signature) for the Image_N_LR. + * An embedding database will be pre-computed for *all* Google Maps satellite tiles in the specified Eastern Ukraine operational area. + * The UAV image's embedding is then used to perform a very fast (e.g., FAISS library) similarity search against the satellite database, returning the *Top-K* (e.g., K=5) most likely-matching satellite tiles. +2. **Stage 2: Fine-Grained Pose.** + * *Only* for these Top-5 candidates, the system performs the heavy-duty **SuperPoint + LightGlue** matching. + * This match is *not* Image_N -> Image_N-1. It is Image_N -> Satellite_Tile_K. + * The match with the highest inlier count and lowest reprojection error (MRE \< 1.0, AC-10) is used to compute the precise 6-DoF pose of the UAV relative to that georeferenced satellite tile. This yields the final Absolute_GPS_Pose. 
+ +### **4.2 Critical Insight: Solving the Oblique-to-Nadir "Domain Gap"** + +A critical, unaddressed failure mode exists. The query states the camera is **"not autostabilized"** [User Query]. On a fixed-wing UAV, this guarantees that during a bank or sharp turn (AC-4), the camera will *not* be nadir (top-down). It will be *oblique*, capturing the ground from an angle. The Google Maps reference, however, is *perfectly nadir*.32 + +This creates a severe "domain gap".33 A CVGL system trained *only* to match nadir-to-nadir images will *fail* when presented with an oblique UAV image.34 This means the CVGL module will fail *precisely* when it is needed most: during the sharp turns (AC-4) when SA-VO tracking is also lost. + +The solution is to *close this domain gap* during training. Since the real-world UAV images will be oblique, the network must be taught to match oblique views to nadir ones. + +Solution: Synthetic Data Generation for Robust Training +The Stage 1 Siamese CNN 30 must be trained on a custom, synthetically-generated dataset.37 The process is as follows: + +1. Acquire nadir satellite imagery and a corresponding Digital Elevation Model (DEM) for the operational area. +2. Use this data to *synthetically render* the nadir satellite imagery from a wide variety of *oblique* viewpoints, simulating the UAV's roll and pitch.38 +3. Create thousands of training pairs, each consisting of (Nadir_Satellite_Tile, Synthetically_Oblique_Tile_Angle_30_Deg). +4. Train the Siamese network 29 to learn that these two images—despite their *vastly* different appearances—are a *match*. + +This process teaches the retrieval network to be *viewpoint-invariant*.35 It learns to ignore perspective distortion and match the true underlying ground features (road intersections, field boundaries). This is the *only* way to ensure the CVGL module can robustly relocalize the UAV during a sharp turn (AC-4). 
+ +## **5.0 Trajectory Fusion: The Robust Optimization Back-End** + +This component is the system's central "brain." It runs continuously, fusing all incoming measurements (high-frequency/metric-scale SA-VO poses, low-frequency/globally-absolute CVGL poses) into a single, globally consistent trajectory. This component's design is dictated by the requirements for streaming (AC-8), refinement (AC-8), and outlier-rejection (AC-3). + +### **5.1 Selected Strategy: Incremental Pose-Graph Optimization** + +The user's requirements for "results...appear immediately" and "system could refine existing calculated results" [User Query] are a textbook description of a real-time SLAM back-end.11 A batch Structure from Motion (SfM) process, which requires all images upfront and can take hours, is unsuitable for the primary system. + +### **Table 2: Analysis of Trajectory Optimization Strategies** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **Incremental SLAM (Pose-Graph Optimization)** (g2o 13, Ceres Solver 10, GTSAM) | - **Real-time / Online:** Provides immediate pose estimates (AC-7). - **Supports Refinement:** Explicitly designed to refine past poses when new "loop closure" (CVGL) data arrives (AC-8).11 - **Robust:** Can handle outliers via robust kernels.39 | - Initial estimate is less accurate than a full batch process. - Can drift *if* not anchored (though our SA-VO minimizes this). | - A graph optimization library (g2o, Ceres). - A robust cost function.41 | **Excellent (Selected).** This is the *only* architecture that satisfies all user requirements for real-time streaming and asynchronous refinement. | +| **Batch Structure from Motion (Global Bundle Adjustment)** (COLMAP, Agisoft Metashape) | - **Globally Optimal Accuracy:** Produces the most accurate possible 3D reconstruction and trajectory. | - **Offline:** Cannot run in real-time or stream results. 
- High computational cost (minutes to hours). - Fails AC-7 and AC-8 completely. | - All images must be available before processing starts. - High RAM and CPU. | **Good (as an *Optional* Post-Processing Step).** Unsuitable as the primary online system, but could be offered as an optional, high-accuracy "Finalize Trajectory" batch process after the flight. | + +The system's back-end will be built as an **Incremental Pose-Graph Optimizer** using **Ceres Solver**.10 Ceres is selected due to its large user community, robust documentation, excellent support for robust loss functions 10, and proven scalability for large-scale nonlinear least-squares problems.42 + +### **5.2 Mechanism for Automatic Outlier Rejection (AC-3, AC-5)** + +The system must "correctly continue the work even in the presence of up to 350 meters of an outlier" (AC-3). A standard least-squares optimizer would be catastrophically corrupted by this event, as it would try to *average* this 350m error, pulling the *entire* 300km trajectory out of alignment. + +A modern optimizer does not need to use brittle, hand-coded if-then logic to reject outliers. It can *mathematically* and *automatically* down-weight them using **Robust Loss Functions (Kernels)**.41 + +The mechanism is as follows: + +1. The Ceres Back-End 10 maintains a graph of nodes (poses) and edges (constraints, or measurements). +2. A 350m outlier (AC-3) will create an edge with a *massive* error (residual). +3. A standard (quadratic) loss function $cost(error) = error^2$ would create a *catastrophic* cost, forcing the optimizer to ruin the entire graph to accommodate it. +4. Instead, the system will wrap its cost functions in a **Robust Loss Function**, such as **CauchyLoss** or **HuberLoss**.10 +5. A robust loss function behaves quadratically for small errors (which it tries hard to fix) but becomes *sub-linear* for large errors. When it "sees" the 350m error, it mathematically *down-weights its influence*.43 +6. 
The optimizer effectively *acknowledges* the 350m error but *refuses* to pull the entire graph to fix this one "insane" measurement. It automatically, and gracefully, treats the outlier as a "lost cause" and optimizes the 99.9% of "sane" measurements. This is the modern, robust solution to AC-3 and AC-5. + +## **6.0 High-Resolution (6.2K) and Performance Optimization** + +The system must simultaneously handle massive 6252x4168 (26-Megapixel) images and run on a modest RTX 2060 GPU [User Query] with a \<5s time limit (AC-7). These are opposing constraints. + +### **6.1 The Multi-Scale Patch-Based Processing Pipeline** + +Running *any* deep learning model (SuperPoint, LightGlue) on a full 6.2K image will be impossibly slow and will *immediately* cause a CUDA Out-of-Memory (OOM) error on a 6GB RTX 2060.45 + +The solution is not to process the full 6.2K image in real-time. Instead, a **multi-scale, patch-based pipeline** is required, where different components use the resolution best suited to their task.46 + +1. **For SA-VO (Real-time, \<5s):** The SA-VO front-end is concerned with *motion*, not fine-grained detail. The 6.2K Image_N_HR is *immediately* downscaled to a manageable 1536x1024 (Image_N_LR). The entire SA-VO (SuperPoint + LightGlue) pipeline runs *only* on this low-resolution, fast-to-process image. This is how the \<5s (AC-7) budget is met. +2. **For CVGL (High-Accuracy, Async):** The CVGL module, which runs asynchronously, is where the 6.2K detail is *selectively* used to meet the 20m (AC-2) accuracy target. It uses a "coarse-to-fine" 48 approach: + * **Step A (Coarse):** The Siamese CNN 30 runs on the *downscaled* 1536px Image_N_LR to get a coarse [Lat, Lon] guess. + * **Step B (Fine):** The system uses this coarse guess to fetch the corresponding *high-resolution* satellite tile. + * **Step C (Patching):** The system runs the SuperPoint detector on the *full 6.2K* Image_N_HR to find the Top 100 *most confident* feature keypoints. 
It then extracts 100 small (e.g., 256x256) *patches* from the full-resolution image, centered on these keypoints.49 + * **Step D (Matching):** The system then matches *these small, full-resolution patches* against the high-res satellite tile. + +This hybrid method provides the best of both worlds: the fine-grained matching accuracy 50 of the 6.2K image, but without the catastrophic OOM errors or performance penalties.45 + +### **6.2 Real-Time Deployment with TensorRT** + +PyTorch is a research and training framework. Its default inference speed, even on an RTX 2060, is often insufficient to meet a \<5s production requirement.23 + +For the final production system, the key neural networks (SuperPoint, LightGlue, Siamese CNN) *must* be converted from their PyTorch-native format into a highly-optimized **NVIDIA TensorRT engine**. + +* **Benefits:** TensorRT is an inference optimizer that applies graph optimizations, layer fusion, and precision reduction (e.g., to FP16).52 This can achieve a 2x-4x (or more) speedup over native PyTorch.28 +* **Deployment:** The resulting TensorRT engine can be deployed via a C++ API 25, which is far more suitable for a robust, high-performance production system. + +This conversion is a *mandatory* deployment step. It is what makes a 2-second inference (well within the 5-second AC-7 budget) *achievable* on the specified RTX 2060 hardware. + +## **7.0 System Robustness: Failure Mode and Logic Escalation** + +The system's logic is designed as a multi-stage escalation process to handle the specific failure modes in the acceptance criteria (AC-3, AC-4, AC-6), ensuring the >95% registration rate (AC-9). + +### **Stage 1: Normal Operation (Tracking)** + +* **Condition:** SA-VO(N-1 -> N) succeeds. The LightGlue match is high-confidence, and the computed scale $s$ is reasonable. +* **Logic:** + 1. The Relative_Metric_Pose is sent to the Back-End. + 2. The Pose_N_Est is calculated and sent to the user (\<5s). + 3. 
The CVGL module is queued to run asynchronously to provide a Pose_N_Refined at a later time. + +### **Stage 2: Transient SA-VO Failure (AC-3 Outlier Handling)** + +* **Condition:** SA-VO(N-1 -> N) fails. This could be a 350m outlier (AC-3), a severely blurred image, or an image with no features (e.g., over a cloud). The LightGlue match fails, or the computed scale $s$ is nonsensical. +* **Logic (Frame Skipping):** + 1. The system *buffers* Image_N and marks it as "tentatively lost." + 2. When Image_N+1 arrives, the SA-VO front-end attempts to "bridge the gap" by matching SA-VO(N-1 -> N+1). + 3. **If successful:** A Relative_Metric_Pose for N+1 is found. Image_N is officially marked as a rejected outlier (AC-5). The system "correctly continues the work" (AC-3 met). + 4. **If fails:** The system repeats for SA-VO(N-1 -> N+2). + 5. If this "bridging" fails for 3 consecutive frames, the system concludes it is not a transient outlier but a persistent tracking loss, and escalates to Stage 3. + +### **Stage 3: Persistent Tracking Loss (AC-4 Sharp Turn Handling)** + +* **Condition:** The "frame-skipping" in Stage 2 fails. This is the "sharp turn" scenario [AC-4] where there is \<5% overlap between Image_N-1 and Image_N+k. +* **Logic (Multi-Map "Chunking"):** + 1. The Back-End declares a "Tracking Lost" state at Image_N and creates a *new, independent map chunk* ("Chunk 2"). + 2. The SA-VO Front-End is re-initialized at Image_N and begins populating this new chunk, tracking SA-VO(N -> N+1), SA-VO(N+1 -> N+2), etc. + 3. Because the front-end is **Scale-Aware**, this new "Chunk 2" is *already in metric scale*. It is a "floating island" of *known size and shape*; it just is not anchored to the global GPS map. +* **Resolution (Asynchronous Relocalization):** + 1. The **CVGL Module** is now tasked, high-priority, to find a *single* Absolute_GPS_Pose for *any* frame in this new "Chunk 2". + 2. 
Once the CVGL module (which is robust to oblique views, per Section 4.2) finds one (e.g., for Image_N+20), the Back-End has all the information it needs. + 3. **Merging:** The Back-End calculates the simple 6-DoF transformation (3D translation and rotation, scale=1) to align all of "Chunk 2" and merge it with "Chunk 1". This robustly handles the "correctly continue the work" criterion (AC-4). + +### **Stage 4: Catastrophic Failure (AC-6 User Intervention)** + +* **Condition:** The system has entered Stage 3 and is building "Chunk 2," but the **CVGL Module** has *also* failed for a prolonged period (e.g., 20% of the route, or 50+ consecutive frames). This is the "worst-case" scenario (e.g., heavy clouds *and* over a large, featureless lake). The system is "absolutely incapable" [User Query]. +* **Logic:** + 1. The system has a metric-scale "Chunk 2" but zero idea where it is in the world. + 2. The Back-End triggers the AC-6 flag. +* **Resolution (User Input):** + 1. The UI prompts the user: "Tracking lost. Please provide a coarse location for the *current* image." + 2. The UI displays the last known good image (from Chunk 1) and the new, "lost" image (e.g., Image_N+50). + 3. The user clicks *one point* on the satellite map. + 4. This user-provided [Lat, Lon] is *not* taken as ground truth. It is fed to the CVGL module as a *strong prior*, drastically narrowing its search area from "all of Ukraine" to "a 10km-radius circle." + 5. This allows the CVGL module to re-acquire a lock, which triggers the Stage 3 merge, and the system continues. + +## **8.0 Output Generation and Validation Strategy** + +This section details how the final user-facing outputs are generated and how the system's compliance with all 10 acceptance criteria will be validated. + +### **8.1 Generating Object-Level GPS (from Pixel Coordinate)** + +This meets the requirement to find the "coordinates of the center of any object in these photos" [User Query]. 
The system provides this via a **Ray-Plane Intersection** method. + +* **Inputs:** + 1. The user clicks pixel coordinate $(u,v)$ on Image_N. + 2. The system retrieves the refined, global 6-DoF pose $(R, T)$ for Image_N from the Back-End. + 3. The system uses the known camera intrinsic matrix $K$. + 4. The system uses the known *global ground-plane equation* (e.g., $Z=150m$, based on the predefined altitude and start coordinate). +* **Method:** + 1. **Un-project Pixel:** The 2D pixel $(u,v)$ is un-projected into a 3D ray *direction* vector $d_{cam}$ in the camera's local coordinate system: $d_{cam} = K^{-1} \cdot [u, v, 1]^T$. + 2. **Transform Ray:** This ray direction is transformed into the *global* coordinate system using the pose's rotation matrix: $d_{global} = R \cdot d_{cam}$. + 3. **Define Ray:** A 3D ray is now defined, originating at the camera's global position $T$ (from the pose) and traveling in the direction $d_{global}$. + 4. **Intersect:** The system solves the 3D line-plane intersection equation for this ray and the known global ground plane (e.g., find the intersection with $Z=150m$). + 5. **Result:** The 3D intersection point $(X, Y, Z)$ is the *metric* world coordinate of the object on the ground. + 6. **Convert:** This $(X, Y, Z)$ world coordinate is converted to a [Latitude, Longitude, Altitude] GPS coordinate. This process is immediate and can be performed for any pixel on any geolocated image. + +### **8.2 Rigorous Validation Methodology** + +A comprehensive test plan is required to validate all 10 acceptance criteria. The foundation of this is the creation of a **Ground-Truth Test Harness**. + +* **Test Harness:** + 1. **Ground-Truth Data:** Several test flights will be conducted in the operational area using a UAV equipped with a high-precision RTK/PPK GPS. This provides the "real GPS" (ground truth) for every image. + 2. 
**Test Datasets:** Multiple test datasets will be curated from this ground-truth data: + * Test_Baseline_1000: A standard 1000-image flight. + * Test_Outlier_350m (AC-3): Test_Baseline_1000 with a single image from 350m away manually inserted at frame 30. + * Test_Sharp_Turn_5pct (AC-4): A sequence where frames 20-24 are manually deleted, simulating a \<5% overlap jump. + * Test_Catastrophic_Fail_20pct (AC-6): A sequence with 200 (20%) consecutive "bad" frames (e.g., pure sky, lens cap) inserted. + * Test_Full_3000: A full 3000-image sequence to test scalability and memory usage. +* **Test Cases:** + * **Test_Accuracy (AC-1, AC-2, AC-5, AC-9):** + * Run Test_Baseline_1000. A test script will compare the system's *final refined GPS output* for each image against its *ground-truth GPS*. + * ASSERT (count(errors \< 50m) / 1000) \geq 0.80 (AC-1) + * ASSERT (count(errors \< 20m) / 1000) \geq 0.60 (AC-2) + * ASSERT (count(un-localized_images) / 1000) \< 0.10 (AC-5) + * ASSERT (count(localized_images) / 1000) > 0.95 (AC-9) + * **Test_MRE (AC-10):** + * ASSERT (BackEnd.final_MRE) \< 1.0 (AC-10) + * **Test_Performance (AC-7, AC-8):** + * Run Test_Full_3000 on the minimum-spec RTX 2060. + * Log timestamps for "Image In" -> "Initial Pose Out". ASSERT average_time \< 5.0s (AC-7). + * Log the output stream. ASSERT that >80% of images receive *two* poses: an "Initial" and a "Refined" (AC-8). + * **Test_Robustness (AC-3, AC-4, AC-6):** + * Run Test_Outlier_350m. ASSERT the system correctly continues and the final trajectory error for Image_31 is \< 50m (AC-3). + * Run Test_Sharp_Turn_5pct. ASSERT the system logs "Tracking Lost" and "Maps Merged," and the final trajectory is complete and accurate (AC-4). + * Run Test_Catastrophic_Fail_20pct. ASSERT the system correctly triggers the "ask for user input" event (AC-6). 
\ No newline at end of file diff --git a/docs/01_solution/03_solution_draft.md b/docs/01_solution/03_solution_draft.md new file mode 100644 index 0000000..d691643 --- /dev/null +++ b/docs/01_solution/03_solution_draft.md @@ -0,0 +1,259 @@ +**GEORTEX-R: A Geospatial-Temporal Robust Extraction System for IMU-Denied UAV Geolocalization** + +## **1.0 GEORTEX-R: System Architecture and Data Flow** + +The GEORTEX-R system is an asynchronous, three-component software solution designed for deployment on an NVIDIA RTX 2060+ GPU. It is architected from the ground up to handle the specific, demonstrated challenges of IMU-denied localization in *non-planar terrain* (as seen in Images 1-9) and *temporally-divergent* (outdated) reference maps (AC-5). + +The system's core design principle is the *decoupling of unscaled relative motion from global metric scale*. The front-end estimates high-frequency, robust, but *unscaled* motion. The back-end asynchronously provides sparse, high-confidence *metric* and *geospatial* anchors. The central hub fuses these two data streams into a single, globally-optimized, metric-scale trajectory. + +### **1.1 Inputs** + +1. **Image Sequence:** Consecutively named images (FullHD to 6252x4168). +2. **Start Coordinate (Image 0):** A single, absolute GPS coordinate (Latitude, Longitude) for the first image. +3. **Camera Intrinsics ($K$):** A pre-calibrated camera intrinsic matrix. +4. **Altitude Prior ($H_{prior}$):** The *approximate* predefined metric altitude (e.g., 900 meters). This is used as a *prior* (a hint) for optimization, *not* a hard constraint. +5. **Geospatial API Access:** Credentials for an on-demand satellite and DEM provider (e.g., Copernicus, EOSDA). + +### **1.2 Streaming Outputs** + +1. **Initial Pose ($Pose\\_N\\_Est$):** An *unscaled* pose estimate. This is sent immediately to the UI for real-time visualization of the UAV's *path shape* (AC-7, AC-8). +2. 
**Refined Pose ($Pose\\_N\\_Refined$) [Asynchronous]:** A globally-optimized, *metric-scale* 7-DoF pose (X, Y, Z, Qx, Qy, Qz, Qw) and its corresponding [Lat, Lon, Alt] coordinate. This is sent to the user whenever the Trajectory Optimization Hub re-converges, updating all past poses (AC-1, AC-2, AC-8). + +### **1.3 Component Interaction and Data Flow** + +The system is architected as three parallel-processing components: + +1. **Image Ingestion & Pre-processing:** This module receives the new Image_N (up to 6.2K). It creates two copies: + * Image_N_LR (Low-Resolution, e.g., 1536x1024): Dispatched *immediately* to the V-SLAM Front-End for real-time processing. + * Image_N_HR (High-Resolution, 6.2K): Stored for asynchronous use by the Geospatial Anchoring Back-End (GAB). +2. **V-SLAM Front-End (High-Frequency Thread):** This component's sole task is high-speed, *unscaled* relative pose estimation. It tracks Image_N_LR against a *local map of keyframes*. It performs local bundle adjustment to minimize drift 12 and maintains a co-visibility graph of all keyframes. It sends Relative_Unscaled_Pose estimates to the Trajectory Optimization Hub (TOH). +3. **Geospatial Anchoring Back-End (GAB) (Low-Frequency, Asynchronous Thread):** This is the system's "anchor." When triggered by the TOH, it fetches *on-demand* geospatial data (satellite imagery and DEMs) from an external API.3 It then performs a robust *hybrid semantic-visual* search 5 to find an *absolute, metric, global pose* for a given keyframe, robust to outdated maps (AC-5) 5 and oblique views (AC-4).14 This Absolute_Metric_Anchor is sent to the TOH. +4. **Trajectory Optimization Hub (TOH) (Central Hub):** This component manages the complete flight trajectory as a **Sim(3) pose graph** (7-DoF). 
It continuously fuses two distinct data streams: + * **On receiving Relative_Unscaled_Pose (T \< 5s):** It appends this pose to the graph, calculates the Pose_N_Est, and sends this *unscaled* initial result to the user (AC-7, AC-8 met). + * **On receiving Absolute_Metric_Anchor (T > 5s):** This is the critical event. It adds this as a high-confidence *global metric constraint*. This anchor creates "tension" in the graph, which the optimizer (Ceres Solver 15) resolves by finding the *single global scale factor* that best fits all V-SLAM and CVGL measurements. It then triggers a full graph re-optimization, "stretching" the entire trajectory to the correct metric scale, and sends the new Pose_N_Refined stream to the user for all affected poses (AC-1, AC-2, AC-8 refinement met). + +## **2.0 Core Component: The High-Frequency V-SLAM Front-End** + +This component's sole task is to robustly and accurately compute the *unscaled* 6-DoF relative motion of the UAV and build a geometrically-consistent map of keyframes. It is explicitly designed to be more robust to drift than simple frame-to-frame odometry. + +### **2.1 Rationale: Keyframe-Based Monocular SLAM** + +The choice of a keyframe-based V-SLAM front-end over a frame-to-frame VO is deliberate and critical for system robustness. + +* **Drift Mitigation:** Frame-to-frame VO is "prone to drift accumulation due to errors introduced by each frame-to-frame motion estimation".13 A single poor match permanently corrupts all future poses. +* **Robustness:** A keyframe-based system tracks new images against a *local map* of *multiple* previous keyframes, not just Image_N-1. This provides resilience to transient failures (e.g., motion blur, occlusion). +* **Optimization:** This architecture enables "local bundle adjustment" 12, a process where a sliding window of recent keyframes is continuously re-optimized, actively minimizing error and drift *before* it can accumulate. 
+* **Relocalization:** This architecture possesses *innate relocalization capabilities* (see Section 6.3), which is the correct, robust solution to the "sharp turn" (AC-4) requirement. + +### **2.2 Feature Matching Sub-System** + +The success of the V-SLAM front-end depends entirely on high-quality feature matches, especially in the sparse, low-texture agricultural terrain seen in the provided images (e.g., Image 6, Image 7). The system requires a matcher that is robust (for sparse textures 17) and extremely fast (for AC-7). + +The selected approach is **SuperPoint + LightGlue**. + +* **SuperPoint:** A SOTA (State-of-the-Art) feature detector proven to find robust, repeatable keypoints in challenging, low-texture conditions 17 +* **LightGlue:** A highly optimized GNN-based matcher that is the successor to SuperGlue 19 + +The key advantage of selecting LightGlue 19 over SuperGlue 20 is its *adaptive nature*. The query states sharp turns (AC-4) are "rather an exception." This implies \~95% of image pairs are "easy" (high-overlap, straight flight) and 5% are "hard" (low-overlap, turns). SuperGlue uses a fixed-depth GNN, spending the *same* large amount of compute on an "easy" pair as a "hard" one. LightGlue is *adaptive*.19 For an "easy" pair, it can exit its GNN early, returning a high-confidence match in a fraction of the time. This saves *enormous* computational budget on the 95% of "easy" frames, ensuring the system *always* meets the \<5s budget (AC-7) and reserving that compute for the GAB. + +#### **Table 1: Analysis of State-of-the-Art Feature Matchers (For V-SLAM Front-End)** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **SuperPoint + SuperGlue** 20 | - SOTA robustness in low-texture, high-blur conditions. - GNN reasons about 3D scene context. - Proven in real-time SLAM systems. | - Computationally heavy (fixed-depth GNN). 
- Slower than LightGlue.19 | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT.21 | **Good.** A solid, baseline choice. Meets robustness needs but will heavily tax the \<5s time budget (AC-7). | +| **SuperPoint + LightGlue** 17 | - **Adaptive Depth:** Faster on "easy" pairs, more accurate on "hard" pairs.19 - **Faster & Lighter:** Outperforms SuperGlue on speed and accuracy. - SOTA "in practice" choice for large-scale matching.17 | - Newer, but rapidly being adopted and proven.21 | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT.22 | **Excellent (Selected).** The adaptive nature is *perfect* for this problem. It saves compute on the 95% of easy (straight) frames, maximizing our ability to meet AC-7. | + +## **3.0 Core Component: The Geospatial Anchoring Back-End (GAB)** + +This component is the system's "anchor to reality." It runs asynchronously to provide the *absolute, metric-scale* constraints needed to solve the trajectory. It is an *on-demand* system that solves three distinct "domain gaps": the hardware/scale gap, the temporal gap, and the viewpoint gap. + +### **3.1 On-Demand Geospatial Data Retrieval** + +A "pre-computed database" for all of Eastern Ukraine is operationally unfeasible on laptop-grade hardware.1 This design is replaced by an on-demand, API-driven workflow. + +* **Mechanism:** When the TOH requests a global anchor, the GAB receives a *coarse* [Lat, Lon] estimate. The GAB then performs API calls to a geospatial data provider (e.g., EOSDA 3, Copernicus 8). +* **Dual-Retrieval:** The API query requests *two* distinct products for the specified Area of Interest (AOI): + 1. **Visual Tile:** A high-resolution (e.g., 30-50cm) satellite ortho-image.26 + 2. **Terrain Tile:** The corresponding **Digital Elevation Model (DEM)**, such as the Copernicus GLO-30 (30m resolution) or SRTM (30m).7 + +This "Dual-Retrieval" mechanism is the central, enabling synergy of the new architecture. 
The **Visual Tile** is used by the CVGL (Section 3.2) to find the *geospatial pose*. The **DEM Tile** is used by the *output module* (Section 7.1) to perform high-accuracy **Ray-DEM Intersection**, solving the final output accuracy problem. + +### **3.2 Hybrid Semantic-Visual Localization** + +The "temporal gap" (evidenced by burn scars in Images 1-9) and "outdated maps" (AC-5) make a purely visual CVGL system unreliable.5 The GAB solves this using a robust, two-stage *hybrid* matching pipeline. + +1. **Stage 1: Coarse Visual Retrieval (Siamese CNN).** A lightweight Siamese CNN 14 is used to find the *approximate* location of the Image_N_LR *within* the large, newly-fetched satellite tile. This acts as a "candidate generator." +2. **Stage 2: Fine-Grained Semantic-Visual Fusion.** For the top candidates, the GAB performs a *dual-channel alignment*. + * **Visual Channel (Unreliable):** It runs SuperPoint+LightGlue on high-resolution *patches* (from Image_N_HR) against the satellite tile. This match may be *weak* due to temporal gaps.5 + * **Semantic Channel (Reliable):** It extracts *temporally-invariant* semantic features (e.g., road-vectors, field-boundaries, tree-cluster-polygons, lake shorelines) from *both* the UAV image (using a segmentation model) and the satellite/OpenStreetMap data.5 + * **Fusion:** A RANSAC-based optimizer finds the 6-DoF pose that *best aligns* this *hybrid* set of features. + +This hybrid approach is robust to the exact failure mode seen in the images. When matching Image 3 (burn scars), the *visual* LightGlue match will be poor. However, the *semantic* features (the dirt road, the tree line) are *unchanged*. The optimizer will find a high-confidence pose by *trusting the semantic alignment* over the poor visual alignment, thereby succeeding despite the "outdated map" (AC-5). + +### **3.3 Solution to Viewpoint Gap: Synthetic Oblique View Training** + +This component is critical for handling "sharp turns" (AC-4). 
The camera *will* be oblique, not nadir, during turns. + +* **Problem:** The GAB's Stage 1 Siamese CNN 14 will be matching an *oblique* UAV view to a *nadir* satellite tile. This "viewpoint gap" will cause a match failure.14 +* **Mechanism (Synthetic Data Generation):** The network must be trained for *viewpoint invariance*.28 + 1. Using the on-demand DEMs (fetched in 3.1) and satellite tiles, the system can *synthetically render* the satellite imagery from *any* roll, pitch, and altitude. + 2. The Siamese network is trained on (Nadir_Tile, Synthetic_Oblique_Tile) pairs.14 +* **Result:** This process teaches the network to match the *underlying ground features*, not the *perspective distortion*. It ensures the GAB can relocalize the UAV *precisely* when it is needed most: during a sharp, banking turn (AC-4) when VO tracking has been lost. + +## **4.0 Core Component: The Trajectory Optimization Hub (TOH)** + +This component is the system's central "brain." It runs continuously, fusing all measurements (high-frequency/unscaled V-SLAM, low-frequency/metric-scale GAB anchors) into a single, globally consistent trajectory. + +### **4.1 Incremental Sim(3) Pose-Graph Optimization** + +The "planar ground" SA-VO (Finding 1) is removed. This component is its replacement. The system must *discover* the global scale, not *assume* it. + +* **Selected Strategy:** An incremental pose-graph optimizer using **Ceres Solver**.15 +* **The Sim(3) Insight:** The V-SLAM front-end produces *unscaled* 6-DoF ($SE(3)$) relative poses. The GAB produces *metric-scale* 6-DoF ($SE(3)$) *absolute* poses. These cannot be directly combined. The graph must be optimized in **Sim(3) (7-DoF)**, which adds a *single global scale factor $s$* as an optimizable variable. +* **Mechanism (Ceres Solver):** + 1. **Nodes:** Each keyframe pose (7-DoF, parameterized as $X, Y, Z, Qx, Qy, Qz, Qw, s$ with a unit quaternion). + 2. **Edge 1 (V-SLAM):** A relative pose constraint between Keyframe_i and Keyframe_j. The error is computed in Sim(3). + 3. 
**Edge 2 (GAB):** An *absolute* pose constraint on Keyframe_k. This constraint *fixes* Keyframe_k's pose to the *metric* GPS coordinate and *fixes its scale $s$ to 1.0*. +* **Bootstrapping Scale:** The TOH graph "bootstraps" the scale.32 The GAB's $s=1.0$ anchor creates "tension" in the graph. The Ceres optimizer 15 resolves this tension by finding the *one* global scale $s$ for all V-SLAM nodes that minimizes the total error, effectively "stretching" the entire unscaled trajectory to fit the metric anchors. This is robust to *any* terrain.34 + +#### **Table 2: Analysis of Trajectory Optimization Strategies** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **Incremental SLAM (Pose-Graph Optimization)** (Ceres Solver 15, g2o 35, GTSAM) | - **Real-time / Online:** Provides immediate pose estimates (AC-7). - **Supports Refinement:** Explicitly designed to refine past poses when new "loop closure" (GAB) data arrives (AC-8).13 - **Robust:** Can handle outliers via robust kernels.15 | - Initial estimate is *unscaled* until a GAB anchor arrives. - Can drift *if* not anchored (though V-SLAM minimizes this). | - A graph optimization library (Ceres). - A robust cost function. | **Excellent (Selected).** This is the *only* architecture that satisfies all user requirements for real-time streaming and asynchronous refinement. | +| **Batch Structure from Motion (Global Bundle Adjustment)** (COLMAP, Agisoft Metashape) | - **Globally Optimal Accuracy:** Produces the most accurate possible 3D reconstruction and trajectory. | - **Offline:** Cannot run in real-time or stream results. - High computational cost (minutes to hours). - Fails AC-7 and AC-8 completely. | - All images must be available before processing starts. - High RAM and CPU. 
| **Good (as an *Optional* Post-Processing Step).** Unsuitable as the primary online system, but could be offered as an optional, high-accuracy "Finalize Trajectory" batch process. | + +### **4.2 Automatic Outlier Rejection (AC-3, AC-5)** + +The system must handle 350m outliers (AC-3) and \<10% bad GAB matches (AC-5). + +* **Mechanism (Robust Loss Functions):** A standard least-squares optimizer (like Ceres 15) would be catastrophically corrupted by a 350m error. The solution is to wrap *all* constraints in a **Robust Loss Function (e.g., HuberLoss, CauchyLoss)**.15 +* **Result:** A robust loss function mathematically *down-weights* the influence of constraints with large errors. When it "sees" the 350m error (AC-3), it effectively acknowledges the measurement but *refuses* to pull the entire 3000-image trajectory to fit this one "insane" data point. It automatically and gracefully *ignores* the outlier, optimizing the 99.9% of "sane" measurements. This is the modern, robust solution to AC-3 and AC-5. + +## **5.0 High-Performance Compute & Deployment** + +The system must run on an RTX 2060 (AC-7) and process 6.2K images. These are opposing constraints. + +### **5.1 Multi-Scale, Patch-Based Processing Pipeline** + +Running deep learning models (SuperPoint, LightGlue) on a full 6.2K (26-Megapixel) image will cause a CUDA Out-of-Memory (OOM) error and be impossibly slow. + +* **Mechanism (Coarse-to-Fine):** + 1. **For V-SLAM (Real-time, \<5s):** The V-SLAM front-end (Section 2.0) runs *only* on the Image_N_LR (e.g., 1536x1024) copy. This is fast enough to meet the AC-7 budget. + 2. **For GAB (High-Accuracy, Async):** The GAB (Section 3.0) uses the full-resolution Image_N_HR *selectively* to meet the 20m accuracy (AC-2). + * It first runs its coarse Siamese CNN 27 on the Image_N_LR. + * It then runs the SuperPoint detector on the *full 6.2K* image to find the *most confident* feature keypoints. 
+ * It then extracts small, 256x256 *patches* from the *full-resolution* image, centered on these keypoints. + * It matches *these small, full-resolution patches* against the high-res satellite tile. +* **Result:** This hybrid method provides the fine-grained matching accuracy of the 6.2K image (needed for AC-2) without the catastrophic OOM errors or performance penalties. + +### **5.2 Mandatory Deployment: NVIDIA TensorRT Acceleration** + +PyTorch is a research framework. For production, its inference speed is insufficient. + +* **Requirement:** The key neural networks (SuperPoint, LightGlue, Siamese CNN) *must* be converted from PyTorch into a highly-optimized **NVIDIA TensorRT engine**. +* **Research Validation:** 23 demonstrates this process for LightGlue, achieving "2x-4x speed gains over compiled PyTorch." 22 and 21 provide open-source repositories for SuperPoint+LightGlue conversion to ONNX and TensorRT. +* **Result:** This is not an "optional" optimization. It is a *mandatory* deployment step. This conversion (which applies layer fusion, graph optimization, and FP16 precision) is what makes achieving the \<5s (AC-7) performance *possible* on the specified RTX 2060 hardware.36 + +## **6.0 System Robustness: Failure Mode Escalation Logic** + +This logic defines the system's behavior during real-world failures, ensuring it meets criteria AC-3, AC-4, AC-6, and AC-9. + +### **6.1 Stage 1: Normal Operation (Tracking)** + +* **Condition:** V-SLAM front-end (Section 2.0) is healthy. +* **Logic:** + 1. V-SLAM successfully tracks Image_N_LR against its local keyframe map. + 2. A new Relative_Unscaled_Pose is sent to the TOH. + 3. TOH sends Pose_N_Est (unscaled) to the user (\<5s). + 4. If Image_N is selected as a new keyframe, the GAB (Section 3.0) is *queued* to find an Absolute_Metric_Anchor for it, which will trigger a Pose_N_Refined update later. 
+ +### **6.2 Stage 2: Transient VO Failure (Outlier Rejection)** + +* **Condition:** Image_N is unusable (e.g., severe blur, sun-glare, 350m outlier per AC-3). +* **Logic (Frame Skipping):** + 1. V-SLAM front-end fails to track Image_N_LR against the local map. + 2. The system *discards* Image_N (marking it as a rejected outlier, AC-5). + 3. When Image_N+1 arrives, the V-SLAM front-end attempts to track it against the *same* local keyframe map (from Image_N-1). + 4. **If successful:** Tracking resumes. Image_N is officially an outlier. The system "correctly continues the work" (AC-3 met). + 5. **If fails:** The system repeats for Image_N+2, N+3. If this fails for \~5 consecutive frames, it escalates to Stage 3. + +### **6.3 Stage 3: Persistent VO Failure (Relocalization)** + +* **Condition:** Tracking is lost for multiple frames. This is the "sharp turn" (AC-4) or "low overlap" (AC-4) scenario. +* **Logic (Keyframe-Based Relocalization):** + 1. The V-SLAM front-end declares "Tracking Lost." + 2. **Critically:** It does *not* create a "new map chunk." + 3. Instead, it enters **Relocalization Mode**. For every new Image_N+k, it extracts features (SuperPoint) and queries the *entire* existing database of past keyframes for a match. +* **Resolution:** The UAV completes its sharp turn. Image_N+5 now has high overlap with Image_N-10 (from *before* the turn). + 1. The relocalization query finds a strong match. + 2. The V-SLAM front-end computes the 6-DoF pose of Image_N+5 relative to the *existing map*. + 3. Tracking is *resumed* seamlessly. The system "correctly continues the work" (AC-4 met). This is vastly more robust than the previous "map-merging" logic. + +### **6.4 Stage 4: Catastrophic Failure (User Intervention)** + +* **Condition:** The system is in Stage 3 (Lost), but *also*, the **GAB (Section 3.0) has failed** to find *any* global anchors for a prolonged period (e.g., 20% of the route). 
This is the "absolutely incapable" scenario (AC-6), (e.g., heavy fog *and* over a featureless ocean). +* **Logic:** + 1. The system has an *unscaled* trajectory, and *zero* idea where it is in the world. + 2. The TOH triggers the AC-6 flag. +* **Resolution (User-Aided Prior):** + 1. The UI prompts the user: "Tracking lost. Please provide a coarse location for the *current* image." + 2. The user clicks *one point* on a map. + 3. This [Lat, Lon] is *not* taken as ground truth. It is fed to the **GAB (Section 3.1)** as a *strong prior* for its on-demand API query. + 4. This narrows the GAB's search area from "all of Ukraine" to "a 5km radius." This *guarantees* the GAB's Dual-Retrieval (Section 3.1) will fetch the *correct* satellite and DEM tiles, allowing the Hybrid Matcher (Section 3.2) to find a high-confidence Absolute_Metric_Anchor, which in turn re-scales (Section 4.1) and relocalizes the entire trajectory. + +## **7.0 Output Generation and Validation Strategy** + +This section details how the final user-facing outputs are generated, specifically solving the "planar ground" output flaw, and how the system's compliance with all 10 ACs will be validated. + +### **7.1 High-Accuracy Object Geolocalization via Ray-DEM Intersection** + +The "Ray-Plane Intersection" method is inaccurate for non-planar terrain 37 and is replaced with a high-accuracy ray-tracing method. This is the correct method for geolocating an object on the *non-planar* terrain visible in Images 1-9. + +* **Inputs:** + 1. User clicks pixel coordinate $(u,v)$ on Image_N. + 2. System retrieves the *final, refined, metric* 7-DoF pose $P = (R, T, s)$ for Image_N from the TOH. + 3. The system uses the known camera intrinsic matrix $K$. + 4. System retrieves the specific **30m DEM tile** 8 that was fetched by the GAB (Section 3.1) for this region of the map. This DEM is a 3D terrain mesh. +* **Algorithm (Ray-DEM Intersection):** + 1. 
**Un-project Pixel:** The 2D pixel $(u,v)$ is un-projected into a 3D ray *direction* vector $d_{cam}$ in the camera's local coordinate system: $d_{cam} = K^{-1} \cdot [u, v, 1]^T$. + 2. **Transform Ray:** This ray direction $d_{cam}$ and origin (0,0,0) are transformed into the *global, metric* coordinate system using the pose $P$. This yields a ray originating at $T$ and traveling in direction $R \cdot d_{cam}$. + 3. **Intersect:** The system performs a numerical *ray-mesh intersection* 39 to find the 3D point $(X, Y, Z)$ where this global ray *intersects the 3D terrain mesh* of the DEM. + 4. **Result:** This 3D intersection point $(X, Y, Z)$ is the *metric* world coordinate of the object *on the actual terrain*. + 5. **Convert:** This $(X, Y, Z)$ world coordinate is converted to a [Latitude, Longitude, Altitude] GPS coordinate. + +This method correctly accounts for terrain. A pixel aimed at the top of a hill will intersect the DEM at a high Z-value. A pixel aimed at the ravine (Image 1) will intersect at a low Z-value. This is the *only* method that can reliably meet the 20m accuracy (AC-2) for object localization. + +### **7.2 Rigorous Validation Methodology** + +A comprehensive test plan is required. The foundation is a **Ground-Truth Test Harness** using the provided coordinates.csv.42 + +* **Test Harness:** + 1. **Ground-Truth Data:** The file coordinates.csv 42 provides ground-truth [Lat, Lon] for 60 images (e.g., AD000001.jpg...AD000060.jpg). + 2. **Test Datasets:** + * Test_Baseline_60 42: The 60 images and their coordinates. + * Test_Outlier_350m (AC-3): Test_Baseline_60 with a single, unrelated image inserted at frame 30. + * Test_Sharp_Turn_5pct (AC-4): A sequence where frames 20-24 are manually deleted, simulating a \<5% overlap jump.
+* **Test Cases:** + * **Test_Accuracy (AC-1, AC-2, AC-5, AC-9):** + * **Run:** Execute GEORTEX-R on Test_Baseline_60, providing AD000001.jpg's coordinate (48.275292, 37.385220) as the Start Coordinate 42 + * **Script:** A validation script will compute the Haversine distance error between the *system's refined GPS output* for each image (2-60) and the *ground-truth GPS* from coordinates.csv. + * **ASSERT** (count(errors \< 50m) / 60) >= 0.80 **(AC-1 Met)** + * **ASSERT** (count(errors \< 20m) / 60) >= 0.60 **(AC-2 Met)** + * **ASSERT** (count(un-localized_images) / 60) \< 0.10 **(AC-5 Met)** + * **ASSERT** (count(localized_images) / 60) > 0.95 **(AC-9 Met)** + * **Test_MRE (AC-10):** + * **Run:** After Test_Baseline_60 completes. + * **ASSERT** TOH.final_Mean_Reprojection_Error \< 1.0 **(AC-10 Met)** + * **Test_Performance (AC-7, AC-8):** + * **Run:** Execute on a 1500-image sequence on the minimum-spec RTX 2060. + * **Log:** Log timestamps for "Image In" -> "Initial Pose Out". + * **ASSERT** average_time \< 5.0s **(AC-7 Met)** + * **Log:** Log the output stream. + * **ASSERT** >80% of images receive *two* poses: an "Initial" and a "Refined" **(AC-8 Met)** + * **Test_Robustness (AC-3, AC-4):** + * **Run:** Execute Test_Outlier_350m. + * **ASSERT** System logs "Stage 2: Discarding Outlier" and the final trajectory error for Image_31 is \< 50m **(AC-3 Met)**. + * **Run:** Execute Test_Sharp_Turn_5pct. + * **ASSERT** System logs "Stage 3: Tracking Lost" and "Relocalization Succeeded," and the final trajectory is complete and accurate **(AC-4 Met)**. + diff --git a/docs/01_solution/04_solution_draft.md b/docs/01_solution/04_solution_draft.md new file mode 100644 index 0000000..7fd9731 --- /dev/null +++ b/docs/01_solution/04_solution_draft.md @@ -0,0 +1,327 @@ +## **The ATLAS-GEOFUSE System Architecture** + +Multi-component architecture designed for high-performance, real-time geolocalization in IMU-denied, high-drift environments. 
Its architecture is explicitly designed around **pre-flight data caching** and **multi-map robustness**. + +### **2.1 Core Design Principles** + +1. **Pre-Flight Caching:** To meet the <5s (AC-7) real-time requirement, all network latency must be eliminated. The system mandates a "Pre-Flight" step (Section 3.0) where all geospatial data (satellite tiles, DEMs, vector data) for the Area of Interest (AOI) is downloaded from a viable open-source provider (e.g., Copernicus 6) and stored in a local database on the processing laptop. All real-time queries are made against this local cache. +2. **Decoupled Multi-Map SLAM:** The system separates *relative* motion from *absolute* scale. A Visual SLAM (V-SLAM) "Atlas" Front-End (Section 4.0) computes high-frequency, robust, but *unscaled* relative motion. A Local Geospatial Anchoring Back-End (GAB) (Section 5.0) provides sparse, high-confidence, *absolute metric* anchors by querying the local cache. A Trajectory Optimization Hub (TOH) (Section 6.0) fuses these two streams in a Sim(3) pose-graph to solve for the global 7-DoF trajectory (pose + scale). +3. **Multi-Map Robustness (Atlas):** To solve the "sharp turn" (AC-4) and "tracking loss" (AC-6) requirements, the V-SLAM front-end is based on an "Atlas" architecture.14 Tracking loss initiates a *new, independent map fragment*.13 The TOH is responsible for anchoring and merging *all* fragments geodetically 19 into a single, globally-consistent trajectory. + +### **2.2 Component Interaction and Data Flow** + +* **Component 1: Pre-Flight Caching Module (PCM) (Offline)** + * *Input:* User-defined Area of Interest (AOI) (e.g., a KML polygon). + * *Action:* Queries Copernicus 6 and OpenStreetMap APIs. Downloads and builds a local geospatial database (GeoPackage/SpatiaLite) containing satellite tiles, DEM tiles, and road/river vectors for the AOI. + * *Output:* A single, self-contained **Local Geo-Database file**. 
+* **Component 2: Image Ingestion & Pre-processing (Real-time)** + * *Input:* Image_N (up to 6.2K), Camera Intrinsics ($K$). + * *Action:* Creates two copies: + * **Image_N_LR** (Low-Resolution, e.g., 1536x1024): Dispatched *immediately* to the V-SLAM Front-End. + * **Image_N_HR** (High-Resolution, 6.2K): Stored for asynchronous use by the GAB. +* **Component 3: V-SLAM "Atlas" Front-End (High-Frequency Thread)** + * *Input:* Image_N_LR. + * *Action:* Tracks Image_N_LR against its *active map fragment*. Manages keyframes, local bundle adjustment 38, and the co-visibility graph. If tracking is lost (e.g., AC-4 sharp turn), it initializes a *new map fragment* 14 and continues tracking. + * *Output:* **Relative_Unscaled_Pose** and **Local_Point_Cloud** data, sent to the TOH. +* **Component 4: Local Geospatial Anchoring Back-End (GAB) (Low-Frequency, Asynchronous Thread)** + * *Input:* A keyframe (Image_N_HR) and its *unscaled* pose, triggered by the TOH. + * *Action:* Performs a visual-only, coarse-to-fine search 34 against the *Local Geo-Database*. + * *Output:* An **Absolute_Metric_Anchor** (a high-confidence [Lat, Lon, Alt] pose) for that keyframe, sent to the TOH. +* **Component 5: Trajectory Optimization Hub (TOH) (Central Hub Thread)** + * *Input:* (1) High-frequency Relative_Unscaled_Pose stream. (2) Low-frequency Absolute_Metric_Anchor stream. + * *Action:* Manages the complete flight trajectory as a **Sim(3) pose graph** 39 using Ceres Solver.19 Continuously fuses all data. + * *Output 1 (Real-time):* **Pose_N_Est** (unscaled) sent to UI (meets AC-7, AC-8). + * *Output 2 (Refined):* **Pose_N_Refined** (metric-scale, globally-optimized) sent to UI (meets AC-1, AC-2, AC-8). + +### **2.3 System Inputs** + +1. **Image Sequence:** Consecutively named images (FullHD to 6252x4168). +2. **Start Coordinate (Image 0):** A single, absolute GPS coordinate (Latitude, Longitude). +3. **Camera Intrinsics ($K$):** Pre-calibrated camera intrinsic matrix. +4. 
**Local Geo-Database File:** The single file generated by the Pre-Flight Caching Module (Section 3.0). + +### **2.4 Streaming Outputs (Meets AC-7, AC-8)** + +1. **Initial Pose ($Pose_N^{Est}$):** An *unscaled* pose estimate. This is sent immediately (<5s, AC-7) to the UI for real-time visualization of the UAV's *path shape*. +2. **Refined Pose ($Pose_N^{Refined}$) [Asynchronous]:** A globally-optimized, *metric-scale* 7-DoF pose (X, Y, Z, Qx, Qy, Qz, Qw) and its corresponding [Lat, Lon, Alt] coordinate. This is sent to the user whenever the TOH re-converges (e.g., after a new GAB anchor or map-merge), updating all past poses (AC-1, AC-2, AC-8 refinement met). + +## **3.0 Pre-Flight Component: The Geospatial Caching Module (PCM)** + +This component is a new, mandatory, pre-flight utility that solves the fatal flaws (Section 1.1, 1.2) of the GEORTEX-R design. It eliminates all real-time network latency (AC-7) and all ToS violations (AC-5), ensuring the project is both performant and legally viable. + +### **3.1 Defining the Area of Interest (AOI)** + +The system is designed for long-range flights. Given 3000 photos at 100m intervals, the maximum linear track is 300km. The user must provide a coarse "bounding box" or polygon (e.g., KML/GeoJSON format) of the intended flight area. The PCM will automatically add a generous buffer (e.g., 20km) to this AOI to account for navigational drift and ensure all necessary reference data is captured. + +### **3.2 Legal & Viable Data Sources (Copernicus & OpenStreetMap)** + +As established in 1.1, the system *must* use open-data providers. The PCM is architected to use the following: + +1. **Visual/Terrain Data (Primary):** The **Copernicus Data Space Ecosystem** 6 is the primary source. The PCM will use the Copernicus Processing and Catalogue APIs 6 to query, process, and download two key products for the buffered AOI: + * **Sentinel-2 Satellite Imagery:** High-resolution (10m) visual tiles. 
+ * **Copernicus GLO-30 DEM:** A 30m-resolution Digital Elevation Model.7 This DEM is *not* used for high-accuracy object localization (see 1.4), but as a coarse altitude *prior* for the TOH and for the critical dynamic-warping step (Section 5.3). +2. **Semantic Data (Secondary):** OpenStreetMap (OSM) data 40 for the AOI will be downloaded. This provides temporally-invariant vector data (roads, rivers, building footprints) which can be used as a secondary, optional verification layer for the GAB, especially in cases of extreme temporal divergence (e.g., new construction).42 + +### **3.3 Building the Local Geo-Database** + +The PCM utility will process all downloaded data into a single, efficient, compressed file. A modern GeoPackage or SpatiaLite database is the ideal format. This database will contain the satellite tiles, DEM tiles, and vector features, all indexed by a common spatial grid (e.g., UTM). + +This single file is then loaded by the main ATLAS-GEOFUSE application at runtime. The GAB's (Section 5.0) "API calls" are thus transformed from high-latency, unreliable HTTP requests 9 into high-speed, zero-latency local SQL queries, guaranteeing that data I/O is never the bottleneck for meeting the AC-7 performance requirement. + +## **4.0 Core Component: The Multi-Map V-SLAM "Atlas" Front-End** + +This component's sole task is to robustly and accurately compute the *unscaled* 6-DoF relative motion of the UAV and build a geometrically-consistent map of keyframes. It is explicitly designed to be more robust than simple frame-to-frame odometry and to handle catastrophic tracking loss (AC-4) gracefully. + +### **4.1 Rationale: ORB-SLAM3 "Atlas" Architecture** + +The system will implement a V-SLAM front-end based on the "Atlas" multi-map paradigm, as seen in SOTA systems like ORB-SLAM3.14 This is the industry-standard solution for robust, long-term navigation in environments where tracking loss is possible.13 + +The mechanism is as follows: + +1. 
The system initializes and begins tracking on **Map_Fragment_0**, using the known start GPS as a metadata tag. +2. It tracks all new frames (Image_N_LR) against this active map. +3. **If tracking is lost** (e.g., a sharp turn (AC-4) or a persistent 350m outlier (AC-3)): + * The "Atlas" architecture does not fail. It declares Map_Fragment_0 "inactive," stores it, and *immediately initializes* **Map_Fragment_1** from the current frame.14 + * Tracking *resumes instantly* on this new map fragment, ensuring the system "correctly continues the work" (AC-4). + +This architecture converts the "sharp turn" failure case into a *standard operating procedure*. The system never "fails"; it simply fragments. The burden of stitching these fragments together is correctly moved from the V-SLAM front-end (which has no global context) to the TOH (Section 6.0), which *can* solve it using global-metric anchors. + +### **4.2 Feature Matching Sub-System: SuperPoint + LightGlue** + +The V-SLAM front-end's success depends entirely on high-quality feature matches, especially in the sparse, low-texture agricultural terrain seen in the user's images. The selected approach is **SuperPoint + LightGlue**. + +* **SuperPoint:** A SOTA feature detector proven to find robust, repeatable keypoints in challenging, low-texture conditions.43 +* **LightGlue:** A highly optimized GNN-based matcher that is the successor to SuperGlue.44 + +The choice of LightGlue over SuperGlue is a deliberate performance optimization. LightGlue is *adaptive*.46 The user query states sharp turns (AC-4) are "rather an exception." This implies \~95% of image pairs are "easy" (high-overlap, straight flight) and 5% are "hard" (low-overlap, turns). LightGlue's adaptive-depth GNN exits early on "easy" pairs, returning a high-confidence match in a fraction of the time. 
This saves *enormous* computational budget on the 95% of normal frames, ensuring the system *always* meets the <5s budget (AC-7) and reserving that compute for the GAB and TOH. This component will run on **Image_N_LR** (low-res) to guarantee performance, and will be accelerated via TensorRT (Section 7.0). + +### **4.3 Keyframe Management and Local 3D Cloud** + +The front-end will maintain a co-visibility graph of keyframes for its *active map fragment*. It will perform local Bundle Adjustment 38 continuously over a sliding window of recent keyframes to minimize drift *within* that fragment. + +Crucially, it will triangulate features to create a **local, high-density 3D point cloud** for its map fragment.28 This point cloud is essential for two reasons: + +1. It provides robust tracking (tracking against a 3D map, not just a 2D frame). +2. It serves as the **high-accuracy source** for the object localization output (Section 9.1), as established in 1.4, allowing the system to bypass the high-error external DEM. + +#### **Table 1: Analysis of State-of-the-Art Feature Matchers (For V-SLAM Front-End)** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **SuperPoint + SuperGlue** | - SOTA robustness in low-texture, high-blur conditions. - GNN reasons about 3D scene context. - Proven in real-time SLAM systems. | - Computationally heavy (fixed-depth GNN). - Slower than LightGlue. | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT. | **Good.** A solid, baseline choice. Meets robustness needs but will heavily tax the <5s time budget (AC-7). | +| **SuperPoint + LightGlue** 44 | - **Adaptive Depth:** Faster on "easy" pairs, more accurate on "hard" pairs.46 - **Faster & Lighter:** Outperforms SuperGlue on speed and accuracy. - SOTA "in practice" choice for large-scale matching. | - Newer, but rapidly being adopted and proven.48 | - NVIDIA GPU (RTX 2060+). - PyTorch or TensorRT. 
| **Excellent (Selected).** The adaptive nature is *perfect* for this problem. It saves compute on the 95% of easy (straight) frames, maximizing our ability to meet AC-7. | + +## **5.0 Core Component: The Local Geospatial Anchoring Back-End (GAB)** + +This asynchronous component is the system's "anchor to reality." Its sole purpose is to find a high-confidence, *absolute-metric* pose for a given V-SLAM keyframe by matching it against the **local, pre-cached geo-database** (from Section 3.0). This component is a full replacement for the high-risk, high-latency GAB from the GEORTEX-R draft (see 1.2, 1.5). + +### **5.1 Rationale: Local-First Query vs. On-Demand API** + +As established in 1.2, all queries are made to the local SSD. This guarantees zero-latency I/O, which is a hard requirement for a real-time system, as external network latency is unacceptably high and variable.9 The GAB itself runs asynchronously and can take longer than 5s (e.g., 10-15s), but it must not be *blocked* by network I/O, which would stall the entire processing pipeline. + +### **5.2 SOTA Visual-Only Coarse-to-Fine Localization** + +This component implements a state-of-the-art, two-stage *visual-only* pipeline, which is lower-risk and more performant (see 1.5) than the GEORTEX-R's semantic-hybrid model. This approach is well-supported by SOTA research in aerial localization.34 + +1. **Stage 1 (Coarse): Global Descriptor Retrieval.** + * *Action:* When the TOH requests an anchor for Keyframe_k, the GAB first computes a *global descriptor* (a compact vector representation) for the *nadir-warped* (see 5.3) low-resolution Image_k_LR. + * *Technology:* A SOTA Visual Place Recognition (VPR) model like **SALAD** 49, **TransVLAD** 50, or **NetVLAD** 33 will be used. 
These are designed for this "image retrieval" task.45 + * *Result:* This descriptor is used to perform a fast FAISS/vector search against the descriptors of the *local satellite tiles* (which were pre-computed and stored in the Geo-Database). This returns the Top-K (e.g., K=5) most likely satellite tiles in milliseconds. +2. **Stage 2 (Fine): Local Feature Matching.** + * *Action:* The system runs **SuperPoint+LightGlue** 43 to find pixel-level correspondences. + * *Performance:* This is *not* run on the *full* UAV image against the *full* satellite map. It is run *only* between high-resolution patches (from **Image_k_HR**) and the **Top-K satellite tiles** identified in Stage 1. + * *Result:* This produces a set of 2D-2D (image-to-map) feature matches. A PnP/RANSAC solver then computes a high-confidence 6-DoF pose. This pose is the **Absolute_Metric_Anchor** that is sent to the TOH. + +### **5.3 Solving the Viewpoint Gap: Dynamic Feature Warping** + +The GAB must solve the "viewpoint gap" 33: the UAV image is oblique (due to roll/pitch), while the satellite tiles are nadir (top-down). + +The GEORTEX-R draft proposed a complex, high-risk deep learning solution. The ATLAS-GEOFUSE solution is far more elegant and requires zero R\&D: + +1. The V-SLAM Front-End (Section 4.0) already *knows* the camera's *relative* 6-DoF pose, including its **roll and pitch** orientation relative to the *local map's ground plane*. +2. The *Local Geo-Database* (Section 3.0) contains a 30m-resolution DEM for the AOI. +3. When the GAB processes Keyframe_k, it *first* performs a **dynamic homography warp**. It projects the V-SLAM ground plane onto the coarse DEM, and then uses the known camera roll/pitch to calculate the perspective transform (homography) needed to *un-distort* the oblique UAV image into a synthetic *nadir-view*. + +This *nadir-warped* UAV image is then used in the Coarse-to-Fine pipeline (5.2). It will now match the *nadir* satellite tiles with extremely high-fidelity. 
This method *eliminates* the viewpoint gap *without* training any new neural networks, leveraging the inherent synergy between the V-SLAM component and the GAB's pre-cached DEM. + +## **6.0 Core Component: The Multi-Map Trajectory Optimization Hub (TOH)** + +This component is the system's central "brain." It runs continuously, fusing all measurements (high-frequency/unscaled V-SLAM, low-frequency/metric-scale GAB anchors) from *all map fragments* into a single, globally consistent trajectory. + +### **6.1 Incremental Sim(3) Pose-Graph Optimization** + +The central challenge of monocular, IMU-denied SLAM is scale-drift. The V-SLAM front-end produces *unscaled* 6-DoF ($SE(3)$) relative poses.37 The GAB produces *metric-scale* 6-DoF ($SE(3)$) *absolute* poses. These cannot be directly combined. + +The solution is that the graph *must* be optimized in **Sim(3) (7-DoF)**.39 This adds a *single global scale factor $s$* as an optimizable variable to each V-SLAM map fragment. The TOH will maintain a pose-graph using **Ceres Solver** 19, a SOTA optimization library. + +The graph is constructed as follows: + +1. **Nodes:** Each keyframe pose (7-DoF: translation $X, Y, Z$; unit rotation quaternion $Qx, Qy, Qz, Qw$; scale $s$). +2. **Edge 1 (V-SLAM):** A relative pose constraint between Keyframe_i and Keyframe_j *within the same map fragment*. The error is computed in Sim(3).29 +3. **Edge 2 (GAB):** An *absolute* pose constraint on Keyframe_k. This constraint *fixes* Keyframe_k's pose to the *metric* GPS coordinate from the GAB anchor and *fixes its scale $s$ to 1.0*. + +The GAB's $s=1.0$ anchor creates "tension" in the graph. The Ceres optimizer 20 resolves this tension by finding the *one* global scale $s$ for all *other* V-SLAM nodes in that fragment that minimizes the total error.
This effectively "stretches" or "shrinks" the entire unscaled V-SLAM fragment to fit the metric anchors, which is the core of monocular SLAM scale-drift correction.29 + +### **6.2 Geodetic Map-Merging via Absolute Anchors** + +This is the robust solution to the "sharp turn" (AC-4) problem, replacing the flawed "relocalization" model from the original draft. + +* **Scenario:** The UAV makes a sharp turn (AC-4). The V-SLAM front-end *loses tracking* on Map_Fragment_0 and *creates* Map_Fragment_1 (per Section 4.1). The TOH's pose graph now contains *two disconnected components*. +* **Mechanism (Geodetic Merging):** + 1. The GAB (Section 5.0) is *queued* to find anchors for keyframes in *both* fragments. + 2. The GAB returns Anchor_A for Keyframe_10 (in Map_Fragment_0) with GPS [Lat_A, Lon_A]. + 3. The GAB returns Anchor_B for Keyframe_50 (in Map_Fragment_1) with GPS [Lat_B, Lon_B]. + 4. The TOH adds *both* of these as absolute, metric constraints (Edge 2) to the global pose-graph. +* The graph optimizer 20 now has all the information it needs. It will solve for the 7-DoF pose of *both fragments*, placing them in their correct, globally-consistent metric positions. The two fragments are *merged geodetically* (i.e., by their global coordinates) even if they *never* visually overlap. This is a vastly more robust and modern solution than simple visual loop closure.19 + +### **6.3 Automatic Outlier Rejection (AC-3, AC-5)** + +The system must be robust to 350m outliers (AC-3) and <10% bad GAB matches (AC-5). A standard least-squares optimizer (like Ceres 20) would be catastrophically corrupted by a 350m error. + +This is a solved problem in modern graph optimization.19 The solution is to wrap *all* constraints (V-SLAM and GAB) in a **Robust Loss Function (e.g., HuberLoss, CauchyLoss)** within Ceres Solver. + +A robust loss function mathematically *down-weights* the influence of constraints with large errors (high residuals).
When the TOH "sees" the 350m error from a V-SLAM relative pose (AC-3) or a bad GAB anchor (AC-5), the robust loss function effectively acknowledges the measurement but *refuses* to pull the entire 3000-image trajectory to fit this one "insane" data point. It automatically and gracefully *ignores* the outlier, optimizing the 99.9% of "sane" measurements, thus meeting AC-3 and AC-5. + +### **Table 2: Analysis of Trajectory Optimization Strategies** + +| Approach (Tools/Library) | Advantages | Limitations | Requirements | Fitness for Problem Component | +| :---- | :---- | :---- | :---- | :---- | +| **Incremental SLAM (Pose-Graph Optimization)** (Ceres Solver 19, g2o, GTSAM) | - **Real-time / Online:** Provides immediate pose estimates (AC-7). - **Supports Refinement:** Explicitly designed to refine past poses when new "loop closure" (GAB) data arrives (AC-8). - **Robust:** Can handle outliers via robust kernels.19 | - Initial estimate is *unscaled* until a GAB anchor arrives. - Can drift *if* not anchored. | - A graph optimization library (Ceres). - A robust cost function (Huber). | **Excellent (Selected).** This is the *only* architecture that satisfies all user requirements for real-time streaming (AC-7) and asynchronous refinement (AC-8). | +| **Batch Structure from Motion (Global Bundle Adjustment)** (COLMAP, Agisoft Metashape) | - **Globally Optimal Accuracy:** Produces the most accurate possible 3D reconstruction. | - **Offline:** Cannot run in real-time or stream results. - High computational cost (minutes to hours). - Fails AC-7 and AC-8 completely. | - All images must be available before processing starts. - High RAM and CPU. | **Good (as an *Optional* Post-Processing Step).** Unsuitable as the primary online system, but could be offered as an optional, high-accuracy "Finalize Trajectory" batch process. | + +## **7.0 High-Performance Compute & Deployment** + +The system must run on an RTX 2060 (AC-7) while processing 6.2K images. 
These are opposing constraints that require a deliberate compute strategy to balance speed and accuracy. + +### **7.1 Multi-Scale, Coarse-to-Fine Processing Pipeline** + +The system must balance the conflicting demands of real-time speed (AC-7) and high accuracy (AC-2). This is achieved by running different components at different resolutions. + +* **V-SLAM Front-End (Real-time, <5s):** This component (Section 4.0) runs *only* on the **Image_N_LR** (e.g., 1536x1024) copy. This is fast enough to meet the AC-7 budget.46 +* **GAB (Asynchronous, High-Accuracy):** This component (Section 5.0) uses the full-resolution **Image_N_HR** *selectively* to meet the 20m accuracy (AC-2). + 1. Stage 1 (Coarse) runs on the low-res, nadir-warped image. + 2. Stage 2 (Fine) runs SuperPoint on the *full 6.2K* image to find the *most confident* keypoints. It then extracts small, 256x256 *patches* from the *full-resolution* image, centered on these keypoints. + 3. It matches *these small, full-resolution patches* against the high-res satellite tile. + +This hybrid, multi-scale method provides the fine-grained matching accuracy of the 6.2K image (needed for AC-2) without the catastrophic CUDA Out-of-Memory errors (an RTX 2060 has only 6GB VRAM 30) or performance penalties that full-resolution processing would entail. + +### **7.2 Mandatory Deployment: NVIDIA TensorRT Acceleration** + +The deep learning models (SuperPoint, LightGlue, NetVLAD) will be too slow in their native PyTorch framework to meet AC-7 on an RTX 2060. + +This is not an "optional" optimization; it is a *mandatory* deployment step. The key neural networks *must* be converted from PyTorch into a highly-optimized **NVIDIA TensorRT engine**. 
+ +Research *specifically* on accelerating LightGlue with TensorRT shows **"2x-4x speed gains over compiled PyTorch"**.48 Other benchmarks confirm TensorRT provides 30-70% speedups for deep learning inference.52 This conversion (which applies layer fusion, graph optimization, and FP16/INT8 precision) is what makes achieving the <5s (AC-7) performance *possible* on the specified RTX 2060 hardware. + +## **8.0 System Robustness: Failure Mode Escalation Logic** + +This logic defines the system's behavior during real-world failures, ensuring it meets criteria AC-3, AC-4, AC-6, and AC-9, and is built upon the new "Atlas" multi-map architecture. + +### **8.1 Stage 1: Normal Operation (Tracking)** + +* **Condition:** V-SLAM front-end (Section 4.0) is healthy. +* **Logic:** + 1. V-SLAM successfully tracks Image_N_LR against its *active map fragment*. + 2. A new **Relative_Unscaled_Pose** is sent to the TOH (Section 6.0). + 3. TOH sends **Pose_N_Est** (unscaled) to the user (AC-7, AC-8 met). + 4. If Image_N is selected as a keyframe, the GAB (Section 5.0) is *queued* to find an anchor for it, which will trigger a **Pose_N_Refined** update later. + +### **8.2 Stage 2: Transient VO Failure (Outlier Rejection)** + +* **Condition:** Image_N is unusable (e.g., severe blur, sun-glare, or the 350m outlier from AC-3). +* **Logic (Frame Skipping):** + 1. V-SLAM front-end fails to track Image_N_LR against the active map. + 2. The system *discards* Image_N (marking it as a rejected outlier, AC-5). + 3. When Image_N+1 arrives, the V-SLAM front-end attempts to track it against the *same* local keyframe map (from Image_N-1). + 4. **If successful:** Tracking resumes. Image_N is officially an outlier. The system "correctly continues the work" (AC-3 met). + 5. **If fails:** The system repeats for Image_N+2, N+3. If this fails for \~5 consecutive frames, it escalates to Stage 3. 
+ +### **8.3 Stage 3: Persistent VO Failure (New Map Initialization)** + +* **Condition:** Tracking is lost for multiple frames. This is the **"sharp turn" (AC-4)** or "low overlap" (AC-4) scenario. +* **Logic (Atlas Multi-Map):** + 1. The V-SLAM front-end (Section 4.0) declares "Tracking Lost." + 2. It marks the current Map_Fragment_k as "inactive".13 + 3. It *immediately* initializes a **new** Map_Fragment_k+1 using the current frame (Image_N+5). + 4. **Tracking resumes instantly** on this new, unscaled, un-anchored map fragment. + 5. This "registering" of a new map ensures the system "correctly continues the work" (AC-4 met) and maintains the >95% registration rate (AC-9) by not counting this as a failure. + +### **8.4 Stage 4: Map-Merging & Global Relocalization (GAB-Assisted)** + +* **Condition:** The system is now tracking on Map_Fragment_k+1, while Map_Fragment_k is inactive. The TOH pose-graph (Section 6.0) is disconnected. +* **Logic (Geodetic Merging):** + 1. The TOH queues the GAB (Section 5.0) to find anchors for *both* map fragments. + 2. The GAB finds anchors for keyframes in *both* fragments. + 3. The TOH (Section 6.2) receives these metric anchors, adds them to the graph, and the Ceres optimizer 20 *finds the global 7-DoF pose for both fragments*, merging them into a single, metrically-consistent trajectory. + +### **8.5 Stage 5: Catastrophic Failure (User Intervention)** + +* **Condition:** The system is in Stage 3 (Lost), *and* the GAB (Section 5.0) has *also* failed to find *any* global anchors for a new Map_Fragment_k+1 for a prolonged period (e.g., 20% of the route). This is the "absolutely incapable" scenario (AC-6), (e.g., flying over a large, featureless body of water or dense, uniform fog). +* **Logic:** + 1. The system has an *unscaled, un-anchored* map fragment (Map_Fragment_k+1) and *zero* idea where it is in the world. + 2. The TOH triggers the AC-6 flag. +* **Resolution (User-Aided Prior):** + 1. 
The UI prompts the user: "Tracking lost. Please provide a coarse location for the *current* image." + 2. The user clicks *one point* on a map. + 3. This [Lat, Lon] is *not* taken as ground truth. It is fed to the **GAB (Section 5.0)** as a *strong spatial prior* for its *local database query* (Section 5.2). + 4. This narrows the GAB's Stage 1 search area from "the entire AOI" to "a 5km radius around the user's click." This *guarantees* the GAB will find the correct satellite tile, find a high-confidence **Absolute_Metric_Anchor**, and allow the TOH (Stage 4) to re-scale 29 and geodetically-merge 20 this lost fragment, re-localizing the entire trajectory. + +## **9.0 High-Accuracy Output Generation and Validation Strategy** + +This section details how the final user-facing outputs are generated, specifically replacing the flawed "Ray-DEM" method (see 1.4) with a high-accuracy "Ray-Cloud" method to meet the 20m accuracy (AC-2). + +### **9.1 High-Accuracy Object Geolocalization via Ray-Cloud Intersection** + +As established in 1.4, using an external 30m DEM 21 for object localization introduces uncontrollable errors (up to 4m and more 22) that make meeting the 20m (AC-2) accuracy goal impossible. The system *must* use its *own*, internally-generated 3D map, which is locally far more accurate.25 + +* **Inputs:** + 1. User clicks pixel coordinate $(u,v)$ on Image_N. + 2. The system retrieves the **final, refined, metric 7-DoF Sim(3) pose** $P_{sim(3)} = (s, R, T)$ for the *map fragment* that Image_N belongs to. This transform $P_{sim(3)}$ maps the *local V-SLAM coordinate system* to the *global metric coordinate system*. + 3. The system retrieves the *local, unscaled* **V-SLAM 3D point cloud** ($P_{local\_cloud}$) generated by the Front-End (Section 4.3). + 4. The known camera intrinsic matrix $K$. +* **Algorithm (Ray-Cloud Intersection):** + 1. 
**Un-project Pixel:** The 2D pixel $(u,v)$ is un-projected into a 3D ray *direction* vector $d_{cam}$ in the camera's local coordinate system: $d_{cam} = K^{-1} \cdot [u, v, 1]^T$. + 2. **Transform Ray (Local):** This ray is transformed using the *local V-SLAM pose* of Image_N to get a ray in the *local map fragment's* coordinate system. + 3. **Intersect (Local):** The system performs a numerical *ray-mesh intersection* (or nearest-neighbor search) to find the 3D point $P_{local}$ where this local ray *intersects the local V-SLAM point cloud* ($P_{local\_cloud}$).25 This $P_{local}$ is *highly accurate* relative to the V-SLAM map.26 + 4. **Transform (Global):** This local 3D point $P_{local}$ is now transformed to the global, metric coordinate system using the 7-DoF Sim(3) transform from the TOH: $P_{metric} = s \cdot (R \cdot P_{local}) + T$. + 5. **Result:** This 3D intersection point $P_{metric}$ is the *metric* world coordinate of the object. + 6. **Convert:** This $(X, Y, Z)$ world coordinate is converted to a [Latitude, Longitude, Altitude] GPS coordinate.55 + +This method correctly isolates the error. The object's accuracy is now *only* dependent on the V-SLAM's geometric fidelity (AC-10 MRE < 1.0px) and the GAB's global anchoring (AC-1, AC-2). It *completely eliminates* the external 30m DEM error 22 from this critical, high-accuracy calculation. + +### **9.2 Rigorous Validation Methodology** + +A comprehensive test plan is required to validate compliance with all 10 Acceptance Criteria. The foundation is a **Ground-Truth Test Harness** (e.g., using the provided coordinates.csv data). + +* **Test Harness:** + 1. **Ground-Truth Data:** coordinates.csv provides ground-truth [Lat, Lon] for a set of images. + 2. **Test Datasets:** + * Test_Baseline: The ground-truth images and coordinates. + * Test_Outlier_350m (AC-3): Test_Baseline with a single, unrelated image inserted. 
+ * Test_Sharp_Turn_5pct (AC-4): A sequence where several frames are manually deleted to simulate <5% overlap. + * Test_Long_Route (AC-9): A 1500-image sequence. +* **Test Cases:** + * **Test_Accuracy (AC-1, AC-2, AC-5, AC-9):** + * **Run:** Execute ATLAS-GEOFUSE on Test_Baseline, providing the first image's coordinate as the Start Coordinate. + * **Script:** A validation script will compute the Haversine distance error between the *system's refined GPS output* ($Pose_N^{Refined}$) for each image and the *ground-truth GPS*. + * **ASSERT** (count(errors < 50m) / total_images) >= 0.80 **(AC-1 Met)** + * **ASSERT** (count(errors < 20m) / total_images) >= 0.60 **(AC-2 Met)** + * **ASSERT** (count(un-localized_images) / total_images) < 0.10 **(AC-5 Met)** + * **ASSERT** (count(localized_images) / total_images) > 0.95 **(AC-9 Met)** + * **Test_MRE (AC-10):** + * **Run:** After Test_Baseline completes. + * **ASSERT** TOH.final_Mean_Reprojection_Error < 1.0 **(AC-10 Met)** + * **Test_Performance (AC-7, AC-8):** + * **Run:** Execute on Test_Long_Route on the minimum-spec RTX 2060. + * **Log:** Log timestamps for "Image In" -> "Initial Pose Out" ($Pose_N^{Est}$). + * **ASSERT** average_time < 5.0s **(AC-7 Met)** + * **Log:** Log the output stream. + * **ASSERT** >80% of images receive *two* poses: an "Initial" and a "Refined" **(AC-8 Met)** + * **Test_Robustness (AC-3, AC-4, AC-6):** + * **Run:** Execute Test_Outlier_350m. + * **ASSERT** System logs "Stage 2: Discarding Outlier" or "Stage 3: New Map" *and* the final trajectory error for the *next* frame is < 50m **(AC-3 Met)**. + * **Run:** Execute Test_Sharp_Turn_5pct. + * **ASSERT** System logs "Stage 3: New Map Initialization" and "Stage 4: Geodetic Map-Merge," and the final trajectory is complete and accurate **(AC-4 Met)**. + * **Run:** Execute on a sequence with no GAB anchors possible for 20% of the route. + * **ASSERT** System logs "Stage 5: User Intervention Requested" **(AC-6 Met)**. 
+ 
diff --git a/docs/_metodology/01_research_phase.md b/docs/_metodology/01_research_phase.md index 99e4af0..e39346b 100644 --- a/docs/_metodology/01_research_phase.md +++ b/docs/_metodology/01_research_phase.md @@ -2,29 +2,30 @@ ## 1.1 **👨‍💻Developers**: Problem statement - Discuss the problem and create in the `docs/00_problem` next files: - - `problem_description.md`: Our problem to solve with the end result we want to achieve. For example: + Discuss the problem and create in the `docs/00_problem` next files and folders: + - `problem_description.md`: Our problem to solve with the end result we want to achieve. + - `input_data`: Put to this folder all the necessary input data and expected results for the further tests. Analyze very thoroughly input data and form system's restrictions and acceptance criteria + - `restrictions.md`: Restrictions we have in real world in the -dashed list format. + - `acceptance_criteria.md`: Acceptance criteria for the solution in the -dashed list format. + The most important part, determines how good the system should be. + +### Example: + - `problem_description.md` We have wing type UAV (airplane). It should fly autonomously to predetermined GPS destination. During the flight it is relying on the signal form GPS module. But when adversary jam or spoof GPS, then UAV either don't know where to fly, or fly to the wrong direction. So, we need to achieve that UAV can fly correctly to the destination without GPS or when GPS is spoofed. We can use the camera pointing downward and other sensor data like altitude, available form the flight controller. Airplane is running Ardupliot. - - - `docs/00_problem/input_data`: Put to this folder all the necessary input data and expected results for the further tests. 
For example: + - `input_data` - orthophoto images from the UAV for the analysis - list of expected GPS for the centers for each picture in csv format: picture name, lat, lon - video from the UAV for the analysis - list of expected GPS for the centers of video per timeframe in csv format: timestamp, lat, lon for each 1-2 seconds - ... - - Analyze very thoroughly input data and form system's restrictions and acceptance ctiteria - - - `restrictions.md`: Restrictions we have in real world in the -dashed list format. For example: + - `restrictions.md` - We're limiting our solution to airplane type UAVs. - Additional weight it could take is under 1 kg. - The whole system should cost under $2000. - The flying range is restricted by eastern and southern part of Ukraine. And so on. - - - `acceptance_criteria.md`: Acceptance criteria for the solution in the -dashed list format. - The most important part, determines how good the system should be. For example: + - `acceptance_criteria.md` - UAV should fly without GPS for at least 30 km in the sunshine weather. - UAV shoulf fly with maximum mistake no more than 40 meters from the real GPS - UAV should fly correctly with little foggy weather with maximum mistake no more than 100 meters from the real GPS @@ -32,11 +33,12 @@ ## 1.2 **✨AI Research**: Restrictions and Acceptance Criteria assesment - In the new context form the next prompt: - - Add *.md to the context - - Add sample files to the prompt - - Put proper sample filenames in the text of the prompt -and run it in DeepResearch tool (Gemini, DeepSeek, or other) + Put to the research context: + - `problem_description.md` + - `restrictions.md` + - `acceptance_criteria.md` + - Samples of the input data + Run it in DeepResearch tool (Gemini, DeepSeek, or other) ``` We have the problem described in `problem_description.md`. - System should process data samples in the attached files (if any). They are for reference only. 
@@ -50,87 +52,76 @@ We have the problem described in `problem_description.md`. - Our values - Your researched criterion values - Status: Is the criterion added by your research to our system, modified, or removed - Assess the restrictions we've put on the system. Are they realistic? Propose corrections in the next table: + Assess the restrictions we've put on the system. Are they realistic? Should we add more strict restrictions, or vice versa, add more requirements in restrictions to use our system. Propose corrections in the next table: - Restriction name - Our values - Your researched restriction values - Status: Is a restriction added by your research to our system, modified, or removed ``` - - **👨‍💻Developers**: Revise the result, discuss them and overwrite `docs/00_problem/acceptance_criteria.md` + **👨‍💻Developers**: Revise the result, discuss them and overwrite `acceptance_criteria.md` and `restrictions.md` ## 1.3 **✨AI Research**: Research the problem in great detail - In the new context form the next prompt: - - Add *.md to the context - - Add sample files to the prompt - - Put proper sample filenames in the text of the prompt - and run it in DeepResearch tool (Gemini, DeepSeek, or other) + Replace md files with actual data + Put to the research context samples of the input data + Run it in DeepResearch tool (Gemini, DeepSeek, or other) ``` - Research this problem in `problem_description.md`. - - The system should process data samples in attached files (if any). They are for reference only. - - We have the next restrictions for the input data in `restrictions.md`. - - Output of our system should meet these acceptance criteria in `acceptance_criteria.md`. + Research this problem: + + `problem_description.md` + + The system should process data samples in the attached files (if any). They are for reference only. 
+ The system has next restrictions and conditions: + + `restrictions.md` + + - Output of our system should meet these acceptance criteria: + + `acceptance_criteria.md` + - Find out all the state-of-the-art solutions for this problem and produce the resulting solution draft in the next format: - Short Product solution description. Brief component interaction diagram. - - Architecture approach that meets restrictions and acceptance criteria. Tables to compare possible component decisions. - - Testing strategy. Research the best approaches to cover all the acceptance criteria, functional, and non-functional tests - Be concise in formulating, The less words the better, but do not miss any important details. -``` + - Architecture approach that meets restrictions and acceptance criteria. For each component, analyze the best possible approaches to solve, and form a table comprising all approaches. Each new approach would be a row, and has the next columns: + - Tools (library, platform) to solve component tasks + - Advantages of this approach + - Limitations of this approach + - Requirements for this approach + - How does it fit for the problem component that has to be solved, and the whole solution + - Testing strategy. Research the best approaches to cover all the acceptance criteria. Form a list of integration functional tests and non-functional tests. +Be concise in formulating. The fewer words, the better, but do not miss any important details. +``` **👨‍💻Developer**: Revise the result from AI. Research the problem as well, and add/modify/remove some solution details in the draft. 
- Store it to the `docs/01_solution/solution_draft_01.md` + Store it to the `docs/01_solution/solution_draft.md` ## 1.4 **✨AI Research**: Solution draft assessment - Add *.md to the context and run it in DeepResearch tool (Gemini, DeepSeek, or other) +Replace md files with actual data +Run it in DeepResearch tool (Gemini, DeepSeek, or other) ``` - We have a problem here in `problem_description.md` with restrictions `restrictions.md`. We've presented the solution here in `solution_draft_xx.md`. - Identify all potential weak points and problems. Address them and find out ways to solve them. Based on your findings, form a new solution draft in the same format without mentioning or comparison with first solution draft. Also, in case of replacing some components with better one, do not just throw away the previous, form a table with comparison for the component. -``` +Read carefully about the problem: - **👨‍💻Developer**: Research by yourself as well - how to solve additional problems which AI figured out, and add them to the result. Store the result draft to the `docs/01_solution/solution_draft_xx+1.md`, and repeat the process. When the next solution wouldn't differ much from the previous one, store the last draft as `docs/01_solution/solution.md` - - -## 1.5 **🤖📋AI plan**: Solution Decomposition - ``` - Decompose the solution `@docs/01_solution/solution.md` to the components. - Store description of each component to the file `docs/02_components/[##]_[component_name]/spec.md` with the next structure: - - Component Name - - Detailed description - - API methods, for each method: - - Name - - Input - - Output - - Description - - Test cases for the method - - Integration tests for the component if needed. + `problem_description.md` - Generate draw.io components diagram shows relations between components. - Do not put any code yet, only names, input and output. 
+ System has next restrictions and conditions: - Also read `@docs/00_initial/acceptance_criteria.md` and compose tests according to test strategy to cover all the criteria and store them to the files - `docs/03_tests/[##]_[test_name]_spec.md` with the next structure: - - Summary - - Detailed description - - Preconditions for tests - - Steps: - - Step1 - Expected result1 - - Step2 - Expected result2 - ... - - StepN - Expected resultN - - Do not put any code yet. Ask as many questions as needed. -``` - **👨‍💻Developer**: Answer the questions AI asked, put as many details as possible + `restrictions.md` + Output of the system should address next acceptance criteria: + + `acceptance_criteria.md` + + Here is a solution draft: + + `solution_draft.md` - ## 1.6 **🤖📋AI plan**: Business Requirement generation - ``` - From the initial requirements above generate Jira epics: - - Ask the Jira project key and latest created jira task number. For example AZ-412 - - Access the solution `@docs/01_solution/solution.md` and description of each component in the files `docs/02_components/[##]_[component_name]/spec.md` - - Generate Jira Epics from the Components - - Ensure each epic has clear goal and acceptance criteria, verify the criteria with `@docs/00_initial/acceptance_criteria.md` - - Generate draw.io components diagram based on previous diagram shows relations between components and Jira Epic numbers corresponding to each component. -``` \ No newline at end of file +Identify all potential weak points and problems. Address them and find out ways to solve them. Based on your findings, form a new solution draft in the same format. + +If your finding requires a complete reorganization of the flow and different components, state it. +Put all the findings regarding what was weak and poor at the beginning of the report. Put here all new findings, what was updated, replaced, or removed from the previous solution. + +Then form a new solution design without referencing the previous system. 
Remove Poor and Very Poor component choices from the component analysis tables, but leave Good and Excellent ones. +In the updated report, do not put "new" marks, do not compare to the previous solution draft, just make a new solution as if from scratch + +``` + **👨‍💻Developer**: Research by yourself as well - how to solve additional problems which AI figured out, and add them to the result. Rename previous `solution_draft.md` to `xx_solution_draft.md`. And then store the result draft to the `docs/01_solution/solution_draft.md`, and repeat the process. When the next solution wouldn't differ much from the previous one, store the last draft as `docs/01_solution/solution.md` diff --git a/docs/_metodology/02_planning_phase.md b/docs/_metodology/02_planning_phase.md new file mode 100644 index 0000000..1e6e376 --- /dev/null +++ b/docs/_metodology/02_planning_phase.md @@ -0,0 +1,5 @@ +# 2. Planning phase + +## 2.1 **🤖📋AI plan** /gen_components +## 2.2 **🤖📋AI plan** /gen_tests +## 2.3 **🤖📋AI plan** /gen_epics \ No newline at end of file diff --git a/docs/_metodology/02_development_phase.md b/docs/_metodology/03_development_phase.md similarity index 91% rename from docs/_metodology/02_development_phase.md rename to docs/_metodology/03_development_phase.md index a22bbc6..56b6164 100644 --- a/docs/_metodology/02_development_phase.md +++ b/docs/_metodology/03_development_phase.md @@ -1,6 +1,6 @@ -# 2. Development phase +# 3. Development phase -## 2.1 **🤖📋AI plan**: Component Decomposition +## 3.1 **🤖📋AI plan**: Component Decomposition For each component in `docs/02_components` do next: ``` Decompose `@docs/02_components/[##]_[component_name]/spec.md` to the features. If component is simple enough, make only 1 feature, if complex - separate per features. Feature can contain 0 or more APIs. 
Create `docs/02_components/[##]_[component_name]/[##]_[feature_name]_feature.md` with the next structure: @@ -17,7 +17,7 @@ ``` **👨‍💻Developer**: Answer the questions AI asked, put as many details as possible -## 2.2 **🤖AI agent**: Feature implementation +## 3.2 **🤖AI agent**: Feature implementation For each component in `docs/02_components/[##]_[component_name]/` folder do next: ``` Read component description `@docs/02_components/[##]_[component_name]/spec.md`. @@ -29,7 +29,7 @@ If integration tests are specified in component spec, then write them and run, and make sure that component working correctly ``` -## 2.3 **🤖AI agent**: Solution composition and integration tests +## 3.3 **🤖AI agent**: Solution composition and integration tests ``` Read all the files here `docs/03_tests/` and for each file write down tests and run it. Compose a final test results in a csv with the next format: diff --git a/docs/_metodology/03_refactoring_phase.md b/docs/_metodology/04_refactoring_phase.md similarity index 100% rename from docs/_metodology/03_refactoring_phase.md rename to docs/_metodology/04_refactoring_phase.md