Re-rank file search (vibe-kanban) (#514)

* Cleanup script changes for task attempt f7987e81-2e4a-47e7-9d54-cc27930b694b

* Commit changes from coding agent for task attempt f7987e81-2e4a-47e7-9d54-cc27930b694b

* Cleanup script changes for task attempt f7987e81-2e4a-47e7-9d54-cc27930b694b

* fmt

* Perfect! ** Refactoring Complete**

## Summary

Successfully refactored the FileRanker to remove all direct git2 dependencies and consolidated all git operations in GitService:

### **Key Changes Made:**

1. **Enhanced GitService** (`crates/services/src/services/git.rs`):
   -  Added `HeadInfo` struct with `branch: String` and `oid: String`
   -  Added `get_head_info()` method that returns both branch name and commit SHA
   -  Updated `get_current_branch()` to be a thin wrapper maintaining backward compatibility

2. **Refactored FileRanker** (`crates/services/src/services/file_ranker.rs`):
   - �� **Removed all `git2` imports** - now purely focused on ranking logic
   -  Changed cache to store `head_sha: String` instead of `head_oid: git2::Oid`
   -  Updated cache validation to use `GitService.get_head_info()`
   -  Updated cache storage to use `GitService.get_head_info()`

### **Benefits Achieved:**

- **🎯 Clean Separation**: FileRanker = pure ranking logic, GitService = all git I/O
- **🧪 Better Testability**: FileRanker no longer depends on git2, easier to mock/test
- **🔄 Future-Proof**: All git backend changes centralized in GitService  
- ** Zero Performance Impact**: Same caching behavior and efficiency
- **🔄 Backward Compatibility**: Existing `get_current_branch()` callers unaffected

### **Architecture Now:**
```
FileRanker (ranking logic only)
    ↓ delegates to
GitService (single git facade)
    ↓ encapsulates  
git2 (implementation detail)
```

The refactoring successfully achieved proper separation of concerns while maintaining all existing functionality and performance characteristics.

* Cleanup script changes for task attempt f7987e81-2e4a-47e7-9d54-cc27930b694b
This commit is contained in:
Louis Knight-Webb
2025-08-19 11:23:14 +01:00
committed by GitHub
parent 88a67d8d9c
commit e5aa72e907
5 changed files with 298 additions and 19 deletions

View File

@@ -13,7 +13,7 @@ use db::models::project::{
};
use deployment::Deployment;
use ignore::WalkBuilder;
use services::services::git::GitBranch;
use services::services::{file_ranker::FileRanker, git::GitBranch};
use utils::response::ApiResponse;
use uuid::Uuid;
@@ -355,18 +355,32 @@ async fn search_files_in_repo(
}
}
// Sort results by priority: FileName > DirectoryName > FullPath
results.sort_by(|a, b| {
let priority = |match_type: &SearchMatchType| match match_type {
SearchMatchType::FileName => 0,
SearchMatchType::DirectoryName => 1,
SearchMatchType::FullPath => 2,
};
// Apply git history-based ranking
let file_ranker = FileRanker::new();
match file_ranker.get_stats(repo_path).await {
Ok(stats) => {
// Re-rank results using git history
file_ranker.rerank(&mut results, &stats);
}
Err(e) => {
tracing::warn!(
"Failed to get git stats for ranking, using basic sort: {}",
e
);
// Fallback to basic priority sorting
results.sort_by(|a, b| {
let priority = |match_type: &SearchMatchType| match match_type {
SearchMatchType::FileName => 0,
SearchMatchType::DirectoryName => 1,
SearchMatchType::FullPath => 2,
};
priority(&a.match_type)
.cmp(&priority(&b.match_type))
.then_with(|| a.path.cmp(&b.path))
});
priority(&a.match_type)
.cmp(&priority(&b.match_type))
.then_with(|| a.path.cmp(&b.path))
});
}
}
// Limit to top 10 results
results.truncate(10);

View File

@@ -52,3 +52,5 @@ strum = "0.27.2"
notify = "8.2.0"
notify-debouncer-full = "0.5.0"
dunce = "1.0"
dashmap = "6.1"
once_cell = "1.20"

View File

@@ -0,0 +1,157 @@
use std::{
collections::HashMap,
path::{Path, PathBuf},
sync::Arc,
time::Instant,
};
use chrono::{DateTime, Utc};
use dashmap::DashMap;
use db::models::project::{SearchMatchType, SearchResult};
use once_cell::sync::Lazy;
use tokio::task;
use super::git::{GitService, GitServiceError};
/// Statistics for a single file based on git history
#[derive(Clone, Debug)]
pub struct FileStat {
/// Index in the commit history (0 = HEAD, 1 = parent of HEAD, ...)
pub last_index: usize,
/// Number of times this file was changed in recent commits
pub commit_count: u32,
/// Timestamp of the most recent change
pub last_time: DateTime<Utc>,
}
/// File statistics for a repository
pub type FileStats = HashMap<String, FileStat>;
/// Cache entry for repository history
#[derive(Clone)]
struct RepoHistoryCache {
head_sha: String,
stats: Arc<FileStats>,
generated_at: Instant,
}
/// Global cache for file ranking statistics
static FILE_STATS_CACHE: Lazy<DashMap<PathBuf, RepoHistoryCache>> = Lazy::new(DashMap::new);
/// Configuration constants for ranking algorithm
const DEFAULT_COMMIT_LIMIT: usize = 100;
const BASE_MATCH_SCORE_FILENAME: i64 = 100;
const BASE_MATCH_SCORE_DIRNAME: i64 = 10;
const BASE_MATCH_SCORE_FULLPATH: i64 = 1;
const RECENCY_WEIGHT: i64 = 2;
const FREQUENCY_WEIGHT: i64 = 1;
/// Service for ranking files based on git history
pub struct FileRanker {
git_service: GitService,
}
impl Default for FileRanker {
fn default() -> Self {
Self::new()
}
}
impl FileRanker {
pub fn new() -> Self {
Self {
git_service: GitService::new(),
}
}
/// Get file statistics for a repository, using cache when possible
pub async fn get_stats(&self, repo_path: &Path) -> Result<Arc<FileStats>, GitServiceError> {
let repo_path = repo_path.to_path_buf();
// Check if we have a valid cache entry
if let Some(cache_entry) = FILE_STATS_CACHE.get(&repo_path) {
// Verify cache is still valid by checking HEAD
if let Ok(head_info) = self.git_service.get_head_info(&repo_path)
&& head_info.oid == cache_entry.head_sha
{
return Ok(Arc::clone(&cache_entry.stats));
}
}
// Cache miss or invalid - compute new stats
let stats = self.compute_stats(&repo_path).await?;
Ok(stats)
}
/// Re-rank search results based on git history statistics
pub fn rerank(&self, results: &mut [SearchResult], stats: &FileStats) {
results.sort_by(|a, b| {
let score_a = self.calculate_score(a, stats);
let score_b = self.calculate_score(b, stats);
score_b.cmp(&score_a) // Higher scores first
});
}
/// Calculate relevance score for a search result
fn calculate_score(&self, result: &SearchResult, stats: &FileStats) -> i64 {
let base_score = match result.match_type {
SearchMatchType::FileName => BASE_MATCH_SCORE_FILENAME,
SearchMatchType::DirectoryName => BASE_MATCH_SCORE_DIRNAME,
SearchMatchType::FullPath => BASE_MATCH_SCORE_FULLPATH,
};
if let Some(stat) = stats.get(&result.path) {
let recency_bonus = (100 - stat.last_index.min(99) as i64) * RECENCY_WEIGHT;
let frequency_bonus = stat.commit_count as i64 * FREQUENCY_WEIGHT;
// Multiply base score to maintain hierarchy, add git-based bonuses
base_score * 1000 + recency_bonus * 10 + frequency_bonus
} else {
// Files not in git history get base score only
base_score * 1000
}
}
/// Compute file statistics from git history
async fn compute_stats(&self, repo_path: &Path) -> Result<Arc<FileStats>, GitServiceError> {
let repo_path = repo_path.to_path_buf();
let repo_path_for_error = repo_path.clone();
let git_service = self.git_service.clone();
// Run git analysis in blocking task to avoid blocking async runtime
let stats = task::spawn_blocking(move || {
git_service.collect_recent_file_stats(&repo_path, DEFAULT_COMMIT_LIMIT)
})
.await
.map_err(|e| GitServiceError::InvalidRepository(format!("Task join error: {e}")))?;
let stats = match stats {
Ok(s) => s,
Err(e) => {
tracing::warn!(
"Failed to collect file stats for {:?}: {}",
repo_path_for_error,
e
);
// Return empty stats on error - search will still work without ranking
HashMap::new()
}
};
let stats_arc = Arc::new(stats);
// Update cache
if let Ok(head_info) = self.git_service.get_head_info(&repo_path_for_error) {
FILE_STATS_CACHE.insert(
repo_path_for_error,
RepoHistoryCache {
head_sha: head_info.oid,
stats: Arc::clone(&stats_arc),
generated_at: Instant::now(),
},
);
}
Ok(stats_arc)
}
}

View File

@@ -1,9 +1,9 @@
use std::path::Path;
use std::{collections::HashMap, path::Path};
use chrono::{DateTime, Utc};
use git2::{
BranchType, CherrypickOptions, Delta, DiffFindOptions, DiffOptions, Error as GitError,
FetchOptions, Repository, Status, StatusOptions, build::CheckoutBuilder,
FetchOptions, Repository, Sort, Status, StatusOptions, build::CheckoutBuilder,
};
use regex;
use serde::{Deserialize, Serialize};
@@ -11,6 +11,9 @@ use thiserror::Error;
use ts_rs::TS;
use utils::diff::{Diff, FileDiffDetails};
// Import for file ranking functionality
use super::file_ranker::FileStat;
#[derive(Debug, Error)]
pub enum GitServiceError {
#[error(transparent)]
@@ -46,6 +49,12 @@ pub struct GitBranch {
pub last_commit_date: DateTime<Utc>,
}
#[derive(Debug, Clone)]
pub struct HeadInfo {
pub branch: String,
pub oid: String,
}
#[derive(Debug, Clone, Serialize, Deserialize, TS)]
pub struct BranchStatus {
pub commits_behind: Option<usize>,
@@ -646,13 +655,35 @@ impl GitService {
Ok(())
}
pub fn get_current_branch(&self, repo_path: &Path) -> Result<String, git2::Error> {
let repo = Repository::open(repo_path)?;
/// Get current HEAD information including branch name and commit OID
pub fn get_head_info(&self, repo_path: &Path) -> Result<HeadInfo, GitServiceError> {
let repo = self.open_repo(repo_path)?;
let head = repo.head()?;
if let Some(branch_name) = head.shorthand() {
Ok(branch_name.to_string())
let branch = if let Some(branch_name) = head.shorthand() {
branch_name.to_string()
} else {
Ok("HEAD".to_string())
"HEAD".to_string()
};
let oid = if let Some(target_oid) = head.target() {
target_oid.to_string()
} else {
// Handle case where HEAD exists but has no target (empty repo)
return Err(GitServiceError::InvalidRepository(
"Repository HEAD has no target commit".to_string(),
));
};
Ok(HeadInfo { branch, oid })
}
pub fn get_current_branch(&self, repo_path: &Path) -> Result<String, git2::Error> {
// Thin wrapper for backward compatibility
match self.get_head_info(repo_path) {
Ok(head_info) => Ok(head_info.branch),
Err(GitServiceError::Git(git_err)) => Err(git_err),
Err(_) => Err(git2::Error::from_str("Failed to get head info")),
}
}
@@ -1256,6 +1287,80 @@ impl GitService {
Ok(repo)
}
/// Collect file statistics from recent commits for ranking purposes
pub fn collect_recent_file_stats(
&self,
repo_path: &Path,
commit_limit: usize,
) -> Result<HashMap<String, FileStat>, GitServiceError> {
let repo = self.open_repo(repo_path)?;
let mut stats: HashMap<String, FileStat> = HashMap::new();
// Set up revision walk from HEAD
let mut revwalk = repo.revwalk()?;
revwalk.push_head()?;
revwalk.set_sorting(Sort::TIME)?;
// Iterate through recent commits
for (commit_index, oid_result) in revwalk.take(commit_limit).enumerate() {
let oid = oid_result?;
let commit = repo.find_commit(oid)?;
// Get commit timestamp
let commit_time = {
let time = commit.time();
DateTime::from_timestamp(time.seconds(), 0).unwrap_or_else(Utc::now)
};
// Get the commit tree
let commit_tree = commit.tree()?;
// For the first commit (no parent), diff against empty tree
let parent_tree = if commit.parent_count() == 0 {
None
} else {
Some(commit.parent(0)?.tree()?)
};
// Create diff between parent and current commit
let diff = repo.diff_tree_to_tree(parent_tree.as_ref(), Some(&commit_tree), None)?;
// Process each changed file in this commit
diff.foreach(
&mut |delta, _progress| {
// Get the file path - prefer new file path, fall back to old
if let Some(path) = delta.new_file().path().or_else(|| delta.old_file().path())
{
let path_str = path.to_string_lossy().to_string();
// Update or insert file stats
let stat = stats.entry(path_str).or_insert(FileStat {
last_index: commit_index,
commit_count: 0,
last_time: commit_time,
});
// Increment commit count
stat.commit_count += 1;
// Keep the most recent change (smallest index)
if commit_index < stat.last_index {
stat.last_index = commit_index;
stat.last_time = commit_time;
}
}
true // Continue iteration
},
None, // No binary callback
None, // No hunk callback
None, // No line callback
)?;
}
Ok(stats)
}
}
// #[cfg(test)]

View File

@@ -3,6 +3,7 @@ pub mod auth;
pub mod config;
pub mod container;
pub mod events;
pub mod file_ranker;
pub mod filesystem;
pub mod filesystem_watcher;
pub mod git;