From bb12c479915a12f61b28d80b0b736e5d79e9744d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 28 Jan 2026 17:57:27 +0000 Subject: [PATCH] fix: cleanup temporary shard files on failed indexing When zoekt-git-index fails during repository indexing, it can leave behind .tmp shard files that accumulate over time and fill up disk space. This is especially problematic for large repos that repeatedly fail to index. Changes: - Add cleanupTempShards() function to zoekt.ts that removes temporary shard files (files with .tmp in their name) for a specific repository - Call cleanupTempShards() in repoIndexManager.ts when indexGitRepository fails, before re-throwing the error This ensures that even if a repository consistently fails to index, the temporary files created during each attempt are cleaned up. Co-authored-by: michael --- packages/backend/src/repoIndexManager.ts | 16 +++++++++--- packages/backend/src/zoekt.ts | 32 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/packages/backend/src/repoIndexManager.ts b/packages/backend/src/repoIndexManager.ts index 8e499863..69226935 100644 --- a/packages/backend/src/repoIndexManager.ts +++ b/packages/backend/src/repoIndexManager.ts @@ -13,7 +13,7 @@ import { captureEvent } from './posthog.js'; import { PromClient } from './promClient.js'; import { RepoWithConnections, Settings } from "./types.js"; import { getAuthCredentialsForRepo, getShardPrefix, groupmqLifecycleExceptionWrapper, measure, setIntervalAsync } from './utils.js'; -import { indexGitRepository } from './zoekt.js'; +import { cleanupTempShards, indexGitRepository } from './zoekt.js'; const LOG_TAG = 'repo-index-manager'; const logger = createLogger(LOG_TAG); @@ -438,9 +438,17 @@ export class RepoIndexManager { } logger.info(`Indexing ${repo.name} (id: ${repo.id})...`); - const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal)); - const indexDuration_s = durationMs / 1000; - 
logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`);
+        try {
+            const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal));
+            const indexDuration_s = durationMs / 1000;
+            logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`);
+        } catch (error) {
+            // Clean up any temporary shard files left behind by the failed indexing operation.
+            // Zoekt creates .tmp files during indexing which can accumulate if indexing fails repeatedly.
+            logger.warn(`Indexing failed for ${repo.name} (id: ${repo.id}), cleaning up temp shard files...`);
+            await cleanupTempShards(repo);
+            throw error;
+        }
 
         return revisions;
     }
diff --git a/packages/backend/src/zoekt.ts b/packages/backend/src/zoekt.ts
index 27f17d71..ff97d1ce 100644
--- a/packages/backend/src/zoekt.ts
+++ b/packages/backend/src/zoekt.ts
@@ -1,6 +1,7 @@
 import { Repo } from "@sourcebot/db";
 import { createLogger, env, getRepoPath } from "@sourcebot/shared";
 import { exec } from "child_process";
+import { readdir, rm } from "fs/promises";
 import { INDEX_CACHE_DIR } from "./constants.js";
 import { Settings } from "./types.js";
 import { getShardPrefix } from "./utils.js";
@@ -54,3 +55,34 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
         })
     });
 }
+
+/**
+ * Best-effort removal of temporary shard files left behind by a failed indexing
+ * operation: entries of INDEX_CACHE_DIR that start with the repo's shard prefix
+ * and contain `.tmp`. Never throws; failures are only logged.
+ *
+ * @param repo - The repository whose temp shards should be cleaned up
+ */
+export const cleanupTempShards = async (repo: Repo) => {
+    const shardPrefix = getShardPrefix(repo.orgId, repo.id);
+    // NOTE(review): presumably the prefix ends with the numeric repo id, so a digit right
+    // after it marks another repo's file (id 1 vs. id 10) — confirm against getShardPrefix.
+    const isTempShard = (file: string) => file.startsWith(shardPrefix)
+        && !/^\d/.test(file.slice(shardPrefix.length)) && file.includes('.tmp');
+
+    try {
+        const tempFiles = (await readdir(INDEX_CACHE_DIR)).filter(isTempShard);
+        // Attempt every file: one failing `rm` must not abort the remaining deletions.
+        const results = await Promise.allSettled(tempFiles.map(file => {
+            const filePath = `${INDEX_CACHE_DIR}/${file}`;
+            logger.info(`Cleaning up temp shard file: ${filePath}`);
+            return rm(filePath, { force: true });
+        }));
+        const failures = results.filter((r): r is PromiseRejectedResult => r.status === 'rejected');
+        if (failures.length > 0) logger.warn(`Failed to remove ${failures.length} temp shard file(s) for repo ${repo.id}:`, failures[0]?.reason);
+        if (tempFiles.length > failures.length) logger.info(`Cleaned up ${tempFiles.length - failures.length} temp shard file(s) for repo ${repo.id}`);
+    } catch (error) {
+        // Log but don't throw - cleanup is best effort
+        logger.warn(`Failed to cleanup temp shards for repo ${repo.id}:`, error);
+    }
+}