diff --git a/packages/backend/src/repoIndexManager.ts b/packages/backend/src/repoIndexManager.ts
index 8e499863..69226935 100644
--- a/packages/backend/src/repoIndexManager.ts
+++ b/packages/backend/src/repoIndexManager.ts
@@ -13,7 +13,7 @@ import { captureEvent } from './posthog.js';
 import { PromClient } from './promClient.js';
 import { RepoWithConnections, Settings } from "./types.js";
 import { getAuthCredentialsForRepo, getShardPrefix, groupmqLifecycleExceptionWrapper, measure, setIntervalAsync } from './utils.js';
-import { indexGitRepository } from './zoekt.js';
+import { cleanupTempShards, indexGitRepository } from './zoekt.js';
 
 const LOG_TAG = 'repo-index-manager';
 const logger = createLogger(LOG_TAG);
@@ -438,9 +438,17 @@ export class RepoIndexManager {
         }
 
         logger.info(`Indexing ${repo.name} (id: ${repo.id})...`);
-        const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal));
-        const indexDuration_s = durationMs / 1000;
-        logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`);
+        try {
+            const { durationMs } = await measure(() => indexGitRepository(repo, this.settings, revisions, signal));
+            const indexDuration_s = durationMs / 1000;
+            logger.info(`Indexed ${repo.name} (id: ${repo.id}) in ${indexDuration_s}s`);
+        } catch (error) {
+            // Clean up any temporary shard files left behind by the failed indexing operation.
+            // Zoekt creates .tmp files during indexing which can accumulate if indexing fails repeatedly.
+            logger.warn(`Indexing failed for ${repo.name} (id: ${repo.id}), cleaning up temp shard files...`);
+            await cleanupTempShards(repo);
+            throw error;
+        }
 
         return revisions;
     }
diff --git a/packages/backend/src/zoekt.ts b/packages/backend/src/zoekt.ts
index 27f17d71..ff97d1ce 100644
--- a/packages/backend/src/zoekt.ts
+++ b/packages/backend/src/zoekt.ts
@@ -1,6 +1,7 @@
 import { Repo } from "@sourcebot/db";
 import { createLogger, env, getRepoPath } from "@sourcebot/shared";
 import { exec } from "child_process";
+import { readdir, rm } from "fs/promises";
 import { INDEX_CACHE_DIR } from "./constants.js";
 import { Settings } from "./types.js";
 import { getShardPrefix } from "./utils.js";
@@ -54,3 +55,45 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
         })
     });
 }
+
+/**
+ * Cleans up temporary shard files left behind by a failed indexing operation.
+ * Zoekt creates temporary files (with `.tmp` suffix) during indexing, which
+ * can be left behind if the indexing process fails or is interrupted.
+ *
+ * Cleanup is best effort: failures are logged and never thrown, and a file
+ * that cannot be removed does not prevent the remaining files from being removed.
+ *
+ * @param repo - The repository whose temp shards should be cleaned up
+ */
+export const cleanupTempShards = async (repo: Repo) => {
+    const shardPrefix = getShardPrefix(repo.orgId, repo.id);
+
+    try {
+        const files = await readdir(INDEX_CACHE_DIR);
+        // NOTE(review): a bare prefix match may also hit another repo whose prefix extends this one (e.g. "1_1" vs "1_10") - confirm getShardPrefix's format.
+        const tempFiles = files.filter(file =>
+            file.startsWith(shardPrefix) && file.includes('.tmp')
+        );
+
+        let removedCount = 0;
+        for (const file of tempFiles) {
+            const filePath = `${INDEX_CACHE_DIR}/${file}`;
+            logger.info(`Cleaning up temp shard file: ${filePath}`);
+            try {
+                await rm(filePath, { force: true });
+                removedCount++;
+            } catch (error) {
+                // Keep going so one undeletable file does not strand the rest.
+                logger.warn(`Failed to remove temp shard file ${filePath}:`, error);
+            }
+        }
+
+        if (removedCount > 0) {
+            logger.info(`Cleaned up ${removedCount} temp shard file(s) for repo ${repo.id}`);
+        }
+    } catch (error) {
+        // Log but don't throw - cleanup is best effort
+        logger.warn(`Failed to cleanup temp shards for repo ${repo.id}:`, error);
+    }
+}