# Patch summary: adds a `pages_to_skip` option to the Confluence connector and
# exposes it in the admin connector page.
#
# backend/danswer/connectors/confluence/connector.py
#   - `__init__` gains a `pages_to_skip: list[str] = []` parameter, stored as
#     the set `self.pages_to_skip`.
#   - The three `expand=` arguments in the hunks under `_fetch_space` now also
#     request `ancestors` (presumably so the page dicts consumed by
#     `_get_doc_batch` carry their parent chain - confirm against the caller).
#   - `_get_doc_batch` skips (with an info log, then `continue`) any page whose
#     own title, or the title of any entry in `page["ancestors"]`, is in
#     `self.pages_to_skip`. The check runs only when that set is non-empty,
#     inside the time-filter branch and before the label lookup.
# web/src/app/admin/connectors/confluence/page.tsx
#   - Adds a "Pages to Skip" column (comma-joined list, or an empty string) and
#     a `TextArrayFieldBuilder` form field named `pages_to_skip`; the Yup
#     schema requires an array of required strings, with initial value `[]`.
# web/src/lib/types.ts
#   - `ConfluenceConfig` gains the optional field `pages_to_skip?: string[]`.
#
# NOTE(review): matching is exact string equality on titles (set membership /
# set intersection); this patch applies no trimming or case folding.
# NOTE(review): `page.get("ancestors", [])` falls back to an empty list, so a
# page dict coming from a fetch path that does not request `ancestors` would
# only be matched on its own title - confirm every fetch path expands it.
# NOTE(review): the `[]` default is a shared mutable default; the code shown
# only copies it via `set(...)` and never mutates it.
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py index b5f95f7034f..a6e5e7ca6c3 100644 --- a/backend/danswer/connectors/confluence/connector.py +++ b/backend/danswer/connectors/confluence/connector.py @@ -347,10 +347,14 @@ def __init__( # skip it. This is generally used to avoid indexing extra sensitive # pages. labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP, + # if a page title matches one of these, or any of its ancestors match, + # the page will be skipped. This allows skipping entire folders/sections. + pages_to_skip: list[str] = [], ) -> None: self.batch_size = batch_size self.continue_on_failure = continue_on_failure self.labels_to_skip = set(labels_to_skip) + self.pages_to_skip = set(pages_to_skip) self.recursive_indexer: RecursiveIndexer | None = None self.index_origin = index_origin ( @@ -404,7 +408,7 @@ def _fetch_space(start_ind: int, batch_size: int) -> list[dict[str, Any]]: if CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES else None ), - expand="body.storage.value,version", + expand="body.storage.value,version,ancestors", ) except Exception: logger.warning( @@ -427,7 +431,7 @@ def _fetch_space(start_ind: int, batch_size: int) -> list[dict[str, Any]]: if CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES else None ), - expand="body.storage.value,version", + expand="body.storage.value,version,ancestors", ) ) except HTTPError as e: @@ -441,7 +445,7 @@ def _fetch_space(start_ind: int, batch_size: int) -> list[dict[str, Any]]: self.space, start=start_ind + i, limit=1, - expand="body.view.value,version", + expand="body.view.value,version,ancestors", ) ) @@ -596,6 +600,20 @@ def _get_doc_batch( if time_filter is None or time_filter(last_modified): page_id = page["id"] + page_title = page["title"] + + # Check if page or any of its ancestors should be skipped + if self.pages_to_skip: + ancestors = page.get("ancestors", []) + ancestor_titles = {ancestor.get("title", "") for ancestor in 
ancestors} + if page_title in self.pages_to_skip or ancestor_titles.intersection( + self.pages_to_skip + ): + logger.info( + f"Page '{page_title}' (ID: {page_id}) or one of its ancestors " + f"is in pages_to_skip list. Skipping." + ) + continue if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING: page_labels = self._fetch_labels(self.confluence_client, page_id) diff --git a/web/src/app/admin/connectors/confluence/page.tsx b/web/src/app/admin/connectors/confluence/page.tsx index 7b513333f43..fdf6252d06a 100644 --- a/web/src/app/admin/connectors/confluence/page.tsx +++ b/web/src/app/admin/connectors/confluence/page.tsx @@ -5,6 +5,7 @@ import { ConfluenceIcon, TrashIcon } from "@/components/icons/icons"; import { BooleanFormField, TextFormField, + TextArrayFieldBuilder, } from "@/components/admin/connectors/Field"; import { HealthCheckBanner } from "@/components/health/healthcheck"; import { CredentialForm } from "@/components/admin/connectors/CredentialForm"; @@ -272,6 +273,18 @@ const Main = () => { ), }, + { + header: "Pages to Skip", + key: "pages_to_skip", + getValue: (ccPairStatus) => { + const connectorConfig = + ccPairStatus.connector.connector_specific_config; + return connectorConfig.pages_to_skip && + connectorConfig.pages_to_skip.length > 0 + ? connectorConfig.pages_to_skip.join(", ") + : ""; + }, + }, ]} onUpdate={() => mutate("/api/manage/admin/connector/indexing-status") @@ -301,15 +314,29 @@ const Main = () => { /> } + formBodyBuilder={(values) => ( + <> + {TextArrayFieldBuilder({ + name: "pages_to_skip", + label: "Pages/Folders to Skip:", + subtext: + "Enter page titles to exclude from indexing. All child pages under these will also be skipped.", + })(values)} + + )} validationSchema={Yup.object().shape({ wiki_page_url: Yup.string().required( "Please enter any link to a Confluence space or Page e.g. 
https://danswer.atlassian.net/wiki/spaces/Engineering/overview" ), index_origin: Yup.boolean(), + pages_to_skip: Yup.array() + .of(Yup.string().required("Page names must be strings")) + .required(), })} initialValues={{ wiki_page_url: "", index_origin: true, + pages_to_skip: [], }} refreshFreq={10 * 60} // 10 minutes credentialId={confluenceCredential.id} diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index d1dae473573..43b18fcb03e 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -134,6 +134,7 @@ export interface BookstackConfig {} export interface ConfluenceConfig { wiki_page_url: string; index_origin?: boolean; + pages_to_skip?: string[]; } export interface JiraConfig {