Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions backend/danswer/connectors/confluence/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,10 +347,14 @@ def __init__(
# skip it. This is generally used to avoid indexing extra sensitive
# pages.
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
# If a page's title, or the title of any of its ancestors, matches one of
# these, the page is skipped. This allows skipping entire folders/sections.
pages_to_skip: list[str] = [],
) -> None:
self.batch_size = batch_size
self.continue_on_failure = continue_on_failure
self.labels_to_skip = set(labels_to_skip)
self.pages_to_skip = set(pages_to_skip)
self.recursive_indexer: RecursiveIndexer | None = None
self.index_origin = index_origin
(
Expand Down Expand Up @@ -404,7 +408,7 @@ def _fetch_space(start_ind: int, batch_size: int) -> list[dict[str, Any]]:
if CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
else None
),
expand="body.storage.value,version",
expand="body.storage.value,version,ancestors",
)
except Exception:
logger.warning(
Expand All @@ -427,7 +431,7 @@ def _fetch_space(start_ind: int, batch_size: int) -> list[dict[str, Any]]:
if CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
else None
),
expand="body.storage.value,version",
expand="body.storage.value,version,ancestors",
)
)
except HTTPError as e:
Expand All @@ -441,7 +445,7 @@ def _fetch_space(start_ind: int, batch_size: int) -> list[dict[str, Any]]:
self.space,
start=start_ind + i,
limit=1,
expand="body.view.value,version",
expand="body.view.value,version,ancestors",
)
)

Expand Down Expand Up @@ -596,6 +600,20 @@ def _get_doc_batch(

if time_filter is None or time_filter(last_modified):
page_id = page["id"]
page_title = page["title"]

# Check if page or any of its ancestors should be skipped
if self.pages_to_skip:
ancestors = page.get("ancestors", [])
ancestor_titles = {ancestor.get("title", "") for ancestor in ancestors}
if page_title in self.pages_to_skip or ancestor_titles.intersection(
self.pages_to_skip
):
logger.info(
f"Page '{page_title}' (ID: {page_id}) or one of its ancestors "
f"is in pages_to_skip list. Skipping."
)
continue

if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING:
page_labels = self._fetch_labels(self.confluence_client, page_id)
Expand Down
27 changes: 27 additions & 0 deletions web/src/app/admin/connectors/confluence/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { ConfluenceIcon, TrashIcon } from "@/components/icons/icons";
import {
BooleanFormField,
TextFormField,
TextArrayFieldBuilder,
} from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { CredentialForm } from "@/components/admin/connectors/CredentialForm";
Expand Down Expand Up @@ -272,6 +273,18 @@ const Main = () => {
</a>
),
},
{
header: "Pages to Skip",
key: "pages_to_skip",
getValue: (ccPairStatus) => {
const connectorConfig =
ccPairStatus.connector.connector_specific_config;
return connectorConfig.pages_to_skip &&
connectorConfig.pages_to_skip.length > 0
? connectorConfig.pages_to_skip.join(", ")
: "";
},
},
]}
onUpdate={() =>
mutate("/api/manage/admin/connector/indexing-status")
Expand Down Expand Up @@ -301,15 +314,29 @@ const Main = () => {
/>
</>
}
formBodyBuilder={(values) => (
<>
{TextArrayFieldBuilder({
name: "pages_to_skip",
label: "Pages/Folders to Skip:",
subtext:
"Enter page titles to exclude from indexing. All child pages under these will also be skipped.",
})(values)}
</>
)}
validationSchema={Yup.object().shape({
wiki_page_url: Yup.string().required(
"Please enter any link to a Confluence space or Page e.g. https://danswer.atlassian.net/wiki/spaces/Engineering/overview"
),
index_origin: Yup.boolean(),
pages_to_skip: Yup.array()
.of(Yup.string().required("Page names must be strings"))
.required(),
})}
initialValues={{
wiki_page_url: "",
index_origin: true,
pages_to_skip: [],
}}
refreshFreq={10 * 60} // 10 minutes
credentialId={confluenceCredential.id}
Expand Down
1 change: 1 addition & 0 deletions web/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ export interface BookstackConfig {}
// Connector-specific configuration stored for a Confluence connector.
export interface ConfluenceConfig {
// Link to the Confluence space or page that the connector should index.
wiki_page_url: string;
// Optional boolean forwarded to the backend connector.
// NOTE(review): exact semantics are not visible here; confirm against the
// backend Confluence connector before relying on it.
index_origin?: boolean;
// Optional list of page titles to exclude from indexing. A page is skipped
// when its own title, or the title of any of its ancestors, is in this list.
pages_to_skip?: string[];
}

export interface JiraConfig {
Expand Down
Loading