Skip to content

Commit b5a6a1b

Browse files
authored
feat: support included_patterns and excluded_patterns for GoogleDrive source (#1263)
feat: support included_patterns and excluded_patterns for GoogleDrive source
1 parent 07ce291 commit b5a6a1b

File tree

3 files changed

+30
-0
lines changed

3 files changed

+30
-0
lines changed

docs/docs/sources/googledrive.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ The spec takes the following fields:
3030
* `root_folder_ids` (`list[str]`): a list of Google Drive folder IDs to import files from.
3131
* `binary` (`bool`, optional): whether to read files as binary (instead of text).
3232
* `recent_changes_poll_interval` (`datetime.timedelta`, optional): when set, this source provides a change capture mechanism by periodically polling Google Drive for recently modified files.
33+
* `included_patterns` (`list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`. If not specified, all files will be included.
34+
* `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["tmp", "**/node_modules"]`. Any file or directory matching these patterns will be excluded even if it matches `included_patterns`. If not specified, no files will be excluded.
3335

3436
:::info
3537

python/cocoindex/sources/_engine_builtin_specs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@ class GoogleDrive(op.SourceSpec):
3535
service_account_credential_path: str
3636
root_folder_ids: list[str]
3737
binary: bool = False
38+
39+
# If provided, only files matching these patterns will be included.
40+
# See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns.
41+
included_patterns: list[str] | None = None
42+
43+
# If provided, files matching these patterns will be excluded.
44+
# See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns.
45+
excluded_patterns: list[str] | None = None
46+
3847
recent_changes_poll_interval: datetime.timedelta | None = None
3948

4049

src/ops/sources/google_drive.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use super::shared::pattern_matcher::PatternMatcher;
12
use chrono::Duration;
23
use google_drive3::{
34
DriveHub,
@@ -59,13 +60,16 @@ pub struct Spec {
5960
binary: bool,
6061
root_folder_ids: Vec<String>,
6162
recent_changes_poll_interval: Option<std::time::Duration>,
63+
included_patterns: Option<Vec<String>>,
64+
excluded_patterns: Option<Vec<String>>,
6265
}
6366

6467
struct Executor {
6568
drive_hub: DriveHub<HttpsConnector<HttpConnector>>,
6669
binary: bool,
6770
root_folder_ids: IndexSet<Arc<str>>,
6871
recent_updates_poll_interval: Option<std::time::Duration>,
72+
pattern_matcher: PatternMatcher,
6973
}
7074

7175
impl Executor {
@@ -92,6 +96,7 @@ impl Executor {
9296
binary: spec.binary,
9397
root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(),
9498
recent_updates_poll_interval: spec.recent_changes_poll_interval,
99+
pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?,
95100
})
96101
}
97102
}
@@ -311,6 +316,9 @@ impl SourceExecutor for Executor {
311316
.list_files(&folder_id, &fields, &mut next_page_token)
312317
.await?;
313318
for file in files {
319+
if !file.name.as_deref().is_some_and(|name| self.pattern_matcher.is_file_included(name)){
320+
continue
321+
}
314322
curr_rows.extend(self.visit_file(file, &mut new_folder_ids, &mut seen_ids)?);
315323
}
316324
if !curr_rows.is_empty() {
@@ -356,6 +364,17 @@ impl SourceExecutor for Executor {
356364
});
357365
}
358366
};
367+
if !file
368+
.name
369+
.as_deref()
370+
.is_some_and(|name| self.pattern_matcher.is_file_included(name))
371+
{
372+
return Ok(PartialSourceRowData {
373+
value: Some(SourceValue::NonExistence),
374+
ordinal: Some(Ordinal::unavailable()),
375+
content_version_fp: None,
376+
});
377+
}
359378
let ordinal = if options.include_ordinal {
360379
file.modified_time.map(|t| t.try_into()).transpose()?
361380
} else {

0 commit comments

Comments
 (0)