diff --git a/.gitignore b/.gitignore index 0346fa9..8d6c851 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ vendor/ phpunit.xml phpcs.xml .phpcs.xml +.phpunit.result.cache +.phpunit.cache diff --git a/features/distignore.feature b/features/distignore.feature index e7929ca..d9feaa3 100644 --- a/features/distignore.feature +++ b/features/distignore.feature @@ -379,3 +379,50 @@ Feature: Generate a distribution archive of a project with .distignore """ Error: Broken symlink at /symlink. Target missing at """ + + Scenario: Efficiently ignores directories with many files + # Performance test: ensure ignored directories are not scanned + # @see https://github.com/wp-cli/dist-archive-command/issues/XXX + Given an empty directory + And a foo/.distignore file: + """ + node_modules + .git + """ + And a foo/plugin.php file: + """ + + + + tests + + + + + + src + + + diff --git a/src/Dist_Archive_Command.php b/src/Dist_Archive_Command.php index 01ca0bb..624ca7b 100644 --- a/src/Dist_Archive_Command.php +++ b/src/Dist_Archive_Command.php @@ -493,10 +493,11 @@ protected function is_path_contains_symlink( $source_dir_path ) { private function get_file_list( $source_dir_path, $excluded = false ) { $included_files = []; - $excluded_files = []; - $iterator = new RecursiveIteratorIterator( - new RecursiveDirectoryIterator( $source_dir_path, RecursiveDirectoryIterator::SKIP_DOTS ), + $directory_iterator = new RecursiveDirectoryIterator( $source_dir_path, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iterator = new Distignore_Filter_Iterator( $directory_iterator, $this->checker, $source_dir_path ); + $iterator = new RecursiveIteratorIterator( + $filter_iterator, RecursiveIteratorIterator::SELF_FIRST ); @@ -505,34 +506,43 @@ private function get_file_list( $source_dir_path, $excluded = false ) { */ foreach ( $iterator as $item ) { $relative_filepath = str_replace( $source_dir_path, '', $item->getPathname() ); - try { - if ( $this->checker->isPathIgnored( $relative_filepath ) ) { - $excluded_files[] = $relative_filepath; - } else { - $included_files[] = $relative_filepath; - } - } catch ( \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException $exception ) { + + // Check if this item had an error during filtering. + $error = $filter_iterator->getErrorForItem( $relative_filepath ); + if ( $error ) { if ( $item->isLink() && ! file_exists( (string) readlink( $item->getPathname() ) ) ) { WP_CLI::error( "Broken symlink at {$relative_filepath}. Target missing at {$item->getLinkTarget()}." ); } else { - WP_CLI::error( $exception->getMessage() ); + WP_CLI::error( $error->getMessage() ); } } - } - // Check all excluded directories and remove them from the excluded list if they contain included files. - foreach ( $excluded_files as $excluded_file_index => $excluded_relative_path ) { - if ( ! is_dir( $source_dir_path . $excluded_relative_path ) ) { - continue; + // Check if this item is ignored (directories may still be yielded even if ignored). + if ( ! $filter_iterator->isPathIgnoredCached( $relative_filepath ) ) { + $included_files[] = $relative_filepath; } - foreach ( $included_files as $included_relative_path ) { - if ( 0 === strpos( $included_relative_path, $excluded_relative_path ) ) { - unset( $excluded_files[ $excluded_file_index ] ); + } + + if ( $excluded ) { + // Get excluded files from the filter iterator. + $excluded_files = $filter_iterator->getExcludedFiles(); + + // Check all excluded directories and remove them from the excluded list if they contain included files. + foreach ( $excluded_files as $excluded_file_index => $excluded_relative_path ) { + if ( ! is_dir( $source_dir_path . $excluded_relative_path ) ) { + continue; + } + foreach ( $included_files as $included_relative_path ) { + if ( 0 === strpos( $included_relative_path, $excluded_relative_path ) ) { + unset( $excluded_files[ $excluded_file_index ] ); + } } } + + return $excluded_files; } - return $excluded ? $excluded_files : $included_files; + return $included_files; } /** diff --git a/src/Distignore_Filter_Iterator.php b/src/Distignore_Filter_Iterator.php new file mode 100644 index 0000000..513bbdd --- /dev/null +++ b/src/Distignore_Filter_Iterator.php @@ -0,0 +1,229 @@ + + */ + private $ignored_cache = []; + + /** + * List of excluded file paths (relative). + * + * @var string[] + */ + private $excluded_files = []; + + /** + * List of items that had errors during checking. + * + * @var array + */ + private $error_items = []; + + /** + * Constructor. + * + * @param RecursiveIterator $iterator Iterator to filter. + * @param GitIgnoreChecker $checker GitIgnore checker instance. + * @param string $source_dir_path Base directory path. + */ + public function __construct( RecursiveIterator $iterator, GitIgnoreChecker $checker, $source_dir_path ) { + parent::__construct( $iterator ); + $this->checker = $checker; + $this->source_dir_path = $source_dir_path; + } + + /** + * Check whether the current element of the iterator is acceptable. + * Filters out ignored files so they don't appear in the iteration. + * For directories, we're more conservative - we only filter them out + * if we're certain they and all their contents should be ignored. + * + * @return bool True if the element should be included, false otherwise. + */ + #[\ReturnTypeWillChange] + public function accept() { + /** @var SplFileInfo $item */ + $item = $this->current(); + + // Get relative path. + $pathname = $item->getPathname(); + $source_path_length = strlen( $this->source_dir_path ); + + if ( 0 === strpos( $pathname, $this->source_dir_path ) ) { + $relative_filepath = substr( $pathname, $source_path_length ); + } else { + $relative_filepath = $pathname; + } + + try { + $is_ignored = $this->isPathIgnoredCached( $relative_filepath ); + + if ( $is_ignored ) { + // Track this as excluded. + $this->excluded_files[] = $relative_filepath; + + // For files, we can safely filter them out. + if ( ! $item->isDir() ) { + return false; + } + + // For directories, only filter out if we're not going to descend + // (hasChildren will handle that check). + // We need to yield ignored directories so they can be tracked in exclude lists. + return true; + } + + return true; + } catch ( \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException $exception ) { + // Store the error and yield the item so get_file_list can handle it. + $this->error_items[ $relative_filepath ] = $exception; + return true; + } + } + + /** + * Check if a path is ignored, with caching to avoid duplicate checks. + * + * @param string $relative_filepath Relative file path to check. + * @return bool True if the path is ignored, false otherwise. + * @throws \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException + */ + public function isPathIgnoredCached( $relative_filepath ) { + if ( ! isset( $this->ignored_cache[ $relative_filepath ] ) ) { + $this->ignored_cache[ $relative_filepath ] = $this->checker->isPathIgnored( $relative_filepath ); + } + return $this->ignored_cache[ $relative_filepath ]; + } + + /** + * Check whether the current element has children that should be recursed into. + * We return false for certain ignored directories to prevent descending into them. + * + * This optimization only applies to directories that appear to be "leaf" ignore + * patterns (simple directory names without wildcards), to safely handle cases + * like `node_modules` while still correctly processing complex patterns with + * negations like `frontend/*` with `!/frontend/build/`. + * + * @return bool True if we should descend into this directory, false otherwise. + */ + #[\ReturnTypeWillChange] + public function hasChildren() { + /** @var SplFileInfo $item */ + $item = $this->current(); + + // If it's not a directory, it has no children. + if ( ! $item->isDir() ) { + return false; + } + + // For directories, check if they should be ignored. + $pathname = $item->getPathname(); + $source_path_length = strlen( $this->source_dir_path ); + + // Extract relative path by removing the source directory prefix. + if ( 0 === strpos( $pathname, $this->source_dir_path ) ) { + $relative_filepath = substr( $pathname, $source_path_length ); + } else { + // Fallback if path doesn't start with source path (shouldn't happen). + $relative_filepath = $pathname; + } + + try { + $is_ignored = $this->isPathIgnoredCached( $relative_filepath ); + + if ( ! $is_ignored ) { + // Not ignored, so descend. + return true; + } + + // Directory is ignored. Check if it's safe to skip descent. + // We only skip for single-level directories (no slashes except leading/trailing) + // to avoid issues with wildcard patterns and negations. + $path_parts = explode( '/', trim( $relative_filepath, '/' ) ); + if ( count( $path_parts ) === 1 ) { + // This is a top-level ignored directory like "/node_modules" or "/.git". + // It's likely safe to skip descent as these are typically simple patterns. + // However, we still need to be conservative. Let's check if a child would be ignored. + // We use 'test' as a probe filename to check if children would be ignored. + // The actual name doesn't matter; we just need to verify the pattern applies to children. + $test_child = $relative_filepath . '/test'; + try { + $child_ignored = $this->isPathIgnoredCached( $test_child ); + if ( $child_ignored ) { + // Child is also ignored, safe to skip descent. + return false; + } + } catch ( \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException $exception ) { + // On error, descend to be safe. + return true; + } + } + + // For nested directories or if test shows children might not be ignored, descend. + return true; + } catch ( \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException $exception ) { + // If there's an error checking, allow descending (error will be handled in get_file_list). + return true; + } + } + + /** + * Return the inner iterator's children wrapped in this filter. + * + * @return RecursiveFilterIterator + */ + #[\ReturnTypeWillChange] + public function getChildren() { + /** @var RecursiveDirectoryIterator $inner */ + $inner = $this->getInnerIterator(); + // Pass the same arrays by reference so they accumulate across all levels. + $child = new self( $inner->getChildren(), $this->checker, $this->source_dir_path ); + $child->excluded_files = &$this->excluded_files; + $child->ignored_cache = &$this->ignored_cache; + $child->error_items = &$this->error_items; + return $child; + } + + /** + * Get the list of excluded files that were filtered out. + * + * @return string[] + */ + public function getExcludedFiles() { + return $this->excluded_files; + } + + /** + * Check if an item had an error during processing. + * + * @param string $relative_filepath Relative file path to check. + * @return \Inmarelibero\GitIgnoreChecker\Exception\InvalidArgumentException|null + */ + public function getErrorForItem( $relative_filepath ) { + return $this->error_items[ $relative_filepath ] ?? null; + } +} diff --git a/tests/Distignore_Filter_Iterator_Test.php b/tests/Distignore_Filter_Iterator_Test.php new file mode 100644 index 0000000..f147a5c --- /dev/null +++ b/tests/Distignore_Filter_Iterator_Test.php @@ -0,0 +1,264 @@ +temp_dir = sys_get_temp_dir() . '/distignore-test-' . uniqid(); + mkdir( $this->temp_dir ); + } + + /** + * Clean up test environment. + */ + public function tearDown(): void { + if ( is_dir( $this->temp_dir ) ) { + $this->recursiveDelete( $this->temp_dir ); + } + parent::tearDown(); + } + + /** + * Recursively delete a directory. + * + * @param string $dir Directory to delete. + */ + private function recursiveDelete( $dir ) { + if ( ! is_dir( $dir ) ) { + return; + } + $files = array_diff( scandir( $dir ), array( '.', '..' ) ); + foreach ( $files as $file ) { + $path = $dir . '/' . $file; + is_dir( $path ) ? $this->recursiveDelete( $path ) : unlink( $path ); + } + rmdir( $dir ); + } + + /** + * Test that the iterator filters out ignored files. + */ + public function test_filters_ignored_files() { + // Create test structure. + file_put_contents( $this->temp_dir . '/included.txt', 'test' ); + file_put_contents( $this->temp_dir . '/ignored.log', 'test' ); + file_put_contents( $this->temp_dir . '/.distignore', "*.log\n" ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + $recursive_iter = new RecursiveIteratorIterator( $filter_iter, RecursiveIteratorIterator::SELF_FIRST ); + + $files = []; + foreach ( $recursive_iter as $item ) { + $files[] = basename( $item->getPathname() ); + } + + $this->assertContains( 'included.txt', $files ); + $this->assertContains( '.distignore', $files ); + $this->assertNotContains( 'ignored.log', $files, 'Ignored file should not be yielded' ); + } + + /** + * Test that ignored directories are tracked but files inside are not yielded. + */ + public function test_tracks_ignored_directories() { + // Create test structure. + mkdir( $this->temp_dir . '/node_modules' ); + file_put_contents( $this->temp_dir . '/node_modules/package.json', '{}' ); + file_put_contents( $this->temp_dir . '/index.php', 'temp_dir . '/.distignore', "node_modules\n" ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + $recursive_iter = new RecursiveIteratorIterator( $filter_iter, RecursiveIteratorIterator::SELF_FIRST ); + + $files = []; + foreach ( $recursive_iter as $item ) { + $relative_path = str_replace( $this->temp_dir, '', $item->getPathname() ); + $files[] = $relative_path; + } + + $this->assertContains( '/index.php', $files ); + $this->assertContains( '/.distignore', $files ); + $this->assertContains( '/node_modules', $files, 'Ignored directory should be yielded for tracking' ); + $this->assertNotContains( '/node_modules/package.json', $files, 'Files inside ignored directory should not be yielded' ); + } + + /** + * Test that getExcludedFiles returns the correct list. + */ + public function test_get_excluded_files() { + mkdir( $this->temp_dir . '/ignored_dir' ); + file_put_contents( $this->temp_dir . '/ignored_dir/file.txt', 'test' ); + file_put_contents( $this->temp_dir . '/included.txt', 'test' ); + file_put_contents( $this->temp_dir . '/.distignore', "ignored_dir\n" ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + $recursive_iter = new RecursiveIteratorIterator( $filter_iter, RecursiveIteratorIterator::SELF_FIRST ); + + // Iterate to populate excluded files. + iterator_to_array( $recursive_iter ); + + $excluded = $filter_iter->getExcludedFiles(); + + $this->assertContains( '/ignored_dir', $excluded ); + $this->assertNotContains( '/included.txt', $excluded ); + } + + /** + * Test caching behavior to avoid duplicate checks. + */ + public function test_caching_avoids_duplicate_checks() { + file_put_contents( $this->temp_dir . '/test.txt', 'test' ); + file_put_contents( $this->temp_dir . '/.distignore', "*.log\n" ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + + // First call should cache the result. + $result1 = $filter_iter->isPathIgnoredCached( '/test.txt' ); + // Second call should use cache. + $result2 = $filter_iter->isPathIgnoredCached( '/test.txt' ); + + $this->assertSame( $result1, $result2 ); + $this->assertFalse( $result1 ); // test.txt should not be ignored. + } + + /** + * Test that hasChildren prevents descent into ignored directories. + */ + public function test_has_children_prevents_descent() { + mkdir( $this->temp_dir . '/node_modules' ); + file_put_contents( $this->temp_dir . '/node_modules/file1.js', 'test' ); + file_put_contents( $this->temp_dir . '/node_modules/file2.js', 'test' ); + file_put_contents( $this->temp_dir . '/.distignore', "node_modules\n" ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + $recursive_iter = new RecursiveIteratorIterator( $filter_iter, RecursiveIteratorIterator::SELF_FIRST ); + + $files = []; + foreach ( $recursive_iter as $item ) { + $relative_path = str_replace( $this->temp_dir, '', $item->getPathname() ); + $files[] = $relative_path; + } + + // The node_modules directory should be yielded but its files should not. + $this->assertContains( '/node_modules', $files ); + $this->assertNotContains( '/node_modules/file1.js', $files ); + $this->assertNotContains( '/node_modules/file2.js', $files ); + } + + /** + * Test handling of negation patterns. + */ + public function test_negation_patterns() { + mkdir( $this->temp_dir . '/frontend' ); + mkdir( $this->temp_dir . '/frontend/build' ); + file_put_contents( $this->temp_dir . '/frontend/source.ts', 'test' ); + file_put_contents( $this->temp_dir . '/frontend/build/output.js', 'test' ); + file_put_contents( $this->temp_dir . '/.distignore', "frontend/*\n!/frontend/build/\n" ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + $recursive_iter = new RecursiveIteratorIterator( $filter_iter, RecursiveIteratorIterator::SELF_FIRST ); + + $files = []; + foreach ( $recursive_iter as $item ) { + $relative_path = str_replace( $this->temp_dir, '', $item->getPathname() ); + $files[] = $relative_path; + } + + $this->assertContains( '/frontend', $files ); + $this->assertContains( '/frontend/build', $files ); + $this->assertContains( '/frontend/build/output.js', $files, 'Negated path should be included' ); + $this->assertNotContains( '/frontend/source.ts', $files, 'Ignored file should not be included' ); + } + + /** + * Test getErrorForItem returns null when no error. + */ + public function test_get_error_for_item_returns_null() { + file_put_contents( $this->temp_dir . '/test.txt', 'test' ); + file_put_contents( $this->temp_dir . '/.distignore', '' ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + + $error = $filter_iter->getErrorForItem( '/test.txt' ); + + $this->assertNull( $error ); + } + + /** + * Test that multiple levels of directories are handled correctly. + */ + public function test_nested_directory_filtering() { + mkdir( $this->temp_dir . '/src' ); + mkdir( $this->temp_dir . '/src/components' ); + file_put_contents( $this->temp_dir . '/src/index.php', 'temp_dir . '/src/components/widget.php', 'temp_dir . '/.distignore', '' ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + $recursive_iter = new RecursiveIteratorIterator( $filter_iter, RecursiveIteratorIterator::SELF_FIRST ); + + $files = []; + foreach ( $recursive_iter as $item ) { + $relative_path = str_replace( $this->temp_dir, '', $item->getPathname() ); + $files[] = $relative_path; + } + + $this->assertContains( '/src', $files ); + $this->assertContains( '/src/components', $files ); + $this->assertContains( '/src/index.php', $files ); + $this->assertContains( '/src/components/widget.php', $files ); + } + + /** + * Test that children share the same cache and excluded files arrays. + */ + public function test_children_share_state() { + mkdir( $this->temp_dir . '/level1' ); + mkdir( $this->temp_dir . '/level1/level2' ); + file_put_contents( $this->temp_dir . '/level1/file1.txt', 'test' ); + file_put_contents( $this->temp_dir . '/level1/level2/file2.log', 'test' ); + file_put_contents( $this->temp_dir . '/.distignore', "*.log\n" ); + + $checker = new GitIgnoreChecker( $this->temp_dir, '.distignore' ); + $directory_iter = new RecursiveDirectoryIterator( $this->temp_dir, RecursiveDirectoryIterator::SKIP_DOTS ); + $filter_iter = new Distignore_Filter_Iterator( $directory_iter, $checker, $this->temp_dir ); + $recursive_iter = new RecursiveIteratorIterator( $filter_iter, RecursiveIteratorIterator::SELF_FIRST ); + + // Iterate to populate excluded files. + iterator_to_array( $recursive_iter ); + + $excluded = $filter_iter->getExcludedFiles(); + + // The .log file in level2 should be tracked even though it was found by a child iterator. + $this->assertContains( '/level1/level2/file2.log', $excluded ); + } +}