diff --git a/.gitignore b/.gitignore index 81b4d6ab..f7384e22 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ composer.lock var __solr docs/_build +.idea/ diff --git a/README.md b/README.md index cba8ce7a..5b1a53d7 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,38 @@ for more details on specific ones. netgen_ibexa_search_extra: use_loading_search_result_extractor: true ``` + +- Indexable implementation for [`BinaryFile`](https://github.com/netgen/ibexa-search-extra/blob/master/lib/Core/FieldType/BinaryFile/SearchField.php) + + This implementation enables file indexing by using Apache Tika to extract text + from a file and index that text. Apache Tika can run either as a server or as + an executable JAR file - both modes are supported. File indexing is disabled by default. + To enable it, use one of the following two configurations: + + ```yaml + # Apache Tika as an executable JAR file + netgen_ibexa_search_extra: + file_indexing: + enabled: true # is false by default + apache_tika: + mode: cli + path: '/path/to/tika-app.jar' # required when mode is cli + allowed_mime_types: # default value - not necessary here + - application/pdf + ``` + + ```yaml + # Apache Tika running as a server + netgen_ibexa_search_extra: + file_indexing: + enabled: true # is false by default + apache_tika: + mode: server # default value + host: '127.0.0.1' # default value + port: '9998' # default value + allowed_mime_types: # default value - not necessary here + - application/pdf + ``` ## Installation diff --git a/bundle/DependencyInjection/Configuration.php b/bundle/DependencyInjection/Configuration.php index 456d7b8e..687ed3b2 100644 --- a/bundle/DependencyInjection/Configuration.php +++ b/bundle/DependencyInjection/Configuration.php @@ -31,13 +31,14 @@ public function getConfigTreeBuilder(): TreeBuilder $this->addFulltextBoostSection($rootNode); $this->addUsePageIndexingSection($rootNode); $this->addPageIndexingSection($rootNode); + $this->addFileIndexingSection($rootNode); return $treeBuilder; } 
/**
 * Registers the `file_indexing` section of the bundle configuration.
 *
 * The section holds an `enabled` flag (false by default) and an `apache_tika`
 * subsection with `mode` ("cli" or "server", default "server"), `path` to the
 * JAR file, `host` and `port` of the server, and `allowed_mime_types` (defaults
 * to PDF only).
 *
 * Two cross-field rules are enforced on `apache_tika`: cli mode needs a
 * non-empty `path`, server mode needs non-empty `host` and `port`.
 */
private function addFileIndexingSection(ArrayNodeDefinition $nodeDefinition): void
{
    $supportedModes = ['cli', 'server'];

    // cli mode has nothing to execute without a JAR path.
    $isCliWithoutPath = static fn ($tika): bool => $tika['mode'] === 'cli' && empty($tika['path']);

    // server mode needs both halves of the address.
    $isServerWithoutAddress = static fn ($tika): bool => $tika['mode'] === 'server'
        && (empty($tika['host']) || empty($tika['port']));

    $nodeDefinition
        ->children()
            ->arrayNode('file_indexing')
                ->info('Configuration for file indexing')
                ->addDefaultsIfNotSet()
                ->children()
                    ->booleanNode('enabled')
                        ->info('Use file indexing')
                        ->defaultFalse()
                    ->end()
                    ->arrayNode('apache_tika')
                        ->info('Apache Tika configuration')
                        ->addDefaultsIfNotSet()
                        ->children()
                            ->scalarNode('mode')
                                ->info('Choose either cli or server')
                                ->defaultValue('server')
                                ->validate()
                                    ->ifNotInArray($supportedModes)
                                    ->thenInvalid('Parameter `mode` must be either "cli" or "server"')
                                ->end()
                            ->end()
                            ->scalarNode('path')
                                ->info('Path to the Apache Tika JAR file')
                                ->defaultNull()
                            ->end()
                            ->scalarNode('host')
                                ->defaultValue('127.0.0.1')
                            ->end()
                            ->scalarNode('port')
                                ->defaultValue('9998')
                            ->end()
                            ->arrayNode('allowed_mime_types')
                                ->info('List of allowed MIME types for text extraction')
                                ->scalarPrototype()->end()
                                ->defaultValue(['application/pdf'])
                            ->end()
                        ->end()
                        ->validate()
                            ->ifTrue($isCliWithoutPath)
                            ->thenInvalid('Parameter `path` must be specified when `mode` is "cli".')
                        ->end()
                        ->validate()
                            ->ifTrue($isServerWithoutAddress)
                            ->thenInvalid('Both parameters `host` and `port` must be specified when `mode` is "server".')
                        ->end()
                    ->end()
                ->end()
            ->end();
}
/**
 * Exposes the processed `file_indexing` configuration as container parameters
 * and, when file indexing is enabled, registers the `apache_tika.client` service.
 *
 * The client is a CLIClient built from the JAR path when `mode` is "cli";
 * for any other mode it is a WebClient built from host and port.
 *
 * @throws \RuntimeException when file indexing is enabled in cli mode and `path` is null
 */
private function processFileIndexingConfiguration(array $configuration, ContainerBuilder $container): void
{
    $fileIndexing = $configuration['file_indexing'];
    $apacheTika = $fileIndexing['apache_tika'];

    $parameters = [
        'netgen_ibexa_search_extra.file_indexing.enabled' => $fileIndexing['enabled'],
        'netgen_ibexa_search_extra.file_indexing.apache_tika.mode' => $apacheTika['mode'],
        'netgen_ibexa_search_extra.file_indexing.apache_tika.path' => $apacheTika['path'],
        'netgen_ibexa_search_extra.file_indexing.apache_tika.host' => $apacheTika['host'],
        'netgen_ibexa_search_extra.file_indexing.apache_tika.port' => $apacheTika['port'],
        'netgen_ibexa_search_extra.file_indexing.apache_tika.allowed_mime_types' => $apacheTika['allowed_mime_types'],
    ];

    foreach ($parameters as $name => $value) {
        $container->setParameter($name, $value);
    }

    // Parameters are always published; the client service exists only when the feature is on.
    if (!$fileIndexing['enabled']) {
        return;
    }

    if ($apacheTika['mode'] === 'cli') {
        $jarPath = $apacheTika['path']
            ?? throw new \RuntimeException(
                'File indexing Apache Tika config: mode is set to cli, but path to JAR file is not set.',
            );
        $clientDefinition = new Definition(CLIClient::class, [$jarPath]);
    } else {
        $clientDefinition = new Definition(
            WebClient::class,
            [
                $apacheTika['host'] ?? '127.0.0.1',
                $apacheTika['port'] ?? '9998',
            ],
        );
    }

    $clientDefinition->setPublic(true);
    $container->setDefinition('apache_tika.client', $clientDefinition);
}
/**
 * @param Indexable $innerField wrapped implementation that every call is delegated to
 * @param FileTextExtractor $fileTextExtractor extracts text from the file behind a field
 * @param bool $fileIndexingEnabled when false, only the inner field's index data is returned
 */
public function __construct(
    private readonly Indexable $innerField,
    private readonly FileTextExtractor $fileTextExtractor,
    private readonly bool $fileIndexingEnabled,
) {}

/**
 * Returns the inner field's index data, followed by a `file_text` full text
 * field when file indexing is enabled and the extracted text is not empty.
 */
public function getIndexData(Field $field, FieldDefinition $fieldDefinition): array
{
    $indexData = $this->innerField->getIndexData($field, $fieldDefinition);

    if ($this->fileIndexingEnabled) {
        $fileText = $this->extractText($field);

        if ($fileText !== '') {
            $indexData[] = new Search\Field(
                'file_text',
                $fileText,
                new Search\FieldType\FullTextField(),
            );
        }
    }

    return $indexData;
}

/** Delegates to the inner field. */
public function getIndexDefinition(): array
{
    return $this->innerField->getIndexDefinition();
}

/** Delegates to the inner field. */
public function getDefaultMatchField(): ?string
{
    return $this->innerField->getDefaultMatchField();
}

/** Delegates to the inner field. */
public function getDefaultSortField(): ?string
{
    return $this->innerField->getDefaultSortField();
}

/**
 * Returns the trimmed text extracted from the field's file.
 *
 * Only the most recent result is remembered, keyed by field ID and version
 * number: a miss discards the previous entry before extracting again.
 */
private function extractText(Field $field): string
{
    $fieldId = $field->id;
    $versionNo = $field->versionNo;

    if (isset($this->cache[$fieldId][$versionNo])) {
        return $this->cache[$fieldId][$versionNo];
    }

    // Drop the previous entry before extracting, so a failed extraction leaves the cache empty.
    $this->cache = [];
    $extracted = trim($this->fileTextExtractor->extractFromPersistenceField($field));
    $this->cache[$fieldId][$versionNo] = $extracted;

    return $extracted;
}
/**
 * Extracts text from the file referenced by the given field value.
 *
 * @return string the extracted text, or an empty string when the value has no file ID
 */
public function extractFromValue(Value $value): string
{
    return $value->id === null ? '' : $this->extractByFileId($value->id);
}

/**
 * Extracts text from the binary file with the given ID using Apache Tika.
 *
 * Returns an empty string without touching the file when no Apache Tika
 * client is available or when the file's MIME type is not in the allowed list.
 *
 * @throws RuntimeException when the Apache Tika client fails; the caught
 *                          exception is attached as the previous exception
 */
public function extractByFileId(string $fileId): string
{
    if ($this->apacheTikaClient === null) {
        return '';
    }

    $mimeType = $this->binaryFileIoService->getMimeType($fileId);

    if (!in_array($mimeType, $this->allowedMimeTypes, true)) {
        return '';
    }

    $file = $this->binaryFileIoService->loadBinaryFile($fileId);

    try {
        // NOTE(review): the path is built by prefixing the file URI with "public",
        // which presumably relies on the working directory being the project root - confirm.
        return $this->apacheTikaClient->getText(sprintf('public%s', $file->uri));
    } catch (\Exception $e) {
        // Chain the original exception so its type and stack trace are not lost.
        throw new RuntimeException(
            sprintf(
                'Could not extract text from file with ID "%s": %s (%s)',
                $fileId,
                $e->getMessage(),
                $e->getCode(),
            ),
            previous: $e,
        );
    }
}