Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
67f4b41
NGSTACK-809 add FileTextExtractor service that uses Apache Tika to ex…
AntePrkacin Jul 9, 2025
2640e24
NGSTACK-809 override Indexable BinaryFile SearchField by adding 'file…
AntePrkacin Jul 9, 2025
12915d1
NGSTACK-809 add Apache Tika jar file
AntePrkacin Jul 9, 2025
f9d8e96
NGSTACK-809 add new config params for 'file_text_extraction' key to b…
AntePrkacin Jul 9, 2025
802748f
NGSTACK-809 remove tika-app.jar file from bin directory
AntePrkacin Jul 11, 2025
f492a5d
NGSTACK-809 require vaites/php-apache-tika bundle
AntePrkacin Jul 11, 2025
061152d
NGSTACK-809 update config params for apache_tika node and instantiate…
AntePrkacin Jul 12, 2025
0cea80a
NGSTACK-809 update FileTextExtractor service to use newly required Va…
AntePrkacin Jul 12, 2025
f9e5121
NGSTACK-809 change configuration params to include 'file_indexing' an…
Sep 24, 2025
ac0745c
NGSTACK-809 inject fileIndexingEnabled config param into BinaryFile/S…
Sep 24, 2025
57a884b
NGSTACK-809 extract service definitions for SearchField and FileTextE…
Sep 24, 2025
d0e8115
NGSTACK-809 move 'vaites/php-apache-tika' bundle into 'suggest' part …
Sep 24, 2025
cdb458f
NGSTACK-809 update .gitignore file with .idea folder
Sep 24, 2025
69ccef6
NGSTACK-809 add defaults for 'file_indexing' configuration
Sep 24, 2025
349a7a1
NGSTACK-809 make 'apache.tika_client' dependency optional for FileTex…
Sep 24, 2025
796d58c
NGSTACK-809 update README.md with file_indexing info and config
Sep 24, 2025
4a53f1a
NGSTACK-809 remove FullTextField from getIndexDefinition() method in …
Sep 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ composer.lock
var
__solr
docs/_build
.idea/
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,38 @@ for more details on specific ones.
netgen_ibexa_search_extra:
use_loading_search_result_extractor: true
```

- Indexable implementation for [`BinaryFile`](https://github.com/netgen/ibexa-search-extra/blob/master/lib/Core/FieldType/BinaryFile/SearchField.php)

This implementation enables file indexing: Apache Tika extracts the text from a
file, and that text is then indexed. Apache Tika can run either as a server or as
an executable JAR file; both modes are supported. The optional `vaites/php-apache-tika`
package must be installed for this to work. File indexing is disabled by default.
To enable it, use one of the following two configurations:

```yaml
# Apache Tika as an executable jar file
netgen_ibexa_search_extra:
file_indexing:
enabled: true # is false by default
apache_tika:
mode: cli
path: '<path/to/jar/file>'
allowed_mime_types: # default value - not necessary here
- application/pdf
```

```yaml
# Apache Tika being run as a server
netgen_ibexa_search_extra:
file_indexing:
enabled: true # is false by default
apache_tika:
mode: server # default value
host: '127.0.0.1' # default value
port: '9998' # default value
allowed_mime_types: # default value - not necessary here
- application/pdf
```

## Installation

Expand Down
68 changes: 64 additions & 4 deletions bundle/DependencyInjection/Configuration.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,14 @@ public function getConfigTreeBuilder(): TreeBuilder
$this->addFulltextBoostSection($rootNode);
$this->addUsePageIndexingSection($rootNode);
$this->addPageIndexingSection($rootNode);
$this->addFileIndexingSection($rootNode);

return $treeBuilder;
}

private function addIndexableFieldTypeSection(ArrayNodeDefinition $nodeDefinition): void
{
/** @noinspection NullPointerExceptionInspection */
/* @noinspection NullPointerExceptionInspection */
$nodeDefinition
->children()
->arrayNode('indexable_field_type')
Expand All @@ -61,7 +62,7 @@ private function addIndexableFieldTypeSection(ArrayNodeDefinition $nodeDefinitio

private function addSearchResultExtractorSection(ArrayNodeDefinition $nodeDefinition): void
{
/** @noinspection NullPointerExceptionInspection */
/* @noinspection NullPointerExceptionInspection */
$nodeDefinition
->children()
->booleanNode('use_loading_search_result_extractor')
Expand All @@ -73,7 +74,7 @@ private function addSearchResultExtractorSection(ArrayNodeDefinition $nodeDefini

private function addAsynchronousIndexingSection(ArrayNodeDefinition $nodeDefinition): void
{
/** @noinspection NullPointerExceptionInspection */
/* @noinspection NullPointerExceptionInspection */
$nodeDefinition
->children()
->booleanNode('use_asynchronous_indexing')
Expand All @@ -85,7 +86,7 @@ private function addAsynchronousIndexingSection(ArrayNodeDefinition $nodeDefinit

private function addFulltextBoostSection(ArrayNodeDefinition $nodeDefinition): void
{
/** @noinspection NullPointerExceptionInspection */
/* @noinspection NullPointerExceptionInspection */
$nodeDefinition
->children()
->arrayNode('fulltext')
Expand Down Expand Up @@ -249,4 +250,63 @@ private function addPageIndexingSection(ArrayNodeDefinition $nodeDefinition): vo
->end()
->end();
}

/**
 * Adds the `file_indexing` section to the bundle configuration tree.
 *
 * The section holds an `enabled` flag (false by default) and an `apache_tika`
 * node with `mode`, `path`, `host`, `port` and `allowed_mime_types`. Two
 * validation rules on `apache_tika` reject a `cli` mode without a path and a
 * `server` mode without host or port.
 */
private function addFileIndexingSection(ArrayNodeDefinition $nodeDefinition): void
{
    $supportedModes = ['cli', 'server'];
    $defaultMimeTypes = [
        'application/pdf',
    ];

    // True when the CLI client is selected but no JAR path was given.
    $isCliWithoutPath = static fn ($v): bool => $v['mode'] === 'cli' && empty($v['path']);

    // True when the server client is selected but host or port is missing.
    $isServerWithoutEndpoint = static fn ($v): bool => $v['mode'] === 'server'
        && (empty($v['host']) || empty($v['port']));

    $nodeDefinition
        ->children()
            ->arrayNode('file_indexing')
                ->addDefaultsIfNotSet()
                ->info('Configuration for file indexing')
                ->children()
                    ->booleanNode('enabled')
                        ->info('Use file indexing')
                        ->defaultFalse()
                    ->end()
                    ->arrayNode('apache_tika')
                        ->addDefaultsIfNotSet()
                        ->info('Apache Tika configuration')
                        ->children()
                            ->scalarNode('mode')
                                ->info('Choose either cli or server')
                                ->defaultValue('server')
                                ->validate()
                                    ->ifNotInArray($supportedModes)
                                    ->thenInvalid('Parameter `mode` must be either "cli" or "server"')
                                ->end()
                            ->end()
                            ->scalarNode('path')
                                ->info('Path to the Apache Tika JAR file')
                                ->defaultNull()
                            ->end()
                            ->scalarNode('host')
                                ->defaultValue('127.0.0.1')
                            ->end()
                            ->scalarNode('port')
                                ->defaultValue('9998')
                            ->end()
                            ->arrayNode('allowed_mime_types')
                                ->info('List of allowed MIME types for text extraction')
                                ->scalarPrototype()->end()
                                ->defaultValue($defaultMimeTypes)
                            ->end()
                        ->end()
                        ->validate()
                            ->ifTrue($isCliWithoutPath)
                            ->thenInvalid('Parameter `path` must be specified when `mode` is "cli".')
                        ->end()
                        ->validate()
                            ->ifTrue($isServerWithoutEndpoint)
                            ->thenInvalid('Both parameters `host` and `port` must be specified when `mode` is "server".')
                        ->end()
                    ->end()
                ->end()
            ->end();
}
}
55 changes: 55 additions & 0 deletions bundle/DependencyInjection/NetgenIbexaSearchExtraExtension.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
use Symfony\Component\Config\FileLocator;
use Symfony\Component\Config\Resource\FileResource;
use Symfony\Component\DependencyInjection\ContainerBuilder;
use Symfony\Component\DependencyInjection\Definition;
use Symfony\Component\DependencyInjection\Extension\PrependExtensionInterface;
use Symfony\Component\DependencyInjection\Loader;
use Symfony\Component\HttpKernel\DependencyInjection\Extension;
use Symfony\Component\Yaml\Yaml;
use Vaites\ApacheTika\Clients\CLIClient;
use Vaites\ApacheTika\Clients\WebClient;

use function array_key_exists;
use function file_get_contents;
Expand Down Expand Up @@ -52,6 +55,10 @@ public function load(array $configs, ContainerBuilder $container): void
$loader->load('search/elasticsearch_services.yaml');
}

if (class_exists(CLIClient::class) && class_exists(WebClient::class)) {
$loader->load('search/file_indexing.yaml');
}

$loader->load('search/common.yaml');

$this->processExtensionConfiguration($configs, $container);
Expand Down Expand Up @@ -94,6 +101,7 @@ private function processExtensionConfiguration(array $configs, ContainerBuilder
$this->processFullTextBoostConfiguration($configuration, $container);
$this->processUsePageIndexingConfiguration($configuration, $container);
$this->processPageIndexingConfiguration($configuration, $container);
$this->processFileIndexingConfiguration($configuration, $container);
}

private function processSearchResultExtractorConfiguration(array $configuration, ContainerBuilder $container): void
Expand Down Expand Up @@ -157,4 +165,51 @@ private function processPageIndexingConfiguration(array $configuration, Containe
$configuration['page_indexing']['enabled'] ?? false,
);
}

/**
 * Exposes the `file_indexing` configuration as container parameters and, when
 * file indexing is enabled, registers the public `apache_tika.client` service.
 *
 * The client is a CLIClient (constructed with the JAR path) in `cli` mode and
 * a WebClient (constructed with host and port) otherwise.
 *
 * @param array $configuration Processed bundle configuration; must contain the `file_indexing` key
 *
 * @throws \RuntimeException When file indexing is enabled but the Apache Tika client classes
 *                           are not available, or when `cli` mode is selected without a JAR path
 */
private function processFileIndexingConfiguration(array $configuration, ContainerBuilder $container): void
{
    $fileIndexing = $configuration['file_indexing'];
    $apacheTika = $fileIndexing['apache_tika'];

    $container->setParameter(
        'netgen_ibexa_search_extra.file_indexing.enabled',
        $fileIndexing['enabled'],
    );

    foreach (['mode', 'path', 'host', 'port', 'allowed_mime_types'] as $key) {
        $container->setParameter(
            'netgen_ibexa_search_extra.file_indexing.apache_tika.' . $key,
            $apacheTika[$key],
        );
    }

    if (!$fileIndexing['enabled']) {
        return;
    }

    // The client package is optional. Without it, registering a definition for a
    // missing class would leave file indexing silently non-functional, so fail
    // early with an actionable message instead.
    if (!class_exists(CLIClient::class) || !class_exists(WebClient::class)) {
        throw new \RuntimeException(
            'File indexing is enabled, but the Apache Tika client classes are not available. '
            . 'Try running "composer require vaites/php-apache-tika".',
        );
    }

    if ($apacheTika['mode'] === 'cli') {
        $path = $apacheTika['path']
            ?? throw new \RuntimeException(
                'File indexing Apache Tika config: mode is set to cli, but path to JAR file is not set.',
            );
        $apacheTikaClient = new Definition(CLIClient::class);
        $apacheTikaClient->setArguments([$path]);
    } else {
        $host = $apacheTika['host'] ?? '127.0.0.1';
        $port = $apacheTika['port'] ?? '9998';
        $apacheTikaClient = new Definition(WebClient::class);
        $apacheTikaClient->setArguments([$host, $port]);
    }

    $apacheTikaClient->setPublic(true);
    $container->setDefinition('apache_tika.client', $apacheTikaClient);
}
}
6 changes: 4 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
"ext-curl": "*",
"ibexa/core": "^4.6",
"symfony/messenger": "^5.4",
"symfony/proxy-manager-bridge": "^5.4"
"symfony/proxy-manager-bridge": "^5.4",
"vaites/php-apache-tika": "^1.4"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can make this optional if we check in the bundle Extension whether the package exists. The dependency should then be moved to suggest section of composer.json.

Copy link
Author

@AntePrkacin AntePrkacin Sep 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

d0e8115
Moved vaites/php-apache-tika bundle under "suggest" section of composer.json file. In the bundle Extension, I check if the CLIClient and WebClient classes from vaites/php-apache-tika bundle exist and then load search/file_indexing.yaml.

},
"require-dev": {
"ibexa/fieldtype-richtext": "^4.5",
Expand All @@ -34,7 +35,8 @@
"suggest": {
"netgen/ibexa-site-api": "Boost your site-building productivity with Ibexa CMS",
"ibexa/solr": "Supports advanced capabilities with Ibexa search API",
"ibexa/elasticsearch": "Supports advanced capabilities with Ibexa search API"
"ibexa/elasticsearch": "Supports advanced capabilities with Ibexa search API",
"vaites/php-apache-tika": "Enables file indexing by extracting text from files with Apache Tika (^1.4)"
},
"autoload": {
"psr-4": {
Expand Down
74 changes: 74 additions & 0 deletions lib/Core/FieldType/BinaryFile/SearchField.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
<?php

declare(strict_types=1);

namespace Netgen\IbexaSearchExtra\Core\FieldType\BinaryFile;

use Ibexa\Contracts\Core\FieldType\Indexable;
use Ibexa\Contracts\Core\Persistence\Content\Field;
use Ibexa\Contracts\Core\Persistence\Content\Type\FieldDefinition;
use Ibexa\Contracts\Core\Search;
use Netgen\IbexaSearchExtra\Core\Search\Common\PageIndexing\TextExtractor\FileTextExtractor;

use function trim;

/**
 * Indexable definition for BinaryFile field type.
 *
 * Wraps another Indexable implementation and, when file indexing is enabled,
 * appends a `file_text` full text field holding the text extracted from the file.
 */
final class SearchField implements Indexable
{
    /**
     * Holds the most recently extracted text, keyed by field ID and version number.
     *
     * @var array<int, array<int, string>>
     */
    private array $cache = [];

    /**
     * @param Indexable $innerField Implementation that all index definitions and base index data are delegated to
     * @param FileTextExtractor $fileTextExtractor Extracts the text from the field's file
     * @param bool $fileIndexingEnabled When false, only the inner implementation's index data is returned
     */
    public function __construct(
        private readonly Indexable $innerField,
        private readonly FileTextExtractor $fileTextExtractor,
        private readonly bool $fileIndexingEnabled,
    ) {}

    /**
     * Returns the inner implementation's index data, extended with a `file_text`
     * full text field when file indexing is enabled and the extracted text is not empty.
     */
    public function getIndexData(Field $field, FieldDefinition $fieldDefinition): array
    {
        $indexData = $this->innerField->getIndexData($field, $fieldDefinition);

        if ($this->fileIndexingEnabled) {
            $extractedText = $this->extractText($field);

            if ($extractedText !== '') {
                $indexData[] = new Search\Field(
                    'file_text',
                    $extractedText,
                    new Search\FieldType\FullTextField(),
                );
            }
        }

        return $indexData;
    }

    /**
     * Delegates to the inner implementation.
     */
    public function getIndexDefinition(): array
    {
        return $this->innerField->getIndexDefinition();
    }

    /**
     * Delegates to the inner implementation.
     */
    public function getDefaultMatchField(): ?string
    {
        return $this->innerField->getDefaultMatchField();
    }

    /**
     * Delegates to the inner implementation.
     */
    public function getDefaultSortField(): ?string
    {
        return $this->innerField->getDefaultSortField();
    }

    /**
     * Returns the trimmed text extracted from the given field's file.
     *
     * Only the last extracted field version is kept in the cache: a miss
     * discards the previous entry before the new text is extracted.
     */
    private function extractText(Field $field): string
    {
        $fieldId = $field->id;
        $versionNo = $field->versionNo;

        if (isset($this->cache[$fieldId][$versionNo])) {
            return $this->cache[$fieldId][$versionNo];
        }

        // Reset before extracting, so the cache never holds more than one entry.
        $this->cache = [];
        $this->cache[$fieldId][$versionNo] = trim(
            $this->fileTextExtractor->extractFromPersistenceField($field),
        );

        return $this->cache[$fieldId][$versionNo];
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
<?php

declare(strict_types=1);

namespace Netgen\IbexaSearchExtra\Core\Search\Common\PageIndexing\TextExtractor;

use Ibexa\Contracts\Core\Persistence\Content\Field;
use Ibexa\Core\FieldType\BinaryBase\Value;
use Ibexa\Core\IO\IOServiceInterface;
use RuntimeException;
use Vaites\ApacheTika\Client as ApacheTikaClient;

use function in_array;
use function sprintf;

/**
 * Extract text from Ibexa file based field types.
 *
 * Extraction is delegated to an Apache Tika client and is attempted only for
 * files whose MIME type is in the configured list of allowed types.
 */
final class FileTextExtractor
{
    /**
     * @param IOServiceInterface $binaryFileIoService IO service used to resolve the MIME type and load the binary file
     * @param ApacheTikaClient|null $apacheTikaClient Apache Tika client; when null, every extraction returns an empty string
     * @param array $allowedMimeTypes MIME types for which extraction is attempted (compared strictly)
     */
    public function __construct(
        private readonly IOServiceInterface $binaryFileIoService,
        private readonly ?ApacheTikaClient $apacheTikaClient,
        private readonly array $allowedMimeTypes,
    ) {}

    /**
     * Extracts text from the file referenced by the given persistence field.
     *
     * @return string Extracted text, or an empty string when the field's external data has no file ID
     *
     * @throws RuntimeException When the text extraction fails
     */
    public function extractFromPersistenceField(Field $field): string
    {
        return isset($field->value->externalData['id'])
            ? $this->extractByFileId($field->value->externalData['id'])
            : '';
    }

    /**
     * Extracts text from the file referenced by the given field type value.
     *
     * @return string Extracted text, or an empty string when the value has no file ID
     *
     * @throws RuntimeException When the text extraction fails
     */
    public function extractFromValue(Value $value): string
    {
        return $value->id === null ? '' : $this->extractByFileId($value->id);
    }

    /**
     * Extracts text from the binary file with the given ID.
     *
     * @return string Extracted text, or an empty string when no Apache Tika client
     *                is available or the file's MIME type is not allowed
     *
     * @throws RuntimeException When the Apache Tika client fails; the original exception is chained as previous
     */
    public function extractByFileId(string $fileId): string
    {
        if ($this->apacheTikaClient === null) {
            return '';
        }

        $mimeType = $this->binaryFileIoService->getMimeType($fileId);

        if (!in_array($mimeType, $this->allowedMimeTypes, true)) {
            return '';
        }

        $file = $this->binaryFileIoService->loadBinaryFile($fileId);

        try {
            // NOTE(review): the 'public' prefix presumably maps the file URI to a path relative
            // to the project root - confirm this holds for the configured IO handler and working directory.
            return $this->apacheTikaClient->getText(sprintf('public%s', $file->uri));
        } catch (\Exception $e) {
            // Chain the original exception so its type and stack trace are not lost.
            throw new RuntimeException(
                sprintf(
                    'Could not extract text from file with ID "%s": %s (%s)',
                    $fileId,
                    $e->getMessage(),
                    $e->getCode(),
                ),
                previous: $e,
            );
        }
    }
}
Loading
Loading