Commit 3f4afe2

Committed Apr 10, 2024
Add targets_input field to azure_blob_storage input
1 parent bb359ee · commit 3f4afe2

File tree: 5 files changed, +237 −12 lines
CHANGELOG.md (+1)

```diff
@@ -14,6 +14,7 @@ All notable changes to this project will be documented in this file.
 - Field `auto_replay_nacks` added to all inputs that traditionally automatically retry nacked messages as a toggle for this behaviour.
 - New `retry` processor.
 - New `noop` cache.
+- Field `targets_input` added to the `azure_blob_storage` input.
 
 ### Fixed
```

internal/docs/format_yaml.go (+6)

```diff
@@ -725,6 +725,8 @@ func (f FieldSpec) ToYAML(recurse bool) (*yaml.Node, error) {
 		}
 		return &node, nil
 	}
+
+	_, isCore := f.Type.IsCoreComponent()
 	if f.Kind == KindArray || f.Kind == Kind2DArray {
 		s := []any{}
 		if err := node.Encode(s); err != nil {
@@ -738,6 +740,10 @@ func (f FieldSpec) ToYAML(recurse bool) (*yaml.Node, error) {
 		if err := node.Encode(s); err != nil {
 			return nil, err
 		}
+	} else if isCore {
+		if err := node.Encode(nil); err != nil {
+			return nil, err
+		}
+	} else {
 		if len(f.Examples) > 0 {
 			if err := node.Encode(f.Examples[0]); err != nil {
```
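The effect of this change is that optional core-component fields (such as the new `targets_input`) are rendered with an explicit `null` in generated config examples. A minimal sketch of the mechanism, assuming the `yaml.Node` type here comes from gopkg.in/yaml.v3: encoding `nil` into a node should produce a `null` scalar.

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

func main() {
	// Encoding nil into a yaml.Node should yield a `null` scalar node, which
	// is how an optional component field ends up rendered as `field: null`
	// in a generated example config.
	var node yaml.Node
	if err := node.Encode(nil); err != nil {
		panic(err)
	}

	out, err := yaml.Marshal(&node)
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out)) // null
}
```

This matches the regenerated docs below, where the new field appears as `targets_input: null # No default (optional)`.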

internal/impl/azure/input_blob_storage.go (+160, −11)

```diff
@@ -8,11 +8,13 @@ import (
 	"net/http"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore"
 	"github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
 	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
+	"github.com/Jeffail/gabs/v2"
 
 	"github.com/benthosdev/benthos/v4/internal/codec"
 	"github.com/benthosdev/benthos/v4/internal/codec/interop"
@@ -26,13 +28,15 @@ const (
 	bsiFieldContainer     = "container"
 	bsiFieldPrefix        = "prefix"
 	bsiFieldDeleteObjects = "delete_objects"
+	bsiFieldTargetsInput  = "targets_input"
 )
 
 type bsiConfig struct {
 	client        *azblob.Client
 	Container     string
 	Prefix        string
 	DeleteObjects bool
+	FileReader    *service.OwnedInput
 	Codec         interop.FallbackReaderCodec
 }
 
@@ -57,6 +61,11 @@ func bsiConfigFromParsed(pConf *service.ParsedConfig) (conf bsiConfig, err error
 	if conf.DeleteObjects, err = pConf.FieldBool(bsiFieldDeleteObjects); err != nil {
 		return
 	}
+	if pConf.Contains(bsiFieldTargetsInput) {
+		if conf.FileReader, err = pConf.FieldInput(bsiFieldTargetsInput); err != nil {
+			return
+		}
+	}
 	return
 }
 
@@ -79,7 +88,11 @@ If the `+"`storage_connection_string`"+` does not contain the `+"`AccountName`"+`
 
 ## Downloading Large Files
 
-When downloading large files it's often necessary to process it in streamed parts in order to avoid loading the entire file in memory at a given time. In order to do this a `+"[`codec`](#codec)"+` can be specified that determines how to break the input into smaller individual messages.
+When downloading large files it's often necessary to process it in streamed parts in order to avoid loading the entire file in memory at a given time. In order to do this a `+"[`scanner`](#scanner)"+` can be specified that determines how to break the input into smaller individual messages.
+
+## Streaming New Files
+
+By default this input will consume all files found within the target container and will then gracefully terminate. This is referred to as a "batch" mode of operation. However, it's possible to instead configure a container as [an Event Grid source](https://learn.microsoft.com/en-gb/azure/event-grid/event-schema-blob-storage) and then use this as a `+"[`targets_input`](#targetsinput)"+`, in which case new files are consumed as they're uploaded and Benthos will continue listening for and downloading files as they arrive. This is referred to as a "streamed" mode of operation.
 
 ## Metadata
 
@@ -109,6 +122,34 @@ You can access these metadata fields using [function interpolation](/docs/config
 			Description("Whether to delete downloaded objects from the blob once they are processed.").
 			Advanced().
 			Default(false),
+		service.NewInputField(bsiFieldTargetsInput).
+			Description("EXPERIMENTAL: An optional source of download targets, configured as a [regular Benthos input](/docs/components/inputs/about). Each message yielded by this input should be a single structured object containing a field `name`, which represents the blob to be downloaded.").
+			Optional().
+			Version("4.27.0").
+			Example(map[string]any{
+				"mqtt": map[string]any{
+					"urls": []any{
+						"example.westeurope-1.ts.eventgrid.azure.net:8883",
+					},
+					"topics": []any{
+						"some-topic",
+					},
+				},
+				"processors": []any{
+					map[string]any{
+						"unarchive": map[string]any{
+							"format": "json_array",
+						},
+					},
+					map[string]any{
+						"mapping": `if this.eventType == "Microsoft.Storage.BlobCreated" {
+  root.name = this.data.url.parse_url().path.trim_prefix("/foocontainer/")
+} else {
+  root = deleted()
+}`,
+					},
+				},
+			}),
 	)
 }
 
@@ -120,11 +161,15 @@ func init() {
 			return nil, err
 		}
 
-		rdr, err := newAzureBlobStorage(conf, res.Logger())
-		if err != nil {
+		var rdr service.BatchInput
+		if rdr, err = newAzureBlobStorage(conf, res.Logger()); err != nil {
 			return nil, err
 		}
-		return service.AutoRetryNacksBatched(rdr), nil
+
+		if conf.FileReader == nil {
+			rdr = service.AutoRetryNacksBatched(rdr)
+		}
+		return rdr, nil
 	})
 	if err != nil {
 		panic(err)
@@ -179,13 +224,117 @@ type azurePendingObject struct {
 	scanner interop.FallbackReaderStream
 }
 
-type azureTargetReader struct {
+type azureTargetReader interface {
+	Pop(ctx context.Context) (*azureObjectTarget, error)
+	Close(context.Context) error
+}
+
+func newAzureTargetReader(ctx context.Context, logger *service.Logger, conf bsiConfig) (azureTargetReader, error) {
+	if conf.FileReader == nil {
+		return newAzureTargetBatchReader(ctx, conf)
+	}
+	return &azureTargetStreamReader{
+		input: conf.FileReader,
+		log:   logger,
+	}, nil
+}
+
+//------------------------------------------------------------------------------
+
+type azureTargetStreamReader struct {
+	pending []*azureObjectTarget
+	input   *service.OwnedInput
+	log     *service.Logger
+}
+
+func (a *azureTargetStreamReader) Pop(ctx context.Context) (*azureObjectTarget, error) {
+	if len(a.pending) > 0 {
+		t := a.pending[0]
+		a.pending = a.pending[1:]
+		return t, nil
+	}
+
+	for {
+		next, ackFn, err := a.input.ReadBatch(ctx)
+		if err != nil {
+			if errors.Is(err, service.ErrEndOfInput) {
+				return nil, io.EOF
+			}
+			return nil, err
+		}
+
+		var pendingAcks int32
+		var nackOnce sync.Once
+		for _, msg := range next {
+			mStructured, err := msg.AsStructured()
+			if err != nil {
+				a.log.With("error", err).Error("Failed to extract structured object from targets input message")
+				continue
+			}
+
+			name, _ := gabs.Wrap(mStructured).S("name").Data().(string)
+			if name == "" {
+				a.log.Warn("Targets input yielded a message that did not contain a `name` field")
+				continue
+			}
+
+			pendingAcks++
+
+			var ackOnce sync.Once
+			a.pending = append(a.pending, &azureObjectTarget{
+				key: name,
+				ackFn: func(ctx context.Context, err error) (aerr error) {
+					if err != nil {
+						nackOnce.Do(func() {
+							// Prevent future acks from triggering a delete.
+							atomic.StoreInt32(&pendingAcks, -1)
+
+							// It's possible that this is called for one message
+							// at the _exact_ same time as another is acked, but
+							// if the acked message triggers a full ack of the
+							// origin message then even though it shouldn't be
+							// possible, it's also harmless.
+							aerr = ackFn(ctx, err)
+						})
+					} else {
+						ackOnce.Do(func() {
+							if atomic.AddInt32(&pendingAcks, -1) == 0 {
+								aerr = ackFn(ctx, nil)
+							}
+						})
+					}
+					return
+				},
+			})
+		}
+
+		if len(a.pending) > 0 {
+			t := a.pending[0]
+			a.pending = a.pending[1:]
+			return t, nil
+		} else {
+			// Ack the messages even though we didn't extract any valid names.
+			_ = ackFn(ctx, nil)
+		}
+	}
+}
+
+func (a *azureTargetStreamReader) Close(ctx context.Context) error {
+	for _, p := range a.pending {
+		_ = p.ackFn(ctx, errors.New("shutting down"))
+	}
+	return a.input.Close(ctx)
+}
+
+//------------------------------------------------------------------------------
+
+type azureTargetBatchReader struct {
 	pending []*azureObjectTarget
 	conf    bsiConfig
 	pager   *runtime.Pager[azblob.ListBlobsFlatResponse]
 }
 
-func newAzureTargetReader(ctx context.Context, conf bsiConfig) (*azureTargetReader, error) {
+func newAzureTargetBatchReader(ctx context.Context, conf bsiConfig) (*azureTargetBatchReader, error) {
 	var maxResults int32 = 100
 	params := &azblob.ListBlobsFlatOptions{
 		MaxResults: &maxResults,
@@ -194,7 +343,7 @@ func newAzureTargetReader(ctx context.Context, conf bsiConfig) (*azureTargetRead
 		params.Prefix = &conf.Prefix
 	}
 	pager := conf.client.NewListBlobsFlatPager(conf.Container, params)
-	staticKeys := azureTargetReader{conf: conf}
+	staticKeys := azureTargetBatchReader{conf: conf}
 	if pager.More() {
 		page, err := pager.NextPage(ctx)
 		if err != nil {
@@ -210,7 +359,7 @@ func newAzureTargetReader(ctx context.Context, conf bsiConfig) (*azureTargetRead
 	return &staticKeys, nil
 }
 
-func (s *azureTargetReader) Pop(ctx context.Context) (*azureObjectTarget, error) {
+func (s *azureTargetBatchReader) Pop(ctx context.Context) (*azureObjectTarget, error) {
 	if len(s.pending) == 0 && s.pager.More() {
 		s.pending = nil
 		page, err := s.pager.NextPage(ctx)
@@ -230,7 +379,7 @@ func (s *azureTargetReader) Pop(ctx context.Context) (*azureObjectTarget, error)
 		return obj, nil
 	}
 
-func (s azureTargetReader) Close(context.Context) error {
+func (s azureTargetBatchReader) Close(context.Context) error {
 	return nil
 }
 
@@ -240,7 +389,7 @@ type azureBlobStorage struct {
 	conf bsiConfig
 
 	objectScannerCtor interop.FallbackReaderCodec
-	keyReader         *azureTargetReader
+	keyReader         azureTargetReader
 
 	objectMut sync.Mutex
 	object    *azurePendingObject
@@ -259,7 +408,7 @@ func newAzureBlobStorage(conf bsiConfig, log *service.Logger) (*azureBlobStorage
 
 func (a *azureBlobStorage) Connect(ctx context.Context) error {
 	var err error
-	a.keyReader, err = newAzureTargetReader(ctx, a.conf)
+	a.keyReader, err = newAzureTargetReader(ctx, a.log, a.conf)
 	return err
 }
```
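The subtlest part of the new `azureTargetStreamReader` is its ack bookkeeping: a single upstream batch may yield many download targets, and the origin batch must be acked exactly once, only after every derived target has been processed, while any single failure nacks the origin immediately. A standalone sketch of that pattern, using a simplified ack signature and a hypothetical `fanOutAcks` helper (the name is illustrative, not from the commit):

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
)

type ackFunc func(ctx context.Context, err error) error

// fanOutAcks derives n child ack functions from a single origin ack. The
// origin is acked once after all n children ack successfully, and is nacked
// as soon as any one child nacks.
func fanOutAcks(n int, origin ackFunc) []ackFunc {
	pending := int32(n)
	var nackOnce sync.Once

	children := make([]ackFunc, n)
	for i := range children {
		var ackOnce sync.Once
		children[i] = func(ctx context.Context, err error) (aerr error) {
			if err != nil {
				nackOnce.Do(func() {
					// Poison the counter so later successful acks can never
					// bring it down to zero and double-resolve the origin.
					atomic.StoreInt32(&pending, -1)
					aerr = origin(ctx, err)
				})
			} else {
				ackOnce.Do(func() {
					if atomic.AddInt32(&pending, -1) == 0 {
						aerr = origin(ctx, nil)
					}
				})
			}
			return
		}
	}
	return children
}

func main() {
	origin := func(ctx context.Context, err error) error {
		fmt.Println("origin resolved with:", err)
		return nil
	}

	acks := fanOutAcks(3, origin)
	ctx := context.Background()
	_ = acks[0](ctx, nil)                           // two targets remaining
	_ = acks[1](ctx, errors.New("download failed")) // nacks the origin now
	_ = acks[2](ctx, nil)                           // harmless: counter is poisoned
}
```

Storing -1 on the first failure is why the commit's inline comment can call a racing success "harmless": once the counter is poisoned it can never reach zero, so the origin cannot also be acked.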

internal/impl/azure/integration_test.go (+35)

```diff
@@ -102,6 +102,41 @@ input:
 		)
 	})
 
+	t.Run("blob_storage_streamed", func(t *testing.T) {
+		template := `
+output:
+  azure_blob_storage:
+    blob_type: BLOCK
+    container: $VAR1-$ID
+    max_in_flight: 1
+    path: $VAR2/${!count("$ID")}.txt
+    public_access_level: PRIVATE
+    storage_connection_string: $VAR3
+
+input:
+  azure_blob_storage:
+    container: $VAR1-$ID
+    prefix: $VAR2
+    storage_connection_string: $VAR3
+    targets_input:
+      azure_blob_storage:
+        container: $VAR1-$ID
+        prefix: $VAR2
+        storage_connection_string: $VAR3
+        processors:
+          - mapping: 'root.name = @blob_storage_key'
+`
+		integration.StreamTests(
+			integration.StreamTestOpenCloseIsolated(),
+			integration.StreamTestStreamIsolated(10),
+		).Run(
+			t, template,
+			integration.StreamTestOptVarOne(dummyContainer),
+			integration.StreamTestOptVarTwo(dummyPrefix),
+			integration.StreamTestOptVarThree(connString),
+		)
+	})
+
 	t.Run("blob_storage_append", func(t *testing.T) {
 		template := `
 output:
```

website/docs/components/inputs/azure_blob_storage.md (+35, −1)

````diff
@@ -43,6 +43,7 @@ input:
     prefix: ""
     scanner:
       to_the_end: {}
+    targets_input: null # No default (optional)
 ```
 
 </TabItem>
@@ -62,6 +63,7 @@ input:
     scanner:
       to_the_end: {}
     delete_objects: false
+    targets_input: null # No default (optional)
 ```
 
 </TabItem>
@@ -80,7 +82,11 @@ If the `storage_connection_string` does not contain the `AccountName` parameter,
 
 ## Downloading Large Files
 
-When downloading large files it's often necessary to process it in streamed parts in order to avoid loading the entire file in memory at a given time. In order to do this a [`codec`](#codec) can be specified that determines how to break the input into smaller individual messages.
+When downloading large files it's often necessary to process it in streamed parts in order to avoid loading the entire file in memory at a given time. In order to do this a [`scanner`](#scanner) can be specified that determines how to break the input into smaller individual messages.
+
+## Streaming New Files
+
+By default this input will consume all files found within the target container and will then gracefully terminate. This is referred to as a "batch" mode of operation. However, it's possible to instead configure a container as [an Event Grid source](https://learn.microsoft.com/en-gb/azure/event-grid/event-schema-blob-storage) and then use this as a [`targets_input`](#targetsinput), in which case new files are consumed as they're uploaded and Benthos will continue listening for and downloading files as they arrive. This is referred to as a "streamed" mode of operation.
 
 ## Metadata
 
@@ -164,4 +170,32 @@ Whether to delete downloaded objects from the blob once they are processed.
 Type: `bool`
 Default: `false`
 
+### `targets_input`
+
+EXPERIMENTAL: An optional source of download targets, configured as a [regular Benthos input](/docs/components/inputs/about). Each message yielded by this input should be a single structured object containing a field `name`, which represents the blob to be downloaded.
+
+
+Type: `input`
+Requires version 4.27.0 or newer
+
+```yml
+# Examples
+
+targets_input:
+  mqtt:
+    topics:
+      - some-topic
+    urls:
+      - example.westeurope-1.ts.eventgrid.azure.net:8883
+  processors:
+    - unarchive:
+        format: json_array
+    - mapping: |-
+        if this.eventType == "Microsoft.Storage.BlobCreated" {
+          root.name = this.data.url.parse_url().path.trim_prefix("/foocontainer/")
+        } else {
+          root = deleted()
+        }
+```
+
````
