Skip to content

Commit 292c504

Browse files
ako and claude committed
perf(catalog): O(1) unit name lookups + source-build progress
`refresh catalog source` resolved each document by re-reading and re-bson.Unmarshalling every unit on every describe call (GetRawUnitByName / GetRawMicroflowByName scanned the full unit list per call). With one describe per document, that's O(N²) — ~6 hours on a large app (#651).

Fix: the reader builds a one-time index keyed by "$Type\x00QualifiedName" → unit, decoding only the Name field (a small struct, far cheaper than map[string]any). Lookups become O(1); the catalog's parallel describe shares one backend, so the index is built once and reused. The index is invalidated by InvalidateCache after writes.

Also: buildSource Phase 2 reports incremental progress every 2s (it was silent for the whole multi-minute/hour build — the #651 "no progress" complaint).

Verified: GraphViewer source build (993 microflows) ~3.5min with live progress; the O(N²) behavior (993² unmarshals ≈ 80min for microflows alone) is gone.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 1a78fbe commit 292c504

4 files changed

Lines changed: 113 additions & 64 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1818

1919
### Fixed
2020

21+
- **`refresh catalog source` no longer O(N²) on large projects** — it resolved each document by re-reading and re-`bson.Unmarshal`ing *every* unit on *every* describe call, so a big app (#651: ~3.3k microflows, ~33k activities) took ~6 hours. The reader now builds a one-time `$Type + qualified-name → unit` index (decoding only the `Name` field, not the whole document), making `GetRawUnitByName` / `GetRawMicroflowByName` O(1); the shared backend means the index is built once across the parallel describe workers. The source phase also reports incremental progress every 2s instead of going silent for the whole build. GraphViewer's source build (993 microflows) dropped to ~3.5 min with live progress; cloud-portal-scale projects go from hours to minutes
2122
- **Marketplace search now scans the whole catalog** — the Content API has no server-side search and caps `limit` at 100 per page, so `marketplace search` previously only filtered the first 100 items and silently missed matches further in (e.g. External Database Connector `219862`, Mendix Business Events `202649`). It now paginates via `offset`, fetching pages **concurrently** (first page alone so a common early match stays a single request; then bounded-parallel batches), and stops at `--limit` matches or end-of-catalog. Measured ~3m45s → ~44s on a slow link for a deep match; combined with the new cache, repeat searches are instant
2223

2324
## [0.12.0] - 2026-06-04

mdl/catalog/builder_source.go

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,11 @@
33
package catalog
44

55
import (
6+
"fmt"
67
"runtime"
78
"sync"
9+
"sync/atomic"
10+
"time"
811
)
912

1013
// sourceItem represents a single element to generate MDL source for.
@@ -110,12 +113,31 @@ func (b *Builder) buildSource() error {
110113
return nil
111114
}
112115

113-
// Phase 2: Generate MDL source in parallel
116+
// Phase 2: Generate MDL source in parallel. This is the slow phase on large
117+
// projects (one describe per document), so report progress periodically —
118+
// from a single ticker goroutine, the only caller of report() during the
119+
// run, so the progress sink isn't written concurrently.
114120
numWorkers := max(min(runtime.NumCPU(), 8), 1)
115121

116122
results := make([]sourceResult, len(items))
117123
work := make(chan int, len(items))
118124

125+
var done atomic.Int64
126+
total := len(items)
127+
stop := make(chan struct{})
128+
go func() {
129+
ticker := time.NewTicker(2 * time.Second)
130+
defer ticker.Stop()
131+
for {
132+
select {
133+
case <-stop:
134+
return
135+
case <-ticker.C:
136+
b.report(fmt.Sprintf("source %d/%d documents", done.Load(), total), int(done.Load()))
137+
}
138+
}
139+
}()
140+
119141
var wg sync.WaitGroup
120142
for range numWorkers {
121143
wg.Go(func() {
@@ -125,6 +147,7 @@ func (b *Builder) buildSource() error {
125147
if err == nil && text != "" {
126148
results[idx] = sourceResult{item, text}
127149
}
150+
done.Add(1)
128151
}
129152
})
130153
}
@@ -134,6 +157,7 @@ func (b *Builder) buildSource() error {
134157
}
135158
close(work)
136159
wg.Wait()
160+
close(stop)
137161

138162
// Phase 3: Insert results into FTS5 table (serial — SQLite constraint)
139163
stmt, err := b.tx.Prepare(`

sdk/mpr/reader.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ type Reader struct {
3939
// Cache for unit metadata to avoid repeated file reads
4040
unitCache []cachedUnit
4141
unitCacheValid bool
42+
43+
// Lazily-built index of (unit $Type + qualified name) → unit, so name
44+
// lookups are O(1) instead of re-scanning and re-parsing every unit per
45+
// call (the source-catalog build does thousands of such lookups).
46+
nameIndex map[string]nameIndexEntry
47+
nameIndexBuilt bool
4248
}
4349

4450
// cachedUnit stores metadata about a unit for fast filtering.

sdk/mpr/reader_units.go

Lines changed: 81 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ func (r *Reader) buildUnitCache() error {
181181
// Should be called after any write operation.
182182
func (r *Reader) InvalidateCache() {
183183
r.unitCacheValid = false
184+
r.nameIndex = nil
185+
r.nameIndexBuilt = false
184186
}
185187

186188
// readMprContents reads content from the mprcontents folder for v2 format.
@@ -220,40 +222,90 @@ func getTypeFromContents(contents []byte) string {
220222
return ""
221223
}
222224

223-
// GetRawMicroflowByName returns the raw BSON contents for a microflow by qualified name.
224-
// Used for debugging to compare serialized data.
225-
func (r *Reader) GetRawMicroflowByName(qualifiedName string) ([]byte, error) {
226-
units, err := r.listUnitsByType("Microflows$Microflow")
225+
// nameIndexEntry records where a named unit lives.
226+
type nameIndexEntry struct {
227+
id string
228+
moduleName string
229+
}
230+
231+
// nameHeader decodes only the unit's Name field. Decoding into this small
232+
// struct is much cheaper than unmarshalling the whole document into
233+
// map[string]any (no map allocation, far less reflection).
234+
type nameHeader struct {
235+
Name string `bson:"Name"`
236+
}
237+
238+
// buildUnitNameIndex parses every unit's name once and indexes units by
239+
// "$Type\x00QualifiedName", so per-name lookups are O(1) instead of re-reading
240+
// and re-parsing every unit on each call. Idempotent; invalidated by
241+
// InvalidateCache after writes.
242+
func (r *Reader) buildUnitNameIndex() error {
243+
if r.nameIndexBuilt {
244+
return nil
245+
}
246+
units, err := r.listUnitsByType("")
227247
if err != nil {
228-
return nil, err
248+
return err
229249
}
230-
231-
// Build a map of container ID to module name
232250
modules, err := r.ListModules()
233251
if err != nil {
234-
return nil, err
252+
return err
235253
}
236-
moduleMap := make(map[string]string)
254+
moduleMap := make(map[string]string, len(modules))
237255
for _, m := range modules {
238256
moduleMap[string(m.ID)] = m.Name
239257
}
258+
containerParent, err := r.buildContainerParent()
259+
if err != nil {
260+
return err
261+
}
240262

263+
idx := make(map[string]nameIndexEntry, len(units))
241264
for _, u := range units {
242-
// Parse just enough to get the qualified name
243-
var raw map[string]any
244-
if err := bson.Unmarshal(u.Contents, &raw); err != nil {
265+
var h nameHeader
266+
if err := bson.Unmarshal(u.Contents, &h); err != nil || h.Name == "" {
245267
continue
246268
}
247-
248-
name, _ := raw["Name"].(string)
249-
// Get module name from container
250-
moduleName := moduleMap[u.ContainerID]
251-
if moduleName != "" && moduleName+"."+name == qualifiedName {
252-
return u.Contents, nil
269+
moduleName := resolveModuleName(u.ContainerID, moduleMap, containerParent)
270+
qn := h.Name
271+
if moduleName != "" {
272+
qn = moduleName + "." + h.Name
253273
}
274+
idx[u.Type+"\x00"+qn] = nameIndexEntry{id: u.ID, moduleName: moduleName}
254275
}
276+
r.nameIndex = idx
277+
r.nameIndexBuilt = true
278+
return nil
279+
}
255280

256-
return nil, fmt.Errorf("microflow not found: %s", qualifiedName)
281+
// lookupUnitByName resolves a (type, qualified name) to its unit via the name
282+
// index. Returns (nil, "", nil) when not found.
283+
func (r *Reader) lookupUnitByName(typePrefix, qualifiedName string) (*rawUnit, string, error) {
284+
if err := r.buildUnitNameIndex(); err != nil {
285+
return nil, "", err
286+
}
287+
e, ok := r.nameIndex[typePrefix+"\x00"+qualifiedName]
288+
if !ok {
289+
return nil, "", nil
290+
}
291+
u, err := r.getUnitByID(e.id)
292+
if err != nil {
293+
return nil, "", err
294+
}
295+
return u, e.moduleName, nil
296+
}
297+
298+
// GetRawMicroflowByName returns the raw BSON contents for a microflow by qualified name.
299+
// Used for debugging to compare serialized data.
300+
func (r *Reader) GetRawMicroflowByName(qualifiedName string) ([]byte, error) {
301+
u, _, err := r.lookupUnitByName("Microflows$Microflow", qualifiedName)
302+
if err != nil {
303+
return nil, err
304+
}
305+
if u == nil {
306+
return nil, fmt.Errorf("microflow not found: %s", qualifiedName)
307+
}
308+
return u.Contents, nil
257309
}
258310

259311
// RawUnitInfo contains information about a raw unit for BSON debugging.
@@ -309,54 +361,20 @@ func (r *Reader) GetRawUnitByName(objectType, qualifiedName string) (*RawUnitInf
309361
return r.getRawAssociationByName(qualifiedName)
310362
}
311363

312-
units, err := r.listUnitsByType(typePrefix)
313-
if err != nil {
314-
return nil, err
315-
}
316-
317-
// Build module name map and container hierarchy for MPR v2 folder support.
318-
modules, err := r.ListModules()
364+
u, moduleName, err := r.lookupUnitByName(typePrefix, qualifiedName)
319365
if err != nil {
320366
return nil, err
321367
}
322-
moduleMap := make(map[string]string)
323-
for _, m := range modules {
324-
moduleMap[string(m.ID)] = m.Name
368+
if u == nil {
369+
return nil, fmt.Errorf("%s not found: %s", objectType, qualifiedName)
325370
}
326-
containerParent, err := r.buildContainerParent()
327-
if err != nil {
328-
return nil, err
329-
}
330-
331-
for _, u := range units {
332-
var raw map[string]any
333-
if err := bson.Unmarshal(u.Contents, &raw); err != nil {
334-
continue
335-
}
336-
337-
name, _ := raw["Name"].(string)
338-
moduleName := resolveModuleName(u.ContainerID, moduleMap, containerParent)
339-
340-
// Build full name, handling missing module
341-
var fullName string
342-
if moduleName != "" {
343-
fullName = moduleName + "." + name
344-
} else {
345-
fullName = name
346-
}
347-
348-
if fullName == qualifiedName {
349-
return &RawUnitInfo{
350-
ID: u.ID,
351-
QualifiedName: fullName,
352-
Type: u.Type,
353-
ModuleName: moduleName,
354-
Contents: u.Contents,
355-
}, nil
356-
}
357-
}
358-
359-
return nil, fmt.Errorf("%s not found: %s", objectType, qualifiedName)
371+
return &RawUnitInfo{
372+
ID: u.ID,
373+
QualifiedName: qualifiedName,
374+
Type: u.Type,
375+
ModuleName: moduleName,
376+
Contents: u.Contents,
377+
}, nil
360378
}
361379

362380
// getRawEntityByName finds an entity within domain models.

0 commit comments

Comments
 (0)