Skip to content

Commit 0574799

Browse files
Merge pull request #115 from Financial-Times/fix/UPPSF-6457
Refactor bodyXML → bodyTree Go transformer to use internalComponents format only
2 parents b71779d + 8607b0d commit 0574799

24 files changed

+379
-1556
lines changed

libraries/from-bodyxml/go/README.md

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
# XML to Content Tree Transformer
22

33
## Overview
4-
The Transformer converts external XHTML-formatted document into content tree.
5-
It supports format stored in the **internalComponent** collection as well as the one returned by the **Internal Content API**.
6-
The latter is produced by the content-public-read service after applying certain transformations to the bodyXML it retrieves from the internalComponents collection.
7-
These transformations include renaming the content, related, and concept tags to ft-content, ft-related, and ft-concept, respectively, and replacing the id attribute with url, with a few caveats.
4+
The Transformer converts an external XHTML-formatted document into a content tree. It supports the bodyXML format used in the main content store within the Content & Metadata platform — specifically, in the **internalComponent** collection.
5+
86

97
## Usage
108

@@ -28,4 +26,9 @@ func main() {
2826

2927
fmt.Printf("Transformed content tree: %+v\n", out)
3028
}
31-
```
29+
```
30+
31+
## Known Limitations and Behavior
32+
The current implementation of the transformer has the following limitations:
33+
- If the transformer encounters an HTML tag that does not have a corresponding definition in the content tree, that tag is skipped together with everything nested inside it, and no error is returned.
34+
- If an HTML element contains child elements that are not allowed under it in the content tree, those disallowed children are silently ignored; no error is returned.

libraries/from-bodyxml/go/helpers.go

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -75,30 +75,3 @@ func valueOr(v, fallback string) string {
7575
func attr(el *etree.Element, name string) string {
7676
return el.SelectAttrValue(name, "")
7777
}
78-
79-
var contentTypeTemplates = map[string]string{
80-
"http://www.ft.com/ontology/content/Article": "/content/{{id}}",
81-
"http://www.ft.com/ontology/content/ImageSet": "/content/{{id}}",
82-
"http://www.ft.com/ontology/content/ClipSet": "/content/{{id}}",
83-
"http://www.ft.com/ontology/content/CustomCodeComponent": "/content/{{id}}",
84-
"http://www.ft.com/ontology/content/MediaResource": "/content/{{id}}",
85-
"http://www.ft.com/ontology/content/Video": "/content/{{id}}",
86-
"http://www.ft.com/ontology/company/PublicCompany": "/organisations/{{id}}",
87-
"http://www.ft.com/ontology/content/ContentPackage": "/content/{{id}}",
88-
"http://www.ft.com/ontology/content/Content": "/content/{{id}}",
89-
"http://www.ft.com/ontology/content/Image": "/content/{{id}}",
90-
"http://www.ft.com/ontology/content/DynamicContent": "/content/{{id}}",
91-
"http://www.ft.com/ontology/content/Graphic": "/content/{{id}}",
92-
"http://www.ft.com/ontology/content/Audio": "/content/{{id}}",
93-
"http://www.ft.com/ontology/company/Organisation": "/organisations/{{id}}",
94-
}
95-
96-
func generateUrl(t, id string) string {
97-
const host = "http://api.ft.com"
98-
template, ok := contentTypeTemplates[t]
99-
if !ok {
100-
return ""
101-
}
102-
path := strings.Replace(template, "{{id}}", id, 1)
103-
return host + path
104-
}

libraries/from-bodyxml/go/html_transformers.go

Lines changed: 15 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ type transformer func(el *etree.Element) contenttree.Node
6565

6666
var defaultTransformers = map[string]transformer{
6767
"h1": func(h1 *etree.Element) contenttree.Node {
68-
dfrgId := valueOr(attr(h1, "data-fragment-identifier"), attr(h1, "id"))
68+
dfrgId := attr(h1, "data-fragment-identifier")
6969
heading := &contenttree.Heading{
7070
Type: contenttree.HeadingType,
7171
Level: "chapter",
@@ -75,7 +75,7 @@ var defaultTransformers = map[string]transformer{
7575
return heading
7676
},
7777
"h2": func(h2 *etree.Element) contenttree.Node {
78-
dfrgId := valueOr(attr(h2, "data-fragment-identifier"), attr(h2, "id"))
78+
dfrgId := attr(h2, "data-fragment-identifier")
7979
return &contenttree.Heading{
8080
Type: contenttree.HeadingType,
8181
Level: "subheading",
@@ -84,7 +84,7 @@ var defaultTransformers = map[string]transformer{
8484
}
8585
},
8686
"h3": func(h3 *etree.Element) contenttree.Node {
87-
dfrgId := valueOr(attr(h3, "data-fragment-identifier"), attr(h3, "id"))
87+
dfrgId := attr(h3, "data-fragment-identifier")
8888
return &contenttree.Heading{
8989
Type: contenttree.HeadingType,
9090
Level: "subheading",
@@ -93,7 +93,7 @@ var defaultTransformers = map[string]transformer{
9393
}
9494
},
9595
"h4": func(h4 *etree.Element) contenttree.Node {
96-
dfrgId := valueOr(attr(h4, "data-fragment-identifier"), attr(h4, "id"))
96+
dfrgId := attr(h4, "data-fragment-identifier")
9797
return &contenttree.Heading{
9898
Type: contenttree.HeadingType,
9999
Level: "label",
@@ -232,33 +232,27 @@ var defaultTransformers = map[string]transformer{
232232
Caption: attr(img, "longdesc"),
233233
}
234234
},
235-
236235
contentType.ImageSet: func(content *etree.Element) contenttree.Node {
237-
dfrgId := valueOr(attr(content, "data-fragment-identifier"), attr(content, "id"))
236+
dfrgId := attr(content, "data-fragment-identifier")
238237
return &contenttree.ImageSet{
239238
Type: contenttree.ImageSetType,
240-
ID: attr(content, "url"),
239+
ID: attr(content, "id"),
241240
FragmentIdentifier: dfrgId,
242241
}
243242
},
244243
contentType.Video: func(content *etree.Element) contenttree.Node {
245244
return &contenttree.Video{
246245
Type: contenttree.VideoType,
247-
ID: attr(content, "url"),
246+
ID: attr(content, "id"),
248247
}
249248
},
250249
contentType.Content: func(content *etree.Element) contenttree.Node {
251-
id := attr(content, "url")
252-
parts := strings.Split(id, "/")
253-
uuid := ""
254-
if len(parts) > 0 {
255-
uuid = parts[len(parts)-1]
256-
}
257-
dfrgId := valueOr(attr(content, "data-fragment-identifier"), attr(content, "id"))
250+
id := attr(content, "id")
258251
if attr(content, "data-asset-type") == "flourish" {
252+
dfrgId := valueOr(attr(content, "data-fragment-identifier"), id)
259253
return &contenttree.Flourish{
260254
Type: contenttree.FlourishType,
261-
Id: uuid,
255+
Id: id,
262256
FlourishType: attr(content, "data-flourish-type"),
263257
LayoutWidth: string(toValidLayoutWidth(attr(content, "data-layout-width"))),
264258
Description: attr(content, "alt"),
@@ -268,48 +262,30 @@ var defaultTransformers = map[string]transformer{
268262
}
269263
return &contenttree.Link{
270264
Type: contenttree.LinkType,
271-
URL: "https://www.ft.com/content/" + uuid,
265+
URL: "https://www.ft.com/content/" + id,
272266
Title: attr(content, "dataTitle"),
273267
Children: []*contenttree.Phrasing{},
274268
}
275269
},
276270
contentType.Article: func(content *etree.Element) contenttree.Node {
277-
id := attr(content, "url")
278-
parts := strings.Split(id, "/")
279-
uuid := ""
280-
if len(parts) > 0 {
281-
uuid = parts[len(parts)-1]
282-
}
283271
return &contenttree.Link{
284272
Type: contenttree.LinkType,
285-
URL: "https://www.ft.com/content/" + uuid,
273+
URL: "https://www.ft.com/content/" + attr(content, "id"),
286274
Title: attr(content, "dataTitle"),
287275
Children: []*contenttree.Phrasing{},
288276
}
289277
},
290278
contentType.CustomCodeComponent: func(content *etree.Element) contenttree.Node {
291-
id := attr(content, "url")
292-
parts := strings.Split(id, "/")
293-
uuid := ""
294-
if len(parts) > 0 {
295-
uuid = parts[len(parts)-1]
296-
}
297279
return &contenttree.CustomCodeComponent{
298280
Type: contenttree.CustomCodeComponentType,
299-
ID: uuid,
281+
ID: attr(content, "id"),
300282
LayoutWidth: string(toValidLayoutWidth(attr(content, "data-layout-width"))),
301283
}
302284
},
303285
contentType.ClipSet: func(content *etree.Element) contenttree.Node {
304-
id := attr(content, "url")
305-
parts := strings.Split(id, "/")
306-
uuid := ""
307-
if len(parts) > 0 {
308-
uuid = parts[len(parts)-1]
309-
}
310286
return &contenttree.ClipSet{
311287
Type: contenttree.ClipSetType,
312-
ID: uuid,
288+
ID: attr(content, "id"),
313289
LayoutWidth: string(toValidClipLayoutWidth(attr(content, "data-layout-width"))),
314290
Autoplay: attr(content, "autoplay") == "true",
315291
Loop: attr(content, "loop") == "true",
@@ -320,10 +296,7 @@ var defaultTransformers = map[string]transformer{
320296
id := ""
321297
teaser := ""
322298
if link := findChild(rl, "content"); link != nil {
323-
id = generateUrl(attr(link, "type"), attr(link, "id"))
324-
teaser = textContent(link)
325-
} else if link := findChild(rl, "ft-content"); link != nil {
326-
id = attr(link, "url")
299+
id = attr(link, "id")
327300
teaser = textContent(link)
328301
}
329302
heading := findChild(rl, "recommended-title")

libraries/from-bodyxml/go/transform.go

Lines changed: 9 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -55,19 +55,8 @@ func convertToContentTree(elem etree.Token, m contenttree.Node) error {
5555
return nil
5656
}
5757

58-
if t.Tag == "content" || t.Tag == "related" || t.Tag == "concept" {
59-
id := attr(t, "id")
60-
typeAttr := attr(t, "type")
61-
if id != "" {
62-
t.CreateAttr("url", generateUrl(typeAttr, id))
63-
if attr(t, "data-asset-type") != "flourish" {
64-
t.RemoveAttr("id")
65-
}
66-
}
67-
}
68-
6958
tag := t.Tag
70-
if t.Tag == "content" || t.Tag == "ft-content" {
59+
if t.Tag == "content" {
7160
for _, attr := range t.Attr {
7261
if attr.Key == "type" {
7362
tag = attr.Value
@@ -78,13 +67,14 @@ func convertToContentTree(elem etree.Token, m contenttree.Node) error {
7867

7968
transformer, ok := defaultTransformers[tag]
8069
if !ok {
81-
return fmt.Errorf("unknownNode transformer for tag <%s>", t.Tag)
70+
//skip unknown tags
71+
return nil
8272
}
83-
8473
switch transformed := transformer(t).(type) {
8574
case *unknownNode:
8675
{
87-
return fmt.Errorf("unknownNode div node with class '%s'", transformed.Class)
76+
//skip unknown div
77+
return nil
8878
}
8979
case *liftChildrenNode:
9080
{
@@ -100,12 +90,8 @@ func convertToContentTree(elem etree.Token, m contenttree.Node) error {
10090
{
10191
err := m.AppendChild(transformed)
10292
if err != nil {
103-
return fmt.Errorf(
104-
"failed to append transformed child of type <%s> for parent <%s>: %w",
105-
transformed.GetType(),
106-
m.GetType(),
107-
err,
108-
)
93+
//skip invalid child nodes
94+
return nil
10995
}
11096
if transformed.GetChildren() != nil {
11197
for _, child := range t.Child {
@@ -127,7 +113,8 @@ func convertToContentTree(elem etree.Token, m contenttree.Node) error {
127113
}
128114
err := m.AppendChild(tx)
129115
if err != nil {
130-
return err
116+
//skip invalid nodes
117+
return nil
131118
}
132119
}
133120
return nil

libraries/from-bodyxml/go/transform_test.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ func TestTransform(t *testing.T) {
1515
for _, test := range getTestCases(t) {
1616
t.Run(test.name, func(t *testing.T) {
1717
bodyTree, err := Transform(test.input)
18-
1918
if err != nil && !test.wantErr {
2019
t.Errorf("Failed with unexpected error: %v", err)
2120
}

0 commit comments

Comments
 (0)