@@ -18,10 +18,14 @@ export class HtmlPage {
1818 * Example: `/en/getting-started/`
1919 */
2020 readonly pathname : string ;
21-
22- readonly dom : Document ;
23-
24- readonly anchors : Element [ ] ;
21+ /**
22+ * A list of all anchor elements on the page.
23+ */
24+ readonly anchors : Array < {
25+ label : string ;
26+ name : string ;
27+ href : string ;
28+ } > ;
2529 /**
2630 * A list of unique link hrefs on the page.
2731 */
@@ -46,12 +50,11 @@ export class HtmlPage {
4650 */
4751 readonly isRedirect : boolean ;
4852 /**
49- * The element containing the page's main content.
53+ * Indicates whether the page contains content.
5054 *
51- * Prefers the first `<article>` element, with a fallback to `<body>` if no article was found,
52- * and finally `null` if the page even doesn't have a body.
55+ * Set to `false` if the page doesn't even have a `<body>` element (a partial or invalid page).
5356 */
54- readonly mainContent : Element | null ;
57+ readonly hasContent : boolean ;
5558 /**
5659 * The language of the page's main content.
5760 *
@@ -71,25 +74,34 @@ export class HtmlPage {
7174
7275 constructor ( { html, href, pathname } : { html : string ; href : string ; pathname : string } ) {
7376 // Attempt to read the HTML file and parse its DOM
74- this . dom = parseDocument ( html ) ;
77+ const parser = new DocumentParser ( parseDocument ( html ) ) ;
7578 this . href = href ;
7679 this . pathname = pathname ;
7780
7881 // Provide commonly used data as properties
79- this . anchors = DomUtils . getElementsByTagName ( 'a' , this . dom , true ) ;
82+ this . anchors = DomUtils . getElementsByTagName ( 'a' , parser . dom , true ) . map ( ( el ) => ( {
83+ // Pass the strings through Buffer to allow Node to reallocate them into independent memory
84+ // instead of using slices of the original large string containing the full HTML document.
85+ //
86+ // This reduces memory usage significantly, at time of writing, 2.1GiB -> 300MiB.
87+ label : Buffer . from ( DomUtils . innerText ( el ) ) . toString ( ) ,
88+ name : el . attribs . name && Buffer . from ( el . attribs . name ) . toString ( ) ,
89+ href : el . attribs . href && Buffer . from ( el . attribs . href ) . toString ( ) ,
90+ } ) ) ;
8091
8192 // Build a list of unique link hrefs on the page
82- this . uniqueLinkHrefs = [ ...new Set ( this . anchors . map ( ( el ) => decodeURI ( el . attribs . href ) ) ) ] ;
93+ this . uniqueLinkHrefs = [ ...new Set ( this . anchors . map ( ( el ) => decodeURI ( el . href ) ) ) ] ;
8394
8495 // Build a list of hashes that can be used as URL fragments to jump to parts of the page
85- const anchorNames = this . anchors
86- . map ( ( el ) => el . attribs . name )
87- . filter ( ( name ) => name !== undefined ) ;
88- const ids = this . findAll ( ( el ) => Boolean ( el . attribs . id ) ) . map ( ( el ) => el . attribs . id ) ;
96+ const anchorNames = this . anchors . map ( ( el ) => el . name ) . filter ( ( name ) => name !== undefined ) ;
97+ const ids = parser
98+ . findAll ( ( el ) => Boolean ( el . attribs . id ) )
99+ // Same reason as above.
100+ . map ( ( el ) => Buffer . from ( el . attribs . id ) . toString ( ) ) ;
89101 this . hashes = [ ...anchorNames , ...ids ] . map ( ( name ) => `#${ name } ` ) ;
90102
91103 // Check if the page redirects somewhere else using meta refresh
92- const metaRefreshElement = this . findFirst (
104+ const metaRefreshElement = parser . findFirst (
93105 ( el ) =>
94106 el . tagName . toLowerCase ( ) === 'meta' && el . attribs [ 'http-equiv' ] ?. toLowerCase ( ) === 'refresh'
95107 ) ;
@@ -99,22 +111,24 @@ export class HtmlPage {
99111 this . isRedirect = Boolean ( this . redirectTargetUrl ) ;
100112
101113 // Get the page's canonical URL (if any)
102- const linkCanonicalElement = this . findFirst (
114+ const linkCanonicalElement = parser . findFirst (
103115 ( el ) =>
104116 el . tagName . toLowerCase ( ) === 'link' && el . attribs [ 'rel' ] ?. toLowerCase ( ) === 'canonical'
105117 ) ;
106118 this . canonicalUrl =
107119 ( linkCanonicalElement && new URL ( linkCanonicalElement . attribs [ 'href' ] ) ) || null ;
108120
109121 // Attempt to find the page's main content element
110- this . mainContent =
111- this . findFirst ( ( el ) => el . tagName . toLowerCase ( ) === 'main' ) ||
112- this . findFirst ( ( el ) => el . tagName . toLowerCase ( ) === 'body' ) ;
122+ const mainContent =
123+ parser . findFirst ( ( el ) => el . tagName . toLowerCase ( ) === 'main' ) ||
124+ parser . findFirst ( ( el ) => el . tagName . toLowerCase ( ) === 'body' ) ;
125+
126+ this . hasContent = Boolean ( mainContent ) ;
113127
114128 // Attempt to determine the main content language by traversing the tree upwards
115129 // until we find an element with a `lang` attribute
116130 const mainContentParentWithLang =
117- this . mainContent && this . findParent ( this . mainContent , ( el ) => Boolean ( el . attribs ?. lang ) ) ;
131+ mainContent && parser . findParent ( mainContent , ( el ) => Boolean ( el . attribs ?. lang ) ) ;
118132 this . mainContentLang = mainContentParentWithLang ?. attribs . lang || null ;
119133
120134 // Attempt to determine the page's pathname-based language
@@ -125,23 +139,6 @@ export class HtmlPage {
125139 Boolean ( this . pathnameLang ) && this . pathnameLang !== 'en' && this . mainContentLang === 'en' ;
126140 }
127141
128- findFirst ( test : ( elem : Element ) => boolean ) {
129- return DomUtils . findOne ( test , this . dom . children ) ;
130- }
131-
132- findAll ( test : ( elem : Element ) => boolean ) {
133- return DomUtils . findAll ( test , this . dom . children ) ;
134- }
135-
136- findParent ( start : Element , test : ( elem : Element ) => boolean ) {
137- let el : Element | null = start ;
138- while ( el ) {
139- if ( test ( el ) ) return el ;
140- el = DomUtils . getParent ( el ) as Element ;
141- }
142- return null ;
143- }
144-
145142 /**
146143 * Determines the URL pathname that should be used to link to this page
147144 * from a page with the given source language.
@@ -163,3 +160,24 @@ export class HtmlPage {
163160 if ( firstPathPart . match ( / ^ [ a - z ] { 2 } ( - [ a - z A - Z ] { 2 } ) ? $ / ) ) return firstPathPart ;
164161 }
165162}
163+
/**
 * Small query helper around a parsed HTML document.
 *
 * Wraps the `DomUtils` lookups used while analysing a page so that callers
 * only need to hold on to this helper (and its `dom`) for as long as they
 * are querying the tree.
 */
class DocumentParser {
  /**
   * @param dom The parsed document whose top-level `children` are searched.
   */
  constructor(public readonly dom: Document) {}

  /**
   * Looks up a single element accepted by `test`, delegating to
   * `DomUtils.findOne` over the document's children.
   *
   * @param test Predicate deciding whether an element is the one wanted.
   * @returns The element reported by `DomUtils.findOne`, or `null` if none matched.
   */
  findFirst(test: (elem: Element) => boolean): Element | null {
    return DomUtils.findOne(test, this.dom.children);
  }

  /**
   * Collects the elements accepted by `test`, delegating to
   * `DomUtils.findAll` over the document's children.
   *
   * @param test Predicate deciding whether an element should be included.
   * @returns The elements reported by `DomUtils.findAll` (possibly empty).
   */
  findAll(test: (elem: Element) => boolean): Element[] {
    return DomUtils.findAll(test, this.dom.children);
  }

  /**
   * Walks upwards from `start` (inclusive) and returns the first element
   * accepted by `test`.
   *
   * @param start Element at which the upward search begins; it is tested first.
   * @param test Predicate applied to `start` and then to each element ancestor.
   * @returns The first matching element, or `null` if no element on the way up matched.
   */
  findParent(start: Element, test: (elem: Element) => boolean): Element | null {
    let el: Element | null = start;
    while (el) {
      if (test(el)) return el;
      const parent = DomUtils.getParent(el);
      // The parent chain ends in a non-element node (the document root), which has
      // no `attribs`/`tagName`. Stop there instead of casting it to `Element`, so
      // `test` is only ever invoked with real elements.
      el = parent && DomUtils.isTag(parent) ? parent : null;
    }
    return null;
  }
}
0 commit comments