Allow parsing HTML from []byte, passing in encoding

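When parsing HTML that is not UTF-8 encoded, the content must be passed to libxml2 as raw bytes rather than as a Go string, so that it is not treated as UTF-8 text along the way. ParseHTML and ParseHTMLReader now also take a URL and an encoding argument, which are forwarded to libxml2 so that callers can override the default encoding; ParseHTMLString takes a URL and always passes "UTF-8".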
cptaffe · Aug 3, 2023 · 5ff7cdf · 5ff7cdf
1 parent da2d243
commit 5ff7cdf
Show file tree

Hide file tree

Showing 6 changed files with 15 additions and 20 deletions.
diff --git a/clib/clib.go b/clib/clib.go
@@ -537,22 +537,18 @@ func XMLFreeParserCtxt(ctx PtrSource) error {
 	return nil
 }
 
-func HTMLReadDoc(content, url, encoding string, opts int) (uintptr, error) {
-	// TODO: use htmlCtxReadDoc later, so we can get the error
-	ccontent := C.CString(content)
+func HTMLReadDoc(content []byte, url, encoding string, opts int) (uintptr, error) {
 	curl := C.CString(url)
 	cencoding := C.CString(encoding)
-	defer C.free(unsafe.Pointer(ccontent))
 	defer C.free(unsafe.Pointer(curl))
 	defer C.free(unsafe.Pointer(cencoding))
 
 	doc := C.htmlReadDoc(
-		(*C.xmlChar)(unsafe.Pointer(ccontent)),
+		(*C.xmlChar)(unsafe.Pointer(&content[0])),
 		curl,
 		cencoding,
 		C.int(opts),
 	)
-
 	if doc == nil {
 		return 0, errors.New("failed to parse document")
 	}

diff --git a/html.go b/html.go
@@ -11,22 +11,20 @@ import (
 	"github.com/pkg/errors"
 )
 
-// ParseHTML parses an HTML document. You can omit the options
-// argument, or you can provide one bitwise-or'ed option
-func ParseHTML(content []byte, options ...parser.HTMLOption) (types.Document, error) {
-	return ParseHTMLString(string(content), options...)
+func ParseHTMLString(content, url string, options ...parser.HTMLOption) (types.Document, error) {
+	return ParseHTML([]byte(content), url, "UTF-8", options...)
 }
 
-// ParseHTMLString parses an HTML document. You can omit the options
+// ParseHTML parses an HTML document. You can omit the options
 // argument, or you can provide one bitwise-or'ed option
-func ParseHTMLString(content string, options ...parser.HTMLOption) (types.Document, error) {
+func ParseHTML(content []byte, url, encoding string, options ...parser.HTMLOption) (types.Document, error) {
 	var option parser.HTMLOption
 	if len(options) > 0 {
 		option = options[0]
 	} else {
 		option = parser.DefaultHTMLOptions
 	}
-	docptr, err := clib.HTMLReadDoc(content, "", "", int(option))
+	docptr, err := clib.HTMLReadDoc(content, url, encoding, int(option))
 	if err != nil {
 		return nil, errors.Wrap(err, "failed to read document")
 	}
@@ -39,11 +37,11 @@ func ParseHTMLString(content string, options ...parser.HTMLOption) (types.Docume
 
 // ParseHTMLReader parses an HTML document. You can omit the options
 // argument, or you can provide one bitwise-or'ed option
-func ParseHTMLReader(in io.Reader, options ...parser.HTMLOption) (types.Document, error) {
+func ParseHTMLReader(in io.Reader, url, encoding string, options ...parser.HTMLOption) (types.Document, error) {
 	buf := &bytes.Buffer{}
 	if _, err := buf.ReadFrom(in); err != nil {
 		return nil, errors.Wrap(err, "failed to rea from io.Reader")
 	}
 
-	return ParseHTMLString(buf.String(), options...)
+	return ParseHTML(buf.Bytes(), url, encoding, options...)
 }
diff --git a/html_test.go b/html_test.go
@@ -9,7 +9,7 @@ import (
 )
 
 func TestParseHTML(t *testing.T) {
-	doc, err := libxml2.ParseHTMLString(`<html><body><h1>Hello, World!</h1><p>Lorem Ipsum</p></body></html>`)
+	doc, err := libxml2.ParseHTMLString(`<html><body><h1>Hello, World!</h1><p>Lorem Ipsum</p></body></html>`, "")
 	if err != nil {
 		t.Errorf("Failed to parse: %s", err)
 		return

diff --git a/libxml2_example_test.go b/libxml2_example_test.go
@@ -49,12 +49,13 @@ func ExampleXML() {
 }
 
 func ExampleHTML() {
-	res, err := http.Get("http://golang.org")
+	url := "http://golang.org"
+	res, err := http.Get(url)
 	if err != nil {
 		panic("failed to get golang.org: " + err.Error())
 	}
 
-	doc, err := libxml2.ParseHTMLReader(res.Body)
+	doc, err := libxml2.ParseHTMLReader(res.Body, url, "")
 	if err != nil {
 		panic("failed to parse HTML: " + err.Error())
 	}

diff --git a/parser_test.go b/parser_test.go
@@ -287,7 +287,7 @@ func TestCommentWrapNodeIssue(t *testing.T) {
 	// should wrap comment node
 	const testHTML = "<p><!-- test --></p><!-- test --><p><!-- test --></p>"
 
-	doc, err := ParseHTMLString(testHTML, parser.HTMLParseRecover)
+	doc, err := ParseHTMLString(testHTML, "", parser.HTMLParseRecover)
 	if err != nil {
 		t.Fatalf("Got error when parsing HTML: %v", err)
 	}

diff --git a/xml_test.go b/xml_test.go
@@ -122,7 +122,7 @@ func TestRegressionGH7(t *testing.T) {
     1234
 </div>
 </body>
-</html>`)
+</html>`, "")
 
 	if !assert.NoError(t, err, "ParseHTMLString should succeed") {
 		return
-Original file line number
+Diff line change
@@ Expand Up / @@ -122,7 +122,7 @@ func TestRegressionGH7(t *testing.T) { @@
     </div>
     </body>
-    </html>`)
+    </html>`, "")
     	if !assert.NoError(t, err, "ParseHTMLString should succeed") {
     		return
@@ Expand Down @@