Skip to content

Commit

Permalink
Allow parsing HTML from []byte, passing in encoding
Browse files Browse the repository at this point in the history
When parsing HTML that is not UTF-8 encoded, the content must be passed to
libxml2 as raw bytes so that Go doesn't interpret it as a UTF-8 string.
Also allow callers to pass an encoding to override the default.
  • Loading branch information
cptaffe committed Aug 3, 2023
1 parent da2d243 commit 5ff7cdf
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 20 deletions.
8 changes: 2 additions & 6 deletions clib/clib.go
Original file line number Diff line number Diff line change
Expand Up @@ -537,22 +537,18 @@ func XMLFreeParserCtxt(ctx PtrSource) error {
return nil
}

func HTMLReadDoc(content, url, encoding string, opts int) (uintptr, error) {
// TODO: use htmlCtxReadDoc later, so we can get the error
ccontent := C.CString(content)
func HTMLReadDoc(content []byte, url, encoding string, opts int) (uintptr, error) {
curl := C.CString(url)
cencoding := C.CString(encoding)
defer C.free(unsafe.Pointer(ccontent))
defer C.free(unsafe.Pointer(curl))
defer C.free(unsafe.Pointer(cencoding))

doc := C.htmlReadDoc(
(*C.xmlChar)(unsafe.Pointer(ccontent)),
(*C.xmlChar)(unsafe.Pointer(&content[0])),
curl,
cencoding,
C.int(opts),
)

if doc == nil {
return 0, errors.New("failed to parse document")
}
Expand Down
16 changes: 7 additions & 9 deletions html.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,20 @@ import (
"github.com/pkg/errors"
)

// ParseHTML parses an HTML document. You can omit the options
// argument, or you can provide one bitwise-or'ed option
func ParseHTML(content []byte, options ...parser.HTMLOption) (types.Document, error) {
return ParseHTMLString(string(content), options...)
func ParseHTMLString(content, url string, options ...parser.HTMLOption) (types.Document, error) {
return ParseHTML([]byte(content), url, "UTF-8", options...)
}

// ParseHTMLString parses an HTML document. You can omit the options
// ParseHTML parses an HTML document. You can omit the options
// argument, or you can provide one bitwise-or'ed option
func ParseHTMLString(content string, options ...parser.HTMLOption) (types.Document, error) {
func ParseHTML(content []byte, url, encoding string, options ...parser.HTMLOption) (types.Document, error) {
var option parser.HTMLOption
if len(options) > 0 {
option = options[0]
} else {
option = parser.DefaultHTMLOptions
}
docptr, err := clib.HTMLReadDoc(content, "", "", int(option))
docptr, err := clib.HTMLReadDoc(content, url, encoding, int(option))
if err != nil {
return nil, errors.Wrap(err, "failed to read document")
}
Expand All @@ -39,11 +37,11 @@ func ParseHTMLString(content string, options ...parser.HTMLOption) (types.Docume

// ParseHTMLReader reads all of in and parses it as an HTML document. The
// options argument may be omitted; if given, pass a single bitwise-OR'ed
// option value (only the first value is used).
func ParseHTMLReader(in io.Reader, options ...parser.HTMLOption) (types.Document, error) {
func ParseHTMLReader(in io.Reader, url, encoding string, options ...parser.HTMLOption) (types.Document, error) {
buf := &bytes.Buffer{}
if _, err := buf.ReadFrom(in); err != nil {
return nil, errors.Wrap(err, "failed to rea from io.Reader")
}

return ParseHTMLString(buf.String(), options...)
return ParseHTML(buf.Bytes(), url, encoding, options...)
}
2 changes: 1 addition & 1 deletion html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
)

func TestParseHTML(t *testing.T) {
doc, err := libxml2.ParseHTMLString(`<html><body><h1>Hello, World!</h1><p>Lorem Ipsum</p></body></html>`)
doc, err := libxml2.ParseHTMLString(`<html><body><h1>Hello, World!</h1><p>Lorem Ipsum</p></body></html>`, "")
if err != nil {
t.Errorf("Failed to parse: %s", err)
return
Expand Down
5 changes: 3 additions & 2 deletions libxml2_example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,13 @@ func ExampleXML() {
}

func ExampleHTML() {
res, err := http.Get("http://golang.org")
url := "http://golang.org"
res, err := http.Get(url)
if err != nil {
panic("failed to get golang.org: " + err.Error())
}

doc, err := libxml2.ParseHTMLReader(res.Body)
doc, err := libxml2.ParseHTMLReader(res.Body, url, "")
if err != nil {
panic("failed to parse HTML: " + err.Error())
}
Expand Down
2 changes: 1 addition & 1 deletion parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ func TestCommentWrapNodeIssue(t *testing.T) {
// should wrap comment node
const testHTML = "<p><!-- test --></p><!-- test --><p><!-- test --></p>"

doc, err := ParseHTMLString(testHTML, parser.HTMLParseRecover)
doc, err := ParseHTMLString(testHTML, "", parser.HTMLParseRecover)
if err != nil {
t.Fatalf("Got error when parsing HTML: %v", err)
}
Expand Down
2 changes: 1 addition & 1 deletion xml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func TestRegressionGH7(t *testing.T) {
1234
</div>
</body>
</html>`)
</html>`, "")

if !assert.NoError(t, err, "ParseHTMLString should succeed") {
return
Expand Down

0 comments on commit 5ff7cdf

Please sign in to comment.