
This upgrades all of libgo other than the runtime package to the Go 1.4 release. In Go 1.4 much of the runtime was rewritten into Go. Merging that code will take more time and will not change the API, so I'm putting it off for now. There are a few runtime changes anyhow, to accommodate other packages that rely on minor modifications to the runtime support. The compiler changes slightly to add a one-bit flag to each type descriptor kind that is stored directly in an interface, which for gccgo is currently only pointer types. Another one-bit flag (gcprog) is reserved because it is used by the gc compiler, but gccgo does not currently use it. There is another error check in the compiler since I ran across it during testing. gotools/: * Makefile.am (go_cmd_go_files): Sort entries. Add generate.go. * Makefile.in: Rebuild. From-SVN: r219627
1945 lines
43 KiB
Go
1945 lines
43 KiB
Go
// Copyright 2009 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package xml implements a simple XML 1.0 parser that
|
|
// understands XML name spaces.
|
|
package xml
|
|
|
|
// References:
|
|
// Annotated XML spec: http://www.xml.com/axml/testaxml.htm
|
|
// XML name spaces: http://www.w3.org/TR/REC-xml-names/
|
|
|
|
// TODO(rsc):
|
|
// Test error handling.
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strconv"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// SyntaxError describes a syntax problem found in the XML input.
// Msg is the description and Line is the line number it was reported on.
type SyntaxError struct {
	Msg  string
	Line int
}

// Error formats the error as "XML syntax error on line N: Msg".
func (e *SyntaxError) Error() string {
	lineNo := strconv.Itoa(e.Line)
	return "XML syntax error on line " + lineNo + ": " + e.Msg
}
|
|
|
|
// A Name is an XML name: a local part (Local) qualified by a name space
// identifier (Space).
// In tokens returned by Decoder.Token, Space holds the canonical URL of the
// name space rather than the short prefix written in the document.
type Name struct {
	Space string
	Local string
}

// An Attr is a single attribute of an XML element, written Name=Value.
type Attr struct {
	Name  Name
	Value string
}
|
|
|
|
// A Token holds one of the token types produced by the decoder:
// StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
type Token interface{}
|
|
|
|
// A StartElement represents an XML start element.
|
|
type StartElement struct {
|
|
Name Name
|
|
Attr []Attr
|
|
}
|
|
|
|
func (e StartElement) Copy() StartElement {
|
|
attrs := make([]Attr, len(e.Attr))
|
|
copy(attrs, e.Attr)
|
|
e.Attr = attrs
|
|
return e
|
|
}
|
|
|
|
// End returns the corresponding XML end element.
|
|
func (e StartElement) End() EndElement {
|
|
return EndElement{e.Name}
|
|
}
|
|
|
|
// An EndElement represents an XML end element.
|
|
type EndElement struct {
|
|
Name Name
|
|
}
|
|
|
|
// CharData is XML character data (raw text) in which XML escape
// sequences have already been replaced by the characters they represent.
type CharData []byte

// makeCopy returns a newly allocated slice holding the same bytes as b.
// The result is never nil, even when b is nil or empty.
func makeCopy(b []byte) []byte {
	dup := make([]byte, len(b))
	copy(dup, b)
	return dup
}

// Copy returns a CharData backed by its own storage.
func (c CharData) Copy() CharData {
	return CharData(makeCopy(c))
}

// Comment is an XML comment of the form <!--comment-->.
// The bytes exclude the <!-- and --> markers.
type Comment []byte

// Copy returns a Comment backed by its own storage.
func (c Comment) Copy() Comment {
	return Comment(makeCopy(c))
}

// ProcInst is an XML processing instruction of the form <?target inst?>.
type ProcInst struct {
	Target string
	Inst   []byte
}

// Copy returns a ProcInst whose Inst bytes are backed by their own storage.
func (p ProcInst) Copy() ProcInst {
	return ProcInst{Target: p.Target, Inst: makeCopy(p.Inst)}
}

// Directive is an XML directive of the form <!text>.
// The bytes exclude the <! and > markers.
type Directive []byte

// Copy returns a Directive backed by its own storage.
func (d Directive) Copy() Directive {
	return Directive(makeCopy(d))
}
|
|
|
|
// CopyToken returns a copy of a Token.
|
|
func CopyToken(t Token) Token {
|
|
switch v := t.(type) {
|
|
case CharData:
|
|
return v.Copy()
|
|
case Comment:
|
|
return v.Copy()
|
|
case Directive:
|
|
return v.Copy()
|
|
case ProcInst:
|
|
return v.Copy()
|
|
case StartElement:
|
|
return v.Copy()
|
|
}
|
|
return t
|
|
}
|
|
|
|
// A Decoder represents an XML parser reading a particular input stream.
|
|
// The parser assumes that its input is encoded in UTF-8.
|
|
type Decoder struct {
|
|
// Strict defaults to true, enforcing the requirements
|
|
// of the XML specification.
|
|
// If set to false, the parser allows input containing common
|
|
// mistakes:
|
|
// * If an element is missing an end tag, the parser invents
|
|
// end tags as necessary to keep the return values from Token
|
|
// properly balanced.
|
|
// * In attribute values and character data, unknown or malformed
|
|
// character entities (sequences beginning with &) are left alone.
|
|
//
|
|
// Setting:
|
|
//
|
|
// d.Strict = false;
|
|
// d.AutoClose = HTMLAutoClose;
|
|
// d.Entity = HTMLEntity
|
|
//
|
|
// creates a parser that can handle typical HTML.
|
|
//
|
|
// Strict mode does not enforce the requirements of the XML name spaces TR.
|
|
// In particular it does not reject name space tags using undefined prefixes.
|
|
// Such tags are recorded with the unknown prefix as the name space URL.
|
|
Strict bool
|
|
|
|
// When Strict == false, AutoClose indicates a set of elements to
|
|
// consider closed immediately after they are opened, regardless
|
|
// of whether an end element is present.
|
|
AutoClose []string
|
|
|
|
// Entity can be used to map non-standard entity names to string replacements.
|
|
// The parser behaves as if these standard mappings are present in the map,
|
|
// regardless of the actual map content:
|
|
//
|
|
// "lt": "<",
|
|
// "gt": ">",
|
|
// "amp": "&",
|
|
// "apos": "'",
|
|
// "quot": `"`,
|
|
Entity map[string]string
|
|
|
|
// CharsetReader, if non-nil, defines a function to generate
|
|
// charset-conversion readers, converting from the provided
|
|
// non-UTF-8 charset into UTF-8. If CharsetReader is nil or
|
|
// returns an error, parsing stops with an error. One of the
|
|
// the CharsetReader's result values must be non-nil.
|
|
CharsetReader func(charset string, input io.Reader) (io.Reader, error)
|
|
|
|
// DefaultSpace sets the default name space used for unadorned tags,
|
|
// as if the entire XML stream were wrapped in an element containing
|
|
// the attribute xmlns="DefaultSpace".
|
|
DefaultSpace string
|
|
|
|
r io.ByteReader
|
|
buf bytes.Buffer
|
|
saved *bytes.Buffer
|
|
stk *stack
|
|
free *stack
|
|
needClose bool
|
|
toClose Name
|
|
nextToken Token
|
|
nextByte int
|
|
ns map[string]string
|
|
err error
|
|
line int
|
|
offset int64
|
|
unmarshalDepth int
|
|
}
|
|
|
|
// NewDecoder creates a new XML parser reading from r.
|
|
// If r does not implement io.ByteReader, NewDecoder will
|
|
// do its own buffering.
|
|
func NewDecoder(r io.Reader) *Decoder {
|
|
d := &Decoder{
|
|
ns: make(map[string]string),
|
|
nextByte: -1,
|
|
line: 1,
|
|
Strict: true,
|
|
}
|
|
d.switchToReader(r)
|
|
return d
|
|
}
|
|
|
|
// Token returns the next XML token in the input stream.
|
|
// At the end of the input stream, Token returns nil, io.EOF.
|
|
//
|
|
// Slices of bytes in the returned token data refer to the
|
|
// parser's internal buffer and remain valid only until the next
|
|
// call to Token. To acquire a copy of the bytes, call CopyToken
|
|
// or the token's Copy method.
|
|
//
|
|
// Token expands self-closing elements such as <br/>
|
|
// into separate start and end elements returned by successive calls.
|
|
//
|
|
// Token guarantees that the StartElement and EndElement
|
|
// tokens it returns are properly nested and matched:
|
|
// if Token encounters an unexpected end element,
|
|
// it will return an error.
|
|
//
|
|
// Token implements XML name spaces as described by
|
|
// http://www.w3.org/TR/REC-xml-names/. Each of the
|
|
// Name structures contained in the Token has the Space
|
|
// set to the URL identifying its name space when known.
|
|
// If Token encounters an unrecognized name space prefix,
|
|
// it uses the prefix as the Space rather than report an error.
|
|
func (d *Decoder) Token() (t Token, err error) {
|
|
if d.stk != nil && d.stk.kind == stkEOF {
|
|
err = io.EOF
|
|
return
|
|
}
|
|
if d.nextToken != nil {
|
|
t = d.nextToken
|
|
d.nextToken = nil
|
|
} else if t, err = d.rawToken(); err != nil {
|
|
return
|
|
}
|
|
|
|
if !d.Strict {
|
|
if t1, ok := d.autoClose(t); ok {
|
|
d.nextToken = t
|
|
t = t1
|
|
}
|
|
}
|
|
switch t1 := t.(type) {
|
|
case StartElement:
|
|
// In XML name spaces, the translations listed in the
|
|
// attributes apply to the element name and
|
|
// to the other attribute names, so process
|
|
// the translations first.
|
|
for _, a := range t1.Attr {
|
|
if a.Name.Space == "xmlns" {
|
|
v, ok := d.ns[a.Name.Local]
|
|
d.pushNs(a.Name.Local, v, ok)
|
|
d.ns[a.Name.Local] = a.Value
|
|
}
|
|
if a.Name.Space == "" && a.Name.Local == "xmlns" {
|
|
// Default space for untagged names
|
|
v, ok := d.ns[""]
|
|
d.pushNs("", v, ok)
|
|
d.ns[""] = a.Value
|
|
}
|
|
}
|
|
|
|
d.translate(&t1.Name, true)
|
|
for i := range t1.Attr {
|
|
d.translate(&t1.Attr[i].Name, false)
|
|
}
|
|
d.pushElement(t1.Name)
|
|
t = t1
|
|
|
|
case EndElement:
|
|
d.translate(&t1.Name, true)
|
|
if !d.popElement(&t1) {
|
|
return nil, d.err
|
|
}
|
|
t = t1
|
|
}
|
|
return
|
|
}
|
|
|
|
const xmlURL = "http://www.w3.org/XML/1998/namespace"
|
|
|
|
// Apply name space translation to name n.
|
|
// The default name space (for Space=="")
|
|
// applies only to element names, not to attribute names.
|
|
func (d *Decoder) translate(n *Name, isElementName bool) {
|
|
switch {
|
|
case n.Space == "xmlns":
|
|
return
|
|
case n.Space == "" && !isElementName:
|
|
return
|
|
case n.Space == "xml":
|
|
n.Space = xmlURL
|
|
case n.Space == "" && n.Local == "xmlns":
|
|
return
|
|
}
|
|
if v, ok := d.ns[n.Space]; ok {
|
|
n.Space = v
|
|
} else if n.Space == "" {
|
|
n.Space = d.DefaultSpace
|
|
}
|
|
}
|
|
|
|
func (d *Decoder) switchToReader(r io.Reader) {
|
|
// Get efficient byte at a time reader.
|
|
// Assume that if reader has its own
|
|
// ReadByte, it's efficient enough.
|
|
// Otherwise, use bufio.
|
|
if rb, ok := r.(io.ByteReader); ok {
|
|
d.r = rb
|
|
} else {
|
|
d.r = bufio.NewReader(r)
|
|
}
|
|
}
|
|
|
|
// Parsing state - stack holds old name space translations
|
|
// and the current set of open elements. The translations to pop when
|
|
// ending a given tag are *below* it on the stack, which is
|
|
// more work but forced on us by XML.
|
|
type stack struct {
|
|
next *stack
|
|
kind int
|
|
name Name
|
|
ok bool
|
|
}
|
|
|
|
// Kinds of stack entry.
const (
	stkStart = iota // an open element
	stkNs           // a saved name space translation
	stkEOF          // marker at which Token reports io.EOF
)
|
|
|
|
func (d *Decoder) push(kind int) *stack {
|
|
s := d.free
|
|
if s != nil {
|
|
d.free = s.next
|
|
} else {
|
|
s = new(stack)
|
|
}
|
|
s.next = d.stk
|
|
s.kind = kind
|
|
d.stk = s
|
|
return s
|
|
}
|
|
|
|
func (d *Decoder) pop() *stack {
|
|
s := d.stk
|
|
if s != nil {
|
|
d.stk = s.next
|
|
s.next = d.free
|
|
d.free = s
|
|
}
|
|
return s
|
|
}
|
|
|
|
// Record that after the current element is finished
|
|
// (that element is already pushed on the stack)
|
|
// Token should return EOF until popEOF is called.
|
|
func (d *Decoder) pushEOF() {
|
|
// Walk down stack to find Start.
|
|
// It might not be the top, because there might be stkNs
|
|
// entries above it.
|
|
start := d.stk
|
|
for start.kind != stkStart {
|
|
start = start.next
|
|
}
|
|
// The stkNs entries below a start are associated with that
|
|
// element too; skip over them.
|
|
for start.next != nil && start.next.kind == stkNs {
|
|
start = start.next
|
|
}
|
|
s := d.free
|
|
if s != nil {
|
|
d.free = s.next
|
|
} else {
|
|
s = new(stack)
|
|
}
|
|
s.kind = stkEOF
|
|
s.next = start.next
|
|
start.next = s
|
|
}
|
|
|
|
// Undo a pushEOF.
|
|
// The element must have been finished, so the EOF should be at the top of the stack.
|
|
func (d *Decoder) popEOF() bool {
|
|
if d.stk == nil || d.stk.kind != stkEOF {
|
|
return false
|
|
}
|
|
d.pop()
|
|
return true
|
|
}
|
|
|
|
// Record that we are starting an element with the given name.
|
|
func (d *Decoder) pushElement(name Name) {
|
|
s := d.push(stkStart)
|
|
s.name = name
|
|
}
|
|
|
|
// Record that we are changing the value of ns[local].
|
|
// The old value is url, ok.
|
|
func (d *Decoder) pushNs(local string, url string, ok bool) {
|
|
s := d.push(stkNs)
|
|
s.name.Local = local
|
|
s.name.Space = url
|
|
s.ok = ok
|
|
}
|
|
|
|
// Creates a SyntaxError with the current line number.
|
|
func (d *Decoder) syntaxError(msg string) error {
|
|
return &SyntaxError{Msg: msg, Line: d.line}
|
|
}
|
|
|
|
// Record that we are ending an element with the given name.
|
|
// The name must match the record at the top of the stack,
|
|
// which must be a pushElement record.
|
|
// After popping the element, apply any undo records from
|
|
// the stack to restore the name translations that existed
|
|
// before we saw this element.
|
|
func (d *Decoder) popElement(t *EndElement) bool {
|
|
s := d.pop()
|
|
name := t.Name
|
|
switch {
|
|
case s == nil || s.kind != stkStart:
|
|
d.err = d.syntaxError("unexpected end element </" + name.Local + ">")
|
|
return false
|
|
case s.name.Local != name.Local:
|
|
if !d.Strict {
|
|
d.needClose = true
|
|
d.toClose = t.Name
|
|
t.Name = s.name
|
|
return true
|
|
}
|
|
d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">")
|
|
return false
|
|
case s.name.Space != name.Space:
|
|
d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
|
|
"closed by </" + name.Local + "> in space " + name.Space)
|
|
return false
|
|
}
|
|
|
|
// Pop stack until a Start or EOF is on the top, undoing the
|
|
// translations that were associated with the element we just closed.
|
|
for d.stk != nil && d.stk.kind != stkStart && d.stk.kind != stkEOF {
|
|
s := d.pop()
|
|
if s.ok {
|
|
d.ns[s.name.Local] = s.name.Space
|
|
} else {
|
|
delete(d.ns, s.name.Local)
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// If the top element on the stack is autoclosing and
|
|
// t is not the end tag, invent the end tag.
|
|
func (d *Decoder) autoClose(t Token) (Token, bool) {
|
|
if d.stk == nil || d.stk.kind != stkStart {
|
|
return nil, false
|
|
}
|
|
name := strings.ToLower(d.stk.name.Local)
|
|
for _, s := range d.AutoClose {
|
|
if strings.ToLower(s) == name {
|
|
// This one should be auto closed if t doesn't close it.
|
|
et, ok := t.(EndElement)
|
|
if !ok || et.Name.Local != name {
|
|
return EndElement{d.stk.name}, true
|
|
}
|
|
break
|
|
}
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
var errRawToken = errors.New("xml: cannot use RawToken from UnmarshalXML method")
|
|
|
|
// RawToken is like Token but does not verify that
|
|
// start and end elements match and does not translate
|
|
// name space prefixes to their corresponding URLs.
|
|
func (d *Decoder) RawToken() (Token, error) {
|
|
if d.unmarshalDepth > 0 {
|
|
return nil, errRawToken
|
|
}
|
|
return d.rawToken()
|
|
}
|
|
|
|
func (d *Decoder) rawToken() (Token, error) {
|
|
if d.err != nil {
|
|
return nil, d.err
|
|
}
|
|
if d.needClose {
|
|
// The last element we read was self-closing and
|
|
// we returned just the StartElement half.
|
|
// Return the EndElement half now.
|
|
d.needClose = false
|
|
return EndElement{d.toClose}, nil
|
|
}
|
|
|
|
b, ok := d.getc()
|
|
if !ok {
|
|
return nil, d.err
|
|
}
|
|
|
|
if b != '<' {
|
|
// Text section.
|
|
d.ungetc(b)
|
|
data := d.text(-1, false)
|
|
if data == nil {
|
|
return nil, d.err
|
|
}
|
|
return CharData(data), nil
|
|
}
|
|
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
switch b {
|
|
case '/':
|
|
// </: End element
|
|
var name Name
|
|
if name, ok = d.nsname(); !ok {
|
|
if d.err == nil {
|
|
d.err = d.syntaxError("expected element name after </")
|
|
}
|
|
return nil, d.err
|
|
}
|
|
d.space()
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if b != '>' {
|
|
d.err = d.syntaxError("invalid characters between </" + name.Local + " and >")
|
|
return nil, d.err
|
|
}
|
|
return EndElement{name}, nil
|
|
|
|
case '?':
|
|
// <?: Processing instruction.
|
|
// TODO(rsc): Should parse the <?xml declaration to make sure the version is 1.0.
|
|
var target string
|
|
if target, ok = d.name(); !ok {
|
|
if d.err == nil {
|
|
d.err = d.syntaxError("expected target name after <?")
|
|
}
|
|
return nil, d.err
|
|
}
|
|
d.space()
|
|
d.buf.Reset()
|
|
var b0 byte
|
|
for {
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
d.buf.WriteByte(b)
|
|
if b0 == '?' && b == '>' {
|
|
break
|
|
}
|
|
b0 = b
|
|
}
|
|
data := d.buf.Bytes()
|
|
data = data[0 : len(data)-2] // chop ?>
|
|
|
|
if target == "xml" {
|
|
enc := procInstEncoding(string(data))
|
|
if enc != "" && enc != "utf-8" && enc != "UTF-8" {
|
|
if d.CharsetReader == nil {
|
|
d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc)
|
|
return nil, d.err
|
|
}
|
|
newr, err := d.CharsetReader(enc, d.r.(io.Reader))
|
|
if err != nil {
|
|
d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
|
|
return nil, d.err
|
|
}
|
|
if newr == nil {
|
|
panic("CharsetReader returned a nil Reader for charset " + enc)
|
|
}
|
|
d.switchToReader(newr)
|
|
}
|
|
}
|
|
return ProcInst{target, data}, nil
|
|
|
|
case '!':
|
|
// <!: Maybe comment, maybe CDATA.
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
switch b {
|
|
case '-': // <!-
|
|
// Probably <!-- for a comment.
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if b != '-' {
|
|
d.err = d.syntaxError("invalid sequence <!- not part of <!--")
|
|
return nil, d.err
|
|
}
|
|
// Look for terminator.
|
|
d.buf.Reset()
|
|
var b0, b1 byte
|
|
for {
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
d.buf.WriteByte(b)
|
|
if b0 == '-' && b1 == '-' && b == '>' {
|
|
break
|
|
}
|
|
b0, b1 = b1, b
|
|
}
|
|
data := d.buf.Bytes()
|
|
data = data[0 : len(data)-3] // chop -->
|
|
return Comment(data), nil
|
|
|
|
case '[': // <![
|
|
// Probably <![CDATA[.
|
|
for i := 0; i < 6; i++ {
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if b != "CDATA["[i] {
|
|
d.err = d.syntaxError("invalid <![ sequence")
|
|
return nil, d.err
|
|
}
|
|
}
|
|
// Have <![CDATA[. Read text until ]]>.
|
|
data := d.text(-1, true)
|
|
if data == nil {
|
|
return nil, d.err
|
|
}
|
|
return CharData(data), nil
|
|
}
|
|
|
|
// Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
|
|
// We don't care, but accumulate for caller. Quoted angle
|
|
// brackets do not count for nesting.
|
|
d.buf.Reset()
|
|
d.buf.WriteByte(b)
|
|
inquote := uint8(0)
|
|
depth := 0
|
|
for {
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if inquote == 0 && b == '>' && depth == 0 {
|
|
break
|
|
}
|
|
HandleB:
|
|
d.buf.WriteByte(b)
|
|
switch {
|
|
case b == inquote:
|
|
inquote = 0
|
|
|
|
case inquote != 0:
|
|
// in quotes, no special action
|
|
|
|
case b == '\'' || b == '"':
|
|
inquote = b
|
|
|
|
case b == '>' && inquote == 0:
|
|
depth--
|
|
|
|
case b == '<' && inquote == 0:
|
|
// Look for <!-- to begin comment.
|
|
s := "!--"
|
|
for i := 0; i < len(s); i++ {
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if b != s[i] {
|
|
for j := 0; j < i; j++ {
|
|
d.buf.WriteByte(s[j])
|
|
}
|
|
depth++
|
|
goto HandleB
|
|
}
|
|
}
|
|
|
|
// Remove < that was written above.
|
|
d.buf.Truncate(d.buf.Len() - 1)
|
|
|
|
// Look for terminator.
|
|
var b0, b1 byte
|
|
for {
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if b0 == '-' && b1 == '-' && b == '>' {
|
|
break
|
|
}
|
|
b0, b1 = b1, b
|
|
}
|
|
}
|
|
}
|
|
return Directive(d.buf.Bytes()), nil
|
|
}
|
|
|
|
// Must be an open element like <a href="foo">
|
|
d.ungetc(b)
|
|
|
|
var (
|
|
name Name
|
|
empty bool
|
|
attr []Attr
|
|
)
|
|
if name, ok = d.nsname(); !ok {
|
|
if d.err == nil {
|
|
d.err = d.syntaxError("expected element name after <")
|
|
}
|
|
return nil, d.err
|
|
}
|
|
|
|
attr = make([]Attr, 0, 4)
|
|
for {
|
|
d.space()
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if b == '/' {
|
|
empty = true
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if b != '>' {
|
|
d.err = d.syntaxError("expected /> in element")
|
|
return nil, d.err
|
|
}
|
|
break
|
|
}
|
|
if b == '>' {
|
|
break
|
|
}
|
|
d.ungetc(b)
|
|
|
|
n := len(attr)
|
|
if n >= cap(attr) {
|
|
nattr := make([]Attr, n, 2*cap(attr))
|
|
copy(nattr, attr)
|
|
attr = nattr
|
|
}
|
|
attr = attr[0 : n+1]
|
|
a := &attr[n]
|
|
if a.Name, ok = d.nsname(); !ok {
|
|
if d.err == nil {
|
|
d.err = d.syntaxError("expected attribute name in element")
|
|
}
|
|
return nil, d.err
|
|
}
|
|
d.space()
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil, d.err
|
|
}
|
|
if b != '=' {
|
|
if d.Strict {
|
|
d.err = d.syntaxError("attribute name without = in element")
|
|
return nil, d.err
|
|
} else {
|
|
d.ungetc(b)
|
|
a.Value = a.Name.Local
|
|
}
|
|
} else {
|
|
d.space()
|
|
data := d.attrval()
|
|
if data == nil {
|
|
return nil, d.err
|
|
}
|
|
a.Value = string(data)
|
|
}
|
|
}
|
|
if empty {
|
|
d.needClose = true
|
|
d.toClose = name
|
|
}
|
|
return StartElement{name, attr}, nil
|
|
}
|
|
|
|
func (d *Decoder) attrval() []byte {
|
|
b, ok := d.mustgetc()
|
|
if !ok {
|
|
return nil
|
|
}
|
|
// Handle quoted attribute values
|
|
if b == '"' || b == '\'' {
|
|
return d.text(int(b), false)
|
|
}
|
|
// Handle unquoted attribute values for strict parsers
|
|
if d.Strict {
|
|
d.err = d.syntaxError("unquoted or missing attribute value in element")
|
|
return nil
|
|
}
|
|
// Handle unquoted attribute values for unstrict parsers
|
|
d.ungetc(b)
|
|
d.buf.Reset()
|
|
for {
|
|
b, ok = d.mustgetc()
|
|
if !ok {
|
|
return nil
|
|
}
|
|
// http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.2
|
|
if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' ||
|
|
'0' <= b && b <= '9' || b == '_' || b == ':' || b == '-' {
|
|
d.buf.WriteByte(b)
|
|
} else {
|
|
d.ungetc(b)
|
|
break
|
|
}
|
|
}
|
|
return d.buf.Bytes()
|
|
}
|
|
|
|
// Skip spaces if any
|
|
func (d *Decoder) space() {
|
|
for {
|
|
b, ok := d.getc()
|
|
if !ok {
|
|
return
|
|
}
|
|
switch b {
|
|
case ' ', '\r', '\n', '\t':
|
|
default:
|
|
d.ungetc(b)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// Read a single byte.
|
|
// If there is no byte to read, return ok==false
|
|
// and leave the error in d.err.
|
|
// Maintain line number.
|
|
func (d *Decoder) getc() (b byte, ok bool) {
|
|
if d.err != nil {
|
|
return 0, false
|
|
}
|
|
if d.nextByte >= 0 {
|
|
b = byte(d.nextByte)
|
|
d.nextByte = -1
|
|
} else {
|
|
b, d.err = d.r.ReadByte()
|
|
if d.err != nil {
|
|
return 0, false
|
|
}
|
|
if d.saved != nil {
|
|
d.saved.WriteByte(b)
|
|
}
|
|
}
|
|
if b == '\n' {
|
|
d.line++
|
|
}
|
|
d.offset++
|
|
return b, true
|
|
}
|
|
|
|
// InputOffset returns the input stream byte offset of the current decoder position.
|
|
// The offset gives the location of the end of the most recently returned token
|
|
// and the beginning of the next token.
|
|
func (d *Decoder) InputOffset() int64 {
|
|
return d.offset
|
|
}
|
|
|
|
// Return saved offset.
|
|
// If we did ungetc (nextByte >= 0), have to back up one.
|
|
func (d *Decoder) savedOffset() int {
|
|
n := d.saved.Len()
|
|
if d.nextByte >= 0 {
|
|
n--
|
|
}
|
|
return n
|
|
}
|
|
|
|
// Must read a single byte.
|
|
// If there is no byte to read,
|
|
// set d.err to SyntaxError("unexpected EOF")
|
|
// and return ok==false
|
|
func (d *Decoder) mustgetc() (b byte, ok bool) {
|
|
if b, ok = d.getc(); !ok {
|
|
if d.err == io.EOF {
|
|
d.err = d.syntaxError("unexpected EOF")
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// Unread a single byte.
|
|
func (d *Decoder) ungetc(b byte) {
|
|
if b == '\n' {
|
|
d.line--
|
|
}
|
|
d.nextByte = int(b)
|
|
d.offset--
|
|
}
|
|
|
|
// entity maps the five predefined XML entity names to the characters
// they stand for.
var entity = map[string]int{
	"amp":  '&',
	"apos": '\'',
	"gt":   '>',
	"lt":   '<',
	"quot": '"',
}
|
|
|
|
// Read plain text section (XML calls it character data).
|
|
// If quote >= 0, we are in a quoted string and need to find the matching quote.
|
|
// If cdata == true, we are in a <![CDATA[ section and need to find ]]>.
|
|
// On failure return nil and leave the error in d.err.
|
|
func (d *Decoder) text(quote int, cdata bool) []byte {
|
|
var b0, b1 byte
|
|
var trunc int
|
|
d.buf.Reset()
|
|
Input:
|
|
for {
|
|
b, ok := d.getc()
|
|
if !ok {
|
|
if cdata {
|
|
if d.err == io.EOF {
|
|
d.err = d.syntaxError("unexpected EOF in CDATA section")
|
|
}
|
|
return nil
|
|
}
|
|
break Input
|
|
}
|
|
|
|
// <![CDATA[ section ends with ]]>.
|
|
// It is an error for ]]> to appear in ordinary text.
|
|
if b0 == ']' && b1 == ']' && b == '>' {
|
|
if cdata {
|
|
trunc = 2
|
|
break Input
|
|
}
|
|
d.err = d.syntaxError("unescaped ]]> not in CDATA section")
|
|
return nil
|
|
}
|
|
|
|
// Stop reading text if we see a <.
|
|
if b == '<' && !cdata {
|
|
if quote >= 0 {
|
|
d.err = d.syntaxError("unescaped < inside quoted string")
|
|
return nil
|
|
}
|
|
d.ungetc('<')
|
|
break Input
|
|
}
|
|
if quote >= 0 && b == byte(quote) {
|
|
break Input
|
|
}
|
|
if b == '&' && !cdata {
|
|
// Read escaped character expression up to semicolon.
|
|
// XML in all its glory allows a document to define and use
|
|
// its own character names with <!ENTITY ...> directives.
|
|
// Parsers are required to recognize lt, gt, amp, apos, and quot
|
|
// even if they have not been declared.
|
|
before := d.buf.Len()
|
|
d.buf.WriteByte('&')
|
|
var ok bool
|
|
var text string
|
|
var haveText bool
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil
|
|
}
|
|
if b == '#' {
|
|
d.buf.WriteByte(b)
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil
|
|
}
|
|
base := 10
|
|
if b == 'x' {
|
|
base = 16
|
|
d.buf.WriteByte(b)
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil
|
|
}
|
|
}
|
|
start := d.buf.Len()
|
|
for '0' <= b && b <= '9' ||
|
|
base == 16 && 'a' <= b && b <= 'f' ||
|
|
base == 16 && 'A' <= b && b <= 'F' {
|
|
d.buf.WriteByte(b)
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil
|
|
}
|
|
}
|
|
if b != ';' {
|
|
d.ungetc(b)
|
|
} else {
|
|
s := string(d.buf.Bytes()[start:])
|
|
d.buf.WriteByte(';')
|
|
n, err := strconv.ParseUint(s, base, 64)
|
|
if err == nil && n <= unicode.MaxRune {
|
|
text = string(n)
|
|
haveText = true
|
|
}
|
|
}
|
|
} else {
|
|
d.ungetc(b)
|
|
if !d.readName() {
|
|
if d.err != nil {
|
|
return nil
|
|
}
|
|
ok = false
|
|
}
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return nil
|
|
}
|
|
if b != ';' {
|
|
d.ungetc(b)
|
|
} else {
|
|
name := d.buf.Bytes()[before+1:]
|
|
d.buf.WriteByte(';')
|
|
if isName(name) {
|
|
s := string(name)
|
|
if r, ok := entity[s]; ok {
|
|
text = string(r)
|
|
haveText = true
|
|
} else if d.Entity != nil {
|
|
text, haveText = d.Entity[s]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if haveText {
|
|
d.buf.Truncate(before)
|
|
d.buf.Write([]byte(text))
|
|
b0, b1 = 0, 0
|
|
continue Input
|
|
}
|
|
if !d.Strict {
|
|
b0, b1 = 0, 0
|
|
continue Input
|
|
}
|
|
ent := string(d.buf.Bytes()[before:])
|
|
if ent[len(ent)-1] != ';' {
|
|
ent += " (no semicolon)"
|
|
}
|
|
d.err = d.syntaxError("invalid character entity " + ent)
|
|
return nil
|
|
}
|
|
|
|
// We must rewrite unescaped \r and \r\n into \n.
|
|
if b == '\r' {
|
|
d.buf.WriteByte('\n')
|
|
} else if b1 == '\r' && b == '\n' {
|
|
// Skip \r\n--we already wrote \n.
|
|
} else {
|
|
d.buf.WriteByte(b)
|
|
}
|
|
|
|
b0, b1 = b1, b
|
|
}
|
|
data := d.buf.Bytes()
|
|
data = data[0 : len(data)-trunc]
|
|
|
|
// Inspect each rune for being a disallowed character.
|
|
buf := data
|
|
for len(buf) > 0 {
|
|
r, size := utf8.DecodeRune(buf)
|
|
if r == utf8.RuneError && size == 1 {
|
|
d.err = d.syntaxError("invalid UTF-8")
|
|
return nil
|
|
}
|
|
buf = buf[size:]
|
|
if !isInCharacterRange(r) {
|
|
d.err = d.syntaxError(fmt.Sprintf("illegal character code %U", r))
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return data
|
|
}
|
|
|
|
// isInCharacterRange reports whether r is in the XML Character Range, per
// the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The surrogate block (0xD800-0xDFFF) is excluded.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF ||
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}
|
|
|
|
// Get name space name: name with a : stuck in the middle.
|
|
// The part before the : is the name space identifier.
|
|
func (d *Decoder) nsname() (name Name, ok bool) {
|
|
s, ok := d.name()
|
|
if !ok {
|
|
return
|
|
}
|
|
i := strings.Index(s, ":")
|
|
if i < 0 {
|
|
name.Local = s
|
|
} else {
|
|
name.Space = s[0:i]
|
|
name.Local = s[i+1:]
|
|
}
|
|
return name, true
|
|
}
|
|
|
|
// Get name: /first(first|second)*/
|
|
// Do not set d.err if the name is missing (unless unexpected EOF is received):
|
|
// let the caller provide better context.
|
|
func (d *Decoder) name() (s string, ok bool) {
|
|
d.buf.Reset()
|
|
if !d.readName() {
|
|
return "", false
|
|
}
|
|
|
|
// Now we check the characters.
|
|
s = d.buf.String()
|
|
if !isName([]byte(s)) {
|
|
d.err = d.syntaxError("invalid XML name: " + s)
|
|
return "", false
|
|
}
|
|
return s, true
|
|
}
|
|
|
|
// Read a name and append its bytes to d.buf.
|
|
// The name is delimited by any single-byte character not valid in names.
|
|
// All multi-byte characters are accepted; the caller must check their validity.
|
|
func (d *Decoder) readName() (ok bool) {
|
|
var b byte
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return
|
|
}
|
|
if b < utf8.RuneSelf && !isNameByte(b) {
|
|
d.ungetc(b)
|
|
return false
|
|
}
|
|
d.buf.WriteByte(b)
|
|
|
|
for {
|
|
if b, ok = d.mustgetc(); !ok {
|
|
return
|
|
}
|
|
if b < utf8.RuneSelf && !isNameByte(b) {
|
|
d.ungetc(b)
|
|
break
|
|
}
|
|
d.buf.WriteByte(b)
|
|
}
|
|
return true
|
|
}
|
|
|
|
// isNameByte reports whether c is an ASCII letter, an ASCII digit,
// or one of '_', ':', '.', '-'.
func isNameByte(c byte) bool {
	switch {
	case 'A' <= c && c <= 'Z', 'a' <= c && c <= 'z', '0' <= c && c <= '9':
		return true
	case c == '_', c == ':', c == '.', c == '-':
		return true
	}
	return false
}
|
|
|
|
func isName(s []byte) bool {
|
|
if len(s) == 0 {
|
|
return false
|
|
}
|
|
c, n := utf8.DecodeRune(s)
|
|
if c == utf8.RuneError && n == 1 {
|
|
return false
|
|
}
|
|
if !unicode.Is(first, c) {
|
|
return false
|
|
}
|
|
for n < len(s) {
|
|
s = s[n:]
|
|
c, n = utf8.DecodeRune(s)
|
|
if c == utf8.RuneError && n == 1 {
|
|
return false
|
|
}
|
|
if !unicode.Is(first, c) && !unicode.Is(second, c) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
func isNameString(s string) bool {
|
|
if len(s) == 0 {
|
|
return false
|
|
}
|
|
c, n := utf8.DecodeRuneInString(s)
|
|
if c == utf8.RuneError && n == 1 {
|
|
return false
|
|
}
|
|
if !unicode.Is(first, c) {
|
|
return false
|
|
}
|
|
for n < len(s) {
|
|
s = s[n:]
|
|
c, n = utf8.DecodeRuneInString(s)
|
|
if c == utf8.RuneError && n == 1 {
|
|
return false
|
|
}
|
|
if !unicode.Is(first, c) && !unicode.Is(second, c) {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// These tables were generated by cut and paste from Appendix B of
|
|
// the XML spec at http://www.xml.com/axml/testaxml.htm
|
|
// and then reformatting. First corresponds to (Letter | '_' | ':')
|
|
// and second corresponds to NameChar.
|
|
|
|
// first is the set of runes that isName and isNameString accept as the
// first rune of a name (and, together with second, at any later position).
//
// Rows are {Lo, Hi, Stride} and must stay sorted and non-overlapping.
// A row written as {Lo, Hi, Hi - Lo} contains exactly the two code points
// Lo and Hi. The section comments name the Unicode blocks the rows fall
// in and are for orientation only.
var first = &unicode.RangeTable{
	R16: []unicode.Range16{
		// Basic Latin: ':', 'A'-'Z', '_', 'a'-'z'.
		{0x003A, 0x003A, 1},
		{0x0041, 0x005A, 1},
		{0x005F, 0x005F, 1},
		{0x0061, 0x007A, 1},

		// Latin-1 Supplement, Latin Extended-A/B, IPA Extensions,
		// Spacing Modifier Letters.
		{0x00C0, 0x00D6, 1},
		{0x00D8, 0x00F6, 1},
		{0x00F8, 0x00FF, 1},
		{0x0100, 0x0131, 1},
		{0x0134, 0x013E, 1},
		{0x0141, 0x0148, 1},
		{0x014A, 0x017E, 1},
		{0x0180, 0x01C3, 1},
		{0x01CD, 0x01F0, 1},
		{0x01F4, 0x01F5, 1},
		{0x01FA, 0x0217, 1},
		{0x0250, 0x02A8, 1},
		{0x02BB, 0x02C1, 1},

		// Greek.
		{0x0386, 0x0386, 1},
		{0x0388, 0x038A, 1},
		{0x038C, 0x038C, 1},
		{0x038E, 0x03A1, 1},
		{0x03A3, 0x03CE, 1},
		{0x03D0, 0x03D6, 1},
		{0x03DA, 0x03E0, 2},
		{0x03E2, 0x03F3, 1},

		// Cyrillic.
		{0x0401, 0x040C, 1},
		{0x040E, 0x044F, 1},
		{0x0451, 0x045C, 1},
		{0x045E, 0x0481, 1},
		{0x0490, 0x04C4, 1},
		{0x04C7, 0x04C8, 1},
		{0x04CB, 0x04CC, 1},
		{0x04D0, 0x04EB, 1},
		{0x04EE, 0x04F5, 1},
		{0x04F8, 0x04F9, 1},

		// Armenian.
		{0x0531, 0x0556, 1},
		{0x0559, 0x0559, 1},
		{0x0561, 0x0586, 1},

		// Hebrew.
		{0x05D0, 0x05EA, 1},
		{0x05F0, 0x05F2, 1},

		// Arabic.
		{0x0621, 0x063A, 1},
		{0x0641, 0x064A, 1},
		{0x0671, 0x06B7, 1},
		{0x06BA, 0x06BE, 1},
		{0x06C0, 0x06CE, 1},
		{0x06D0, 0x06D3, 1},
		{0x06D5, 0x06D5, 1},
		{0x06E5, 0x06E6, 1},

		// Devanagari.
		{0x0905, 0x0939, 1},
		{0x093D, 0x093D, 1},
		{0x0958, 0x0961, 1},

		// Bengali.
		{0x0985, 0x098C, 1},
		{0x098F, 0x0990, 1},
		{0x0993, 0x09A8, 1},
		{0x09AA, 0x09B0, 1},
		{0x09B2, 0x09B2, 1},
		{0x09B6, 0x09B9, 1},
		{0x09DC, 0x09DD, 1},
		{0x09DF, 0x09E1, 1},
		{0x09F0, 0x09F1, 1},

		// Gurmukhi.
		{0x0A05, 0x0A0A, 1},
		{0x0A0F, 0x0A10, 1},
		{0x0A13, 0x0A28, 1},
		{0x0A2A, 0x0A30, 1},
		{0x0A32, 0x0A33, 1},
		{0x0A35, 0x0A36, 1},
		{0x0A38, 0x0A39, 1},
		{0x0A59, 0x0A5C, 1},
		{0x0A5E, 0x0A5E, 1},
		{0x0A72, 0x0A74, 1},

		// Gujarati.
		{0x0A85, 0x0A8B, 1},
		{0x0A8D, 0x0A8D, 1},
		{0x0A8F, 0x0A91, 1},
		{0x0A93, 0x0AA8, 1},
		{0x0AAA, 0x0AB0, 1},
		{0x0AB2, 0x0AB3, 1},
		{0x0AB5, 0x0AB9, 1},
		{0x0ABD, 0x0AE0, 0x0AE0 - 0x0ABD},

		// Oriya.
		{0x0B05, 0x0B0C, 1},
		{0x0B0F, 0x0B10, 1},
		{0x0B13, 0x0B28, 1},
		{0x0B2A, 0x0B30, 1},
		{0x0B32, 0x0B33, 1},
		{0x0B36, 0x0B39, 1},
		{0x0B3D, 0x0B3D, 1},
		{0x0B5C, 0x0B5D, 1},
		{0x0B5F, 0x0B61, 1},

		// Tamil.
		{0x0B85, 0x0B8A, 1},
		{0x0B8E, 0x0B90, 1},
		{0x0B92, 0x0B95, 1},
		{0x0B99, 0x0B9A, 1},
		{0x0B9C, 0x0B9C, 1},
		{0x0B9E, 0x0B9F, 1},
		{0x0BA3, 0x0BA4, 1},
		{0x0BA8, 0x0BAA, 1},
		{0x0BAE, 0x0BB5, 1},
		{0x0BB7, 0x0BB9, 1},

		// Telugu.
		{0x0C05, 0x0C0C, 1},
		{0x0C0E, 0x0C10, 1},
		{0x0C12, 0x0C28, 1},
		{0x0C2A, 0x0C33, 1},
		{0x0C35, 0x0C39, 1},
		{0x0C60, 0x0C61, 1},

		// Kannada.
		{0x0C85, 0x0C8C, 1},
		{0x0C8E, 0x0C90, 1},
		{0x0C92, 0x0CA8, 1},
		{0x0CAA, 0x0CB3, 1},
		{0x0CB5, 0x0CB9, 1},
		{0x0CDE, 0x0CDE, 1},
		{0x0CE0, 0x0CE1, 1},

		// Malayalam.
		{0x0D05, 0x0D0C, 1},
		{0x0D0E, 0x0D10, 1},
		{0x0D12, 0x0D28, 1},
		{0x0D2A, 0x0D39, 1},
		{0x0D60, 0x0D61, 1},

		// Thai.
		{0x0E01, 0x0E2E, 1},
		{0x0E30, 0x0E30, 1},
		{0x0E32, 0x0E33, 1},
		{0x0E40, 0x0E45, 1},

		// Lao.
		{0x0E81, 0x0E82, 1},
		{0x0E84, 0x0E84, 1},
		{0x0E87, 0x0E88, 1},
		{0x0E8A, 0x0E8D, 0x0E8D - 0x0E8A},
		{0x0E94, 0x0E97, 1},
		{0x0E99, 0x0E9F, 1},
		{0x0EA1, 0x0EA3, 1},
		{0x0EA5, 0x0EA7, 0x0EA7 - 0x0EA5},
		{0x0EAA, 0x0EAB, 1},
		{0x0EAD, 0x0EAE, 1},
		{0x0EB0, 0x0EB0, 1},
		{0x0EB2, 0x0EB3, 1},
		{0x0EBD, 0x0EBD, 1},
		{0x0EC0, 0x0EC4, 1},

		// Tibetan.
		{0x0F40, 0x0F47, 1},
		{0x0F49, 0x0F69, 1},

		// Georgian.
		{0x10A0, 0x10C5, 1},
		{0x10D0, 0x10F6, 1},

		// Hangul Jamo.
		{0x1100, 0x1100, 1},
		{0x1102, 0x1103, 1},
		{0x1105, 0x1107, 1},
		{0x1109, 0x1109, 1},
		{0x110B, 0x110C, 1},
		{0x110E, 0x1112, 1},
		{0x113C, 0x1140, 2},
		{0x114C, 0x1150, 2},
		{0x1154, 0x1155, 1},
		{0x1159, 0x1159, 1},
		{0x115F, 0x1161, 1},
		{0x1163, 0x1169, 2},
		{0x116D, 0x116E, 1},
		{0x1172, 0x1173, 1},
		{0x1175, 0x119E, 0x119E - 0x1175},
		{0x11A8, 0x11AB, 0x11AB - 0x11A8},
		{0x11AE, 0x11AF, 1},
		{0x11B7, 0x11B8, 1},
		{0x11BA, 0x11BA, 1},
		{0x11BC, 0x11C2, 1},
		{0x11EB, 0x11F0, 0x11F0 - 0x11EB},
		{0x11F9, 0x11F9, 1},

		// Latin Extended Additional.
		{0x1E00, 0x1E9B, 1},
		{0x1EA0, 0x1EF9, 1},

		// Greek Extended.
		{0x1F00, 0x1F15, 1},
		{0x1F18, 0x1F1D, 1},
		{0x1F20, 0x1F45, 1},
		{0x1F48, 0x1F4D, 1},
		{0x1F50, 0x1F57, 1},
		{0x1F59, 0x1F5B, 0x1F5B - 0x1F59},
		{0x1F5D, 0x1F5D, 1},
		{0x1F5F, 0x1F7D, 1},
		{0x1F80, 0x1FB4, 1},
		{0x1FB6, 0x1FBC, 1},
		{0x1FBE, 0x1FBE, 1},
		{0x1FC2, 0x1FC4, 1},
		{0x1FC6, 0x1FCC, 1},
		{0x1FD0, 0x1FD3, 1},
		{0x1FD6, 0x1FDB, 1},
		{0x1FE0, 0x1FEC, 1},
		{0x1FF2, 0x1FF4, 1},
		{0x1FF6, 0x1FFC, 1},

		// Letterlike Symbols and Number Forms.
		{0x2126, 0x2126, 1},
		{0x212A, 0x212B, 1},
		{0x212E, 0x212E, 1},
		{0x2180, 0x2182, 1},

		// CJK Symbols and Punctuation.
		{0x3007, 0x3007, 1},
		{0x3021, 0x3029, 1},

		// Hiragana, Katakana, Bopomofo.
		{0x3041, 0x3094, 1},
		{0x30A1, 0x30FA, 1},
		{0x3105, 0x312C, 1},

		// CJK Unified Ideographs.
		{0x4E00, 0x9FA5, 1},

		// Hangul Syllables.
		{0xAC00, 0xD7A3, 1},
	},
}
|
|
|
|
// second is the set of runes that isName and isNameString accept, in
// addition to those in first, at every position of a name except the
// first one.
//
// Rows are {Lo, Hi, Stride} and must stay sorted and non-overlapping.
// A row written as {Lo, Hi, Hi - Lo} contains exactly the two code points
// Lo and Hi. The section comments name the Unicode blocks the rows fall
// in and are for orientation only.
var second = &unicode.RangeTable{
	R16: []unicode.Range16{
		// Basic Latin: '-', '.', '0'-'9'.
		{0x002D, 0x002E, 1},
		{0x0030, 0x0039, 1},

		// Latin-1 Supplement, Spacing Modifier Letters.
		{0x00B7, 0x00B7, 1},
		{0x02D0, 0x02D1, 1},

		// Combining Diacritical Marks.
		{0x0300, 0x0345, 1},
		{0x0360, 0x0361, 1},

		// Greek.
		{0x0387, 0x0387, 1},

		// Cyrillic.
		{0x0483, 0x0486, 1},

		// Hebrew and Arabic (the 0x05C4/0x0640 row has one code point in each).
		{0x0591, 0x05A1, 1},
		{0x05A3, 0x05B9, 1},
		{0x05BB, 0x05BD, 1},
		{0x05BF, 0x05BF, 1},
		{0x05C1, 0x05C2, 1},
		{0x05C4, 0x0640, 0x0640 - 0x05C4},
		{0x064B, 0x0652, 1},
		{0x0660, 0x0669, 1},
		{0x0670, 0x0670, 1},
		{0x06D6, 0x06DC, 1},
		{0x06DD, 0x06DF, 1},
		{0x06E0, 0x06E4, 1},
		{0x06E7, 0x06E8, 1},
		{0x06EA, 0x06ED, 1},
		{0x06F0, 0x06F9, 1},

		// Devanagari.
		{0x0901, 0x0903, 1},
		{0x093C, 0x093C, 1},
		{0x093E, 0x094C, 1},
		{0x094D, 0x094D, 1},
		{0x0951, 0x0954, 1},
		{0x0962, 0x0963, 1},
		{0x0966, 0x096F, 1},

		// Bengali.
		{0x0981, 0x0983, 1},
		{0x09BC, 0x09BC, 1},
		{0x09BE, 0x09BF, 1},
		{0x09C0, 0x09C4, 1},
		{0x09C7, 0x09C8, 1},
		{0x09CB, 0x09CD, 1},
		{0x09D7, 0x09D7, 1},
		{0x09E2, 0x09E3, 1},
		{0x09E6, 0x09EF, 1},

		// Gurmukhi.
		{0x0A02, 0x0A3C, 0x0A3C - 0x0A02},
		{0x0A3E, 0x0A3F, 1},
		{0x0A40, 0x0A42, 1},
		{0x0A47, 0x0A48, 1},
		{0x0A4B, 0x0A4D, 1},
		{0x0A66, 0x0A6F, 1},
		{0x0A70, 0x0A71, 1},

		// Gujarati.
		{0x0A81, 0x0A83, 1},
		{0x0ABC, 0x0ABC, 1},
		{0x0ABE, 0x0AC5, 1},
		{0x0AC7, 0x0AC9, 1},
		{0x0ACB, 0x0ACD, 1},
		{0x0AE6, 0x0AEF, 1},

		// Oriya.
		{0x0B01, 0x0B03, 1},
		{0x0B3C, 0x0B3C, 1},
		{0x0B3E, 0x0B43, 1},
		{0x0B47, 0x0B48, 1},
		{0x0B4B, 0x0B4D, 1},
		{0x0B56, 0x0B57, 1},
		{0x0B66, 0x0B6F, 1},

		// Tamil.
		{0x0B82, 0x0B83, 1},
		{0x0BBE, 0x0BC2, 1},
		{0x0BC6, 0x0BC8, 1},
		{0x0BCA, 0x0BCD, 1},
		{0x0BD7, 0x0BD7, 1},
		{0x0BE7, 0x0BEF, 1},

		// Telugu.
		{0x0C01, 0x0C03, 1},
		{0x0C3E, 0x0C44, 1},
		{0x0C46, 0x0C48, 1},
		{0x0C4A, 0x0C4D, 1},
		{0x0C55, 0x0C56, 1},
		{0x0C66, 0x0C6F, 1},

		// Kannada.
		{0x0C82, 0x0C83, 1},
		{0x0CBE, 0x0CC4, 1},
		{0x0CC6, 0x0CC8, 1},
		{0x0CCA, 0x0CCD, 1},
		{0x0CD5, 0x0CD6, 1},
		{0x0CE6, 0x0CEF, 1},

		// Malayalam.
		{0x0D02, 0x0D03, 1},
		{0x0D3E, 0x0D43, 1},
		{0x0D46, 0x0D48, 1},
		{0x0D4A, 0x0D4D, 1},
		{0x0D57, 0x0D57, 1},
		{0x0D66, 0x0D6F, 1},

		// Thai.
		{0x0E31, 0x0E31, 1},
		{0x0E34, 0x0E3A, 1},
		{0x0E46, 0x0E46, 1},
		{0x0E47, 0x0E4E, 1},
		{0x0E50, 0x0E59, 1},

		// Lao.
		{0x0EB1, 0x0EB1, 1},
		{0x0EB4, 0x0EB9, 1},
		{0x0EBB, 0x0EBC, 1},
		{0x0EC6, 0x0EC6, 1},
		{0x0EC8, 0x0ECD, 1},
		{0x0ED0, 0x0ED9, 1},

		// Tibetan.
		{0x0F18, 0x0F19, 1},
		{0x0F20, 0x0F29, 1},
		{0x0F35, 0x0F39, 2},
		{0x0F3E, 0x0F3F, 1},
		{0x0F71, 0x0F84, 1},
		{0x0F86, 0x0F8B, 1},
		{0x0F90, 0x0F95, 1},
		{0x0F97, 0x0F97, 1},
		{0x0F99, 0x0FAD, 1},
		{0x0FB1, 0x0FB7, 1},
		{0x0FB9, 0x0FB9, 1},

		// Combining marks for symbols, CJK Symbols and Punctuation
		// (the 0x20E1/0x3005 row has one code point in each).
		{0x20D0, 0x20DC, 1},
		{0x20E1, 0x3005, 0x3005 - 0x20E1},
		{0x302A, 0x302F, 1},
		{0x3031, 0x3035, 1},

		// Hiragana and Katakana.
		{0x3099, 0x309A, 1},
		{0x309D, 0x309E, 1},
		{0x30FC, 0x30FE, 1},
	},
}
|
|
|
|
// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters.
//
// It is the same map value as the unexported htmlEntity, so a write
// through either name is visible through the other.
var HTMLEntity = htmlEntity

// htmlEntity maps an HTML entity name (without the leading '&' and the
// trailing ';') to the character it stands for. The entries were produced
// mechanically by the script in the comment below.
var htmlEntity = map[string]string{
	/*
		hget http://www.w3.org/TR/html4/sgml/entities.html |
		ssam '
			,y /\>/ x/\<(.|\n)+/ s/\n/ /g
			,x v/^\<!ENTITY/d
			,s/\<!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/ "\1": "\\u\2",/g
		'
	*/

	// Characters U+00A0 through U+00FF.
	"nbsp":   "\u00A0",
	"iexcl":  "\u00A1",
	"cent":   "\u00A2",
	"pound":  "\u00A3",
	"curren": "\u00A4",
	"yen":    "\u00A5",
	"brvbar": "\u00A6",
	"sect":   "\u00A7",
	"uml":    "\u00A8",
	"copy":   "\u00A9",
	"ordf":   "\u00AA",
	"laquo":  "\u00AB",
	"not":    "\u00AC",
	"shy":    "\u00AD",
	"reg":    "\u00AE",
	"macr":   "\u00AF",
	"deg":    "\u00B0",
	"plusmn": "\u00B1",
	"sup2":   "\u00B2",
	"sup3":   "\u00B3",
	"acute":  "\u00B4",
	"micro":  "\u00B5",
	"para":   "\u00B6",
	"middot": "\u00B7",
	"cedil":  "\u00B8",
	"sup1":   "\u00B9",
	"ordm":   "\u00BA",
	"raquo":  "\u00BB",
	"frac14": "\u00BC",
	"frac12": "\u00BD",
	"frac34": "\u00BE",
	"iquest": "\u00BF",
	"Agrave": "\u00C0",
	"Aacute": "\u00C1",
	"Acirc":  "\u00C2",
	"Atilde": "\u00C3",
	"Auml":   "\u00C4",
	"Aring":  "\u00C5",
	"AElig":  "\u00C6",
	"Ccedil": "\u00C7",
	"Egrave": "\u00C8",
	"Eacute": "\u00C9",
	"Ecirc":  "\u00CA",
	"Euml":   "\u00CB",
	"Igrave": "\u00CC",
	"Iacute": "\u00CD",
	"Icirc":  "\u00CE",
	"Iuml":   "\u00CF",
	"ETH":    "\u00D0",
	"Ntilde": "\u00D1",
	"Ograve": "\u00D2",
	"Oacute": "\u00D3",
	"Ocirc":  "\u00D4",
	"Otilde": "\u00D5",
	"Ouml":   "\u00D6",
	"times":  "\u00D7",
	"Oslash": "\u00D8",
	"Ugrave": "\u00D9",
	"Uacute": "\u00DA",
	"Ucirc":  "\u00DB",
	"Uuml":   "\u00DC",
	"Yacute": "\u00DD",
	"THORN":  "\u00DE",
	"szlig":  "\u00DF",
	"agrave": "\u00E0",
	"aacute": "\u00E1",
	"acirc":  "\u00E2",
	"atilde": "\u00E3",
	"auml":   "\u00E4",
	"aring":  "\u00E5",
	"aelig":  "\u00E6",
	"ccedil": "\u00E7",
	"egrave": "\u00E8",
	"eacute": "\u00E9",
	"ecirc":  "\u00EA",
	"euml":   "\u00EB",
	"igrave": "\u00EC",
	"iacute": "\u00ED",
	"icirc":  "\u00EE",
	"iuml":   "\u00EF",
	"eth":    "\u00F0",
	"ntilde": "\u00F1",
	"ograve": "\u00F2",
	"oacute": "\u00F3",
	"ocirc":  "\u00F4",
	"otilde": "\u00F5",
	"ouml":   "\u00F6",
	"divide": "\u00F7",
	"oslash": "\u00F8",
	"ugrave": "\u00F9",
	"uacute": "\u00FA",
	"ucirc":  "\u00FB",
	"uuml":   "\u00FC",
	"yacute": "\u00FD",
	"thorn":  "\u00FE",
	"yuml":   "\u00FF",

	// Greek letters, arrows, mathematical operators and other symbols.
	"fnof":     "\u0192",
	"Alpha":    "\u0391",
	"Beta":     "\u0392",
	"Gamma":    "\u0393",
	"Delta":    "\u0394",
	"Epsilon":  "\u0395",
	"Zeta":     "\u0396",
	"Eta":      "\u0397",
	"Theta":    "\u0398",
	"Iota":     "\u0399",
	"Kappa":    "\u039A",
	"Lambda":   "\u039B",
	"Mu":       "\u039C",
	"Nu":       "\u039D",
	"Xi":       "\u039E",
	"Omicron":  "\u039F",
	"Pi":       "\u03A0",
	"Rho":      "\u03A1",
	"Sigma":    "\u03A3",
	"Tau":      "\u03A4",
	"Upsilon":  "\u03A5",
	"Phi":      "\u03A6",
	"Chi":      "\u03A7",
	"Psi":      "\u03A8",
	"Omega":    "\u03A9",
	"alpha":    "\u03B1",
	"beta":     "\u03B2",
	"gamma":    "\u03B3",
	"delta":    "\u03B4",
	"epsilon":  "\u03B5",
	"zeta":     "\u03B6",
	"eta":      "\u03B7",
	"theta":    "\u03B8",
	"iota":     "\u03B9",
	"kappa":    "\u03BA",
	"lambda":   "\u03BB",
	"mu":       "\u03BC",
	"nu":       "\u03BD",
	"xi":       "\u03BE",
	"omicron":  "\u03BF",
	"pi":       "\u03C0",
	"rho":      "\u03C1",
	"sigmaf":   "\u03C2",
	"sigma":    "\u03C3",
	"tau":      "\u03C4",
	"upsilon":  "\u03C5",
	"phi":      "\u03C6",
	"chi":      "\u03C7",
	"psi":      "\u03C8",
	"omega":    "\u03C9",
	"thetasym": "\u03D1",
	"upsih":    "\u03D2",
	"piv":      "\u03D6",
	"bull":     "\u2022",
	"hellip":   "\u2026",
	"prime":    "\u2032",
	"Prime":    "\u2033",
	"oline":    "\u203E",
	"frasl":    "\u2044",
	"weierp":   "\u2118",
	"image":    "\u2111",
	"real":     "\u211C",
	"trade":    "\u2122",
	"alefsym":  "\u2135",
	"larr":     "\u2190",
	"uarr":     "\u2191",
	"rarr":     "\u2192",
	"darr":     "\u2193",
	"harr":     "\u2194",
	"crarr":    "\u21B5",
	"lArr":     "\u21D0",
	"uArr":     "\u21D1",
	"rArr":     "\u21D2",
	"dArr":     "\u21D3",
	"hArr":     "\u21D4",
	"forall":   "\u2200",
	"part":     "\u2202",
	"exist":    "\u2203",
	"empty":    "\u2205",
	"nabla":    "\u2207",
	"isin":     "\u2208",
	"notin":    "\u2209",
	"ni":       "\u220B",
	"prod":     "\u220F",
	"sum":      "\u2211",
	"minus":    "\u2212",
	"lowast":   "\u2217",
	"radic":    "\u221A",
	"prop":     "\u221D",
	"infin":    "\u221E",
	"ang":      "\u2220",
	"and":      "\u2227",
	"or":       "\u2228",
	"cap":      "\u2229",
	"cup":      "\u222A",
	"int":      "\u222B",
	"there4":   "\u2234",
	"sim":      "\u223C",
	"cong":     "\u2245",
	"asymp":    "\u2248",
	"ne":       "\u2260",
	"equiv":    "\u2261",
	"le":       "\u2264",
	"ge":       "\u2265",
	"sub":      "\u2282",
	"sup":      "\u2283",
	"nsub":     "\u2284",
	"sube":     "\u2286",
	"supe":     "\u2287",
	"oplus":    "\u2295",
	"otimes":   "\u2297",
	"perp":     "\u22A5",
	"sdot":     "\u22C5",
	"lceil":    "\u2308",
	"rceil":    "\u2309",
	"lfloor":   "\u230A",
	"rfloor":   "\u230B",
	"lang":     "\u2329",
	"rang":     "\u232A",
	"loz":      "\u25CA",
	"spades":   "\u2660",
	"clubs":    "\u2663",
	"hearts":   "\u2665",
	"diams":    "\u2666",

	// Markup-significant characters, extra Latin letters, spacing and
	// directional controls, and punctuation.
	"quot":   "\u0022",
	"amp":    "\u0026",
	"lt":     "\u003C",
	"gt":     "\u003E",
	"OElig":  "\u0152",
	"oelig":  "\u0153",
	"Scaron": "\u0160",
	"scaron": "\u0161",
	"Yuml":   "\u0178",
	"circ":   "\u02C6",
	"tilde":  "\u02DC",
	"ensp":   "\u2002",
	"emsp":   "\u2003",
	"thinsp": "\u2009",
	"zwnj":   "\u200C",
	"zwj":    "\u200D",
	"lrm":    "\u200E",
	"rlm":    "\u200F",
	"ndash":  "\u2013",
	"mdash":  "\u2014",
	"lsquo":  "\u2018",
	"rsquo":  "\u2019",
	"sbquo":  "\u201A",
	"ldquo":  "\u201C",
	"rdquo":  "\u201D",
	"bdquo":  "\u201E",
	"dagger": "\u2020",
	"Dagger": "\u2021",
	"permil": "\u2030",
	"lsaquo": "\u2039",
	"rsaquo": "\u203A",
	"euro":   "\u20AC",
}
|
|
|
|
// HTMLAutoClose is the set of HTML elements that
// should be considered to close automatically.
//
// It shares its backing array with the unexported htmlAutoClose.
var HTMLAutoClose = htmlAutoClose

// htmlAutoClose lists, in lower case, the HTML element names extracted by
// the script in the comment below.
var htmlAutoClose = []string{
	/*
		hget http://www.w3.org/TR/html4/loose.dtd |
		9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/ "\1",/p' | tr A-Z a-z
	*/
	"basefont",
	"br",
	"area",
	"link",
	"img",
	"param",
	"hr",
	"input",
	"col",
	"frame",
	"isindex",
	"base",
	"meta",
}
|
|
|
|
// Replacement sequences written by EscapeText and EscapeString in place
// of characters that cannot appear literally in XML text.
//
// Each value must be spelled as a character reference, never as the raw
// character: a raw quote or newline inside these literals does not even
// compile, and a raw '&', '<', '>' or tab would make the escaping
// functions emit their input unchanged.
var (
	esc_quot = []byte("&#34;") // shorter than "&quot;"
	esc_apos = []byte("&#39;") // shorter than "&apos;"
	esc_amp  = []byte("&amp;")
	esc_lt   = []byte("&lt;")
	esc_gt   = []byte("&gt;")
	esc_tab  = []byte("&#x9;")
	esc_nl   = []byte("&#xA;")
	esc_cr   = []byte("&#xD;")
	esc_fffd = []byte("\uFFFD") // Unicode replacement character
)
|
|
|
|
// EscapeText writes to w the properly escaped XML equivalent
|
|
// of the plain text data s.
|
|
func EscapeText(w io.Writer, s []byte) error {
|
|
var esc []byte
|
|
last := 0
|
|
for i := 0; i < len(s); {
|
|
r, width := utf8.DecodeRune(s[i:])
|
|
i += width
|
|
switch r {
|
|
case '"':
|
|
esc = esc_quot
|
|
case '\'':
|
|
esc = esc_apos
|
|
case '&':
|
|
esc = esc_amp
|
|
case '<':
|
|
esc = esc_lt
|
|
case '>':
|
|
esc = esc_gt
|
|
case '\t':
|
|
esc = esc_tab
|
|
case '\n':
|
|
esc = esc_nl
|
|
case '\r':
|
|
esc = esc_cr
|
|
default:
|
|
if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) {
|
|
esc = esc_fffd
|
|
break
|
|
}
|
|
continue
|
|
}
|
|
if _, err := w.Write(s[last : i-width]); err != nil {
|
|
return err
|
|
}
|
|
if _, err := w.Write(esc); err != nil {
|
|
return err
|
|
}
|
|
last = i
|
|
}
|
|
if _, err := w.Write(s[last:]); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// EscapeString writes to p the properly escaped XML equivalent
|
|
// of the plain text data s.
|
|
func (p *printer) EscapeString(s string) {
|
|
var esc []byte
|
|
last := 0
|
|
for i := 0; i < len(s); {
|
|
r, width := utf8.DecodeRuneInString(s[i:])
|
|
i += width
|
|
switch r {
|
|
case '"':
|
|
esc = esc_quot
|
|
case '\'':
|
|
esc = esc_apos
|
|
case '&':
|
|
esc = esc_amp
|
|
case '<':
|
|
esc = esc_lt
|
|
case '>':
|
|
esc = esc_gt
|
|
case '\t':
|
|
esc = esc_tab
|
|
case '\n':
|
|
esc = esc_nl
|
|
case '\r':
|
|
esc = esc_cr
|
|
default:
|
|
if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) {
|
|
esc = esc_fffd
|
|
break
|
|
}
|
|
continue
|
|
}
|
|
p.WriteString(s[last : i-width])
|
|
p.Write(esc)
|
|
last = i
|
|
}
|
|
p.WriteString(s[last:])
|
|
}
|
|
|
|
// Escape is like EscapeText but omits the error return value.
|
|
// It is provided for backwards compatibility with Go 1.0.
|
|
// Code targeting Go 1.1 or later should use EscapeText.
|
|
func Escape(w io.Writer, s []byte) {
|
|
EscapeText(w, s)
|
|
}
|
|
|
|
// procInstEncoding parses the `encoding="..."` or `encoding='...'`
// value out of the provided string, returning "" if not found.
//
// The scan is textual, not a real attribute parser: it looks for the
// literal text "encoding=" followed directly by a single or double quote
// and returns everything up to the next matching quote. An occurrence of
// "encoding=" that is not followed by a quote is skipped and the search
// continues, so `encoding=x encoding='utf-8'` yields "utf-8". A quoted
// value with no closing quote yields "".
func procInstEncoding(s string) string {
	// TODO: this parsing is somewhat lame and not exact.
	// It works for all actual cases, though.
	const key = "encoding="
	for {
		idx := strings.Index(s, key)
		if idx == -1 {
			return ""
		}
		// Drop everything through the key; s shrinks on every pass, so
		// the loop always terminates.
		s = s[idx+len(key):]
		if s == "" {
			return ""
		}
		quote := s[0]
		if quote != '\'' && quote != '"' {
			// Unquoted value: not what we want, look for a later one.
			continue
		}
		end := strings.IndexByte(s[1:], quote)
		if end == -1 {
			return ""
		}
		return s[1 : end+1]
	}
}
|