xml.go 39.8 KB
Newer Older
Russ Cox's avatar
Russ Cox committed
1 2 3 4
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

Russ Cox's avatar
Russ Cox committed
5 6
// Package xml implements a simple XML 1.0 parser that
// understands XML name spaces.
Russ Cox's avatar
Russ Cox committed
7 8
package xml

9 10 11 12
// References:
//    Annotated XML spec: http://www.xml.com/axml/testaxml.htm
//    XML name spaces: http://www.w3.org/TR/REC-xml-names/

Russ Cox's avatar
Russ Cox committed
13 14 15
// TODO(rsc):
//	Test error handling.

Russ Cox's avatar
Russ Cox committed
16
import (
17 18
	"bufio"
	"bytes"
19
	"fmt"
20 21 22 23
	"io"
	"strconv"
	"strings"
	"unicode"
24
	"unicode/utf8"
Russ Cox's avatar
Russ Cox committed
25 26
)

Russ Cox's avatar
Russ Cox committed
27
// A SyntaxError reports a syntax problem found in the XML input stream
// together with the line number recorded when the error was created.
type SyntaxError struct {
	Msg  string // text describing the problem
	Line int    // line number stored by the code that built the error
}

// Error implements the error interface. The result has the form
// "XML syntax error on line <Line>: <Msg>".
func (e *SyntaxError) Error() string {
	out := []byte("XML syntax error on line ")
	out = strconv.AppendInt(out, int64(e.Line), 10)
	out = append(out, ": "...)
	out = append(out, e.Msg...)
	return string(out)
}
Russ Cox's avatar
Russ Cox committed
36 37 38

// A Name represents an XML name (Local) annotated
// with a name space identifier (Space).
// In tokens returned by Decoder.Token, the Space identifier
// is given as a canonical URL, not the short prefix used
// in the document being parsed.
//
// In tokens returned by Decoder.RawToken no translation is applied:
// Space is the text before the first ':' in the name as written
// (empty when there is no ':') and Local is the text after it.
type Name struct {
	Space, Local string
}

Russ Cox's avatar
Russ Cox committed
46
// An Attr represents an attribute in an XML element (Name=Value).
type Attr struct {
	// Name is the attribute name, split into Space and Local parts.
	Name Name
	// Value is the attribute value as a string. When Decoder.Strict is
	// false and the attribute is written without '=', RawToken sets
	// Value to the attribute's local name.
	Value string
}

Russ Cox's avatar
Russ Cox committed
52 53 54
// A Token is an interface holding one of the token types:
// StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
//
// CopyToken returns a copy of a Token; for every type except EndElement
// (which holds no slices) it calls that type's Copy method.
type Token interface{}
Russ Cox's avatar
Russ Cox committed
55

Russ Cox's avatar
Russ Cox committed
56 57
// A StartElement represents an XML start element.
type StartElement struct {
58 59
	Name Name
	Attr []Attr
Russ Cox's avatar
Russ Cox committed
60
}
Russ Cox's avatar
Russ Cox committed
61

Kyle Consalus's avatar
Kyle Consalus committed
62 63
func (e StartElement) Copy() StartElement {
	attrs := make([]Attr, len(e.Attr))
Russ Cox's avatar
Russ Cox committed
64
	copy(attrs, e.Attr)
Kyle Consalus's avatar
Kyle Consalus committed
65 66 67 68
	e.Attr = attrs
	return e
}

Russ Cox's avatar
Russ Cox committed
69
// An EndElement represents an XML end element.
// It carries only the element name and holds no slices.
type EndElement struct {
	Name Name
}
Russ Cox's avatar
Russ Cox committed
73

Russ Cox's avatar
Russ Cox committed
74 75 76 77
// A CharData represents XML character data (raw text),
// in which XML escape sequences have been replaced by
// the characters they represent.
// The Decoder also returns the contents of a <![CDATA[ ... ]]>
// section as CharData.
type CharData []byte
Russ Cox's avatar
Russ Cox committed
78

Rob Pike's avatar
Rob Pike committed
79
// makeCopy returns a newly allocated slice with the same length and
// contents as b. The result is never nil, even when b is nil or empty.
func makeCopy(b []byte) []byte {
	return append(make([]byte, 0, len(b)), b...)
}

85
// Copy returns a CharData backed by freshly allocated memory that
// holds the same bytes as c.
func (c CharData) Copy() CharData {
	dup := makeCopy(c)
	return CharData(dup)
}
Russ Cox's avatar
Russ Cox committed
86

Russ Cox's avatar
Russ Cox committed
87 88 89 90
// A Comment represents an XML comment of the form <!--comment-->.
// The bytes do not include the <!-- and --> comment markers.
type Comment []byte

91
func (c Comment) Copy() Comment { return Comment(makeCopy(c)) }
Russ Cox's avatar
Russ Cox committed
92

Russ Cox's avatar
Russ Cox committed
93 94
// A ProcInst represents an XML processing instruction of the form <?target inst?>
type ProcInst struct {
95 96
	Target string
	Inst   []byte
Russ Cox's avatar
Russ Cox committed
97 98
}

Russ Cox's avatar
Russ Cox committed
99
func (p ProcInst) Copy() ProcInst {
100 101
	p.Inst = makeCopy(p.Inst)
	return p
Russ Cox's avatar
Russ Cox committed
102 103
}

Russ Cox's avatar
Russ Cox committed
104 105 106 107
// A Directive represents an XML directive of the form <!text>.
// The bytes do not include the <! and > markers.
type Directive []byte

108
func (d Directive) Copy() Directive { return Directive(makeCopy(d)) }
Russ Cox's avatar
Russ Cox committed
109

Kyle Consalus's avatar
Kyle Consalus committed
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
// CopyToken returns a copy of a Token. CharData, Comment, Directive,
// ProcInst and StartElement values are duplicated with their Copy
// methods; any other value (including EndElement and nil) is returned
// unchanged.
func CopyToken(t Token) Token {
	if v, ok := t.(CharData); ok {
		return v.Copy()
	}
	if v, ok := t.(Comment); ok {
		return v.Copy()
	}
	if v, ok := t.(Directive); ok {
		return v.Copy()
	}
	if v, ok := t.(ProcInst); ok {
		return v.Copy()
	}
	if v, ok := t.(StartElement); ok {
		return v.Copy()
	}
	return t
}

127
// A Decoder represents an XML parser reading a particular input stream.
// The parser assumes that its input is encoded in UTF-8.
// Use NewDecoder to create a Decoder.
type Decoder struct {
	// Strict is set to true by NewDecoder, enforcing the requirements
	// of the XML specification.
	// If set to false, the parser allows input containing common
	// mistakes:
	//	* If an element is missing an end tag, the parser invents
	//	  end tags as necessary to keep the return values from Token
	//	  properly balanced.
	//	* In attribute values and character data, unknown or malformed
	//	  character entities (sequences beginning with &) are left alone.
	//	* Attributes may be written without a value (the value becomes
	//	  the attribute's local name) or with an unquoted value.
	//
	// Setting:
	//
	//	d.Strict = false;
	//	d.AutoClose = HTMLAutoClose;
	//	d.Entity = HTMLEntity
	//
	// creates a parser that can handle typical HTML.
	Strict bool

	// When Strict == false, AutoClose indicates a set of elements to
	// consider closed immediately after they are opened, regardless
	// of whether an end element is present.
	AutoClose []string

	// Entity can be used to map non-standard entity names to string replacements.
	// The parser behaves as if these standard mappings are present in the map,
	// regardless of the actual map content:
	//
	//	"lt": "<",
	//	"gt": ">",
	//	"amp": "&",
	//	"apos": "'",
	//	"quot": `"`,
	Entity map[string]string

	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// non-UTF-8 charset into UTF-8. If CharsetReader is nil or
	// returns an error, parsing stops with an error. One of
	// the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)

	r         io.ByteReader     // byte source, installed by switchToReader
	buf       bytes.Buffer      // scratch buffer reset and reused while reading tokens
	saved     *bytes.Buffer     // when non-nil, getc appends each byte read from r
	stk       *stack            // open elements and name space undo records
	free      *stack            // popped stack entries kept for reuse by push
	needClose bool              // RawToken must return EndElement{toClose} next
	toClose   Name              // name of the pending EndElement
	nextToken Token             // token held back by Token while it returns an invented end tag
	nextByte  int               // byte pushed back by ungetc, or -1 when there is none
	ns        map[string]string // current name space prefix -> URL translations
	err       error             // first error seen; once set, getc and RawToken stop reading
	line      int               // current line number, maintained by getc and ungetc
}

186 187 188
// NewDecoder creates a new XML parser reading from r.
func NewDecoder(r io.Reader) *Decoder {
	d := &Decoder{
189
		ns:       make(map[string]string),
Russ Cox's avatar
Russ Cox committed
190
		nextByte: -1,
191 192
		line:     1,
		Strict:   true,
193
	}
194 195
	d.switchToReader(r)
	return d
Russ Cox's avatar
Russ Cox committed
196 197 198
}

// Token returns the next XML token in the input stream.
199
// At the end of the input stream, Token returns nil, io.EOF.
Russ Cox's avatar
Russ Cox committed
200 201 202
//
// Slices of bytes in the returned token data refer to the
// parser's internal buffer and remain valid only until the next
Kyle Consalus's avatar
Kyle Consalus committed
203 204
// call to Token.  To acquire a copy of the bytes, call CopyToken
// or the token's Copy method.
Russ Cox's avatar
Russ Cox committed
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
//
// Token expands self-closing elements such as <br/>
// into separate start and end elements returned by successive calls.
//
// Token guarantees that the StartElement and EndElement
// tokens it returns are properly nested and matched:
// if Token encounters an unexpected end element,
// it will return an error.
//
// Token implements XML name spaces as described by
// http://www.w3.org/TR/REC-xml-names/.  Each of the
// Name structures contained in the Token has the Space
// set to the URL identifying its name space when known.
// If Token encounters an unrecognized name space prefix,
// it uses the prefix as the Space rather than report an error.
220 221 222 223 224
func (d *Decoder) Token() (t Token, err error) {
	if d.nextToken != nil {
		t = d.nextToken
		d.nextToken = nil
	} else if t, err = d.RawToken(); err != nil {
225
		return
Russ Cox's avatar
Russ Cox committed
226
	}
227

228 229 230
	if !d.Strict {
		if t1, ok := d.autoClose(t); ok {
			d.nextToken = t
231
			t = t1
232 233
		}
	}
Russ Cox's avatar
Russ Cox committed
234 235 236 237 238 239 240 241
	switch t1 := t.(type) {
	case StartElement:
		// In XML name spaces, the translations listed in the
		// attributes apply to the element name and
		// to the other attribute names, so process
		// the translations first.
		for _, a := range t1.Attr {
			if a.Name.Space == "xmlns" {
242 243 244
				v, ok := d.ns[a.Name.Local]
				d.pushNs(a.Name.Local, v, ok)
				d.ns[a.Name.Local] = a.Value
Russ Cox's avatar
Russ Cox committed
245 246 247
			}
			if a.Name.Space == "" && a.Name.Local == "xmlns" {
				// Default space for untagged names
248 249 250
				v, ok := d.ns[""]
				d.pushNs("", v, ok)
				d.ns[""] = a.Value
Russ Cox's avatar
Russ Cox committed
251 252 253
			}
		}

254
		d.translate(&t1.Name, true)
Russ Cox's avatar
Russ Cox committed
255
		for i := range t1.Attr {
256
			d.translate(&t1.Attr[i].Name, false)
Russ Cox's avatar
Russ Cox committed
257
		}
258
		d.pushElement(t1.Name)
259
		t = t1
Russ Cox's avatar
Russ Cox committed
260 261

	case EndElement:
262 263 264
		d.translate(&t1.Name, true)
		if !d.popElement(&t1) {
			return nil, d.err
Russ Cox's avatar
Russ Cox committed
265
		}
266
		t = t1
Russ Cox's avatar
Russ Cox committed
267
	}
268
	return
Russ Cox's avatar
Russ Cox committed
269 270 271 272 273
}

// Apply name space translation to name n.
// The default name space (for Space=="")
// applies only to element names, not to attribute names.
274
func (d *Decoder) translate(n *Name, isElementName bool) {
Russ Cox's avatar
Russ Cox committed
275 276
	switch {
	case n.Space == "xmlns":
277
		return
Russ Cox's avatar
Russ Cox committed
278
	case n.Space == "" && !isElementName:
279
		return
Russ Cox's avatar
Russ Cox committed
280
	case n.Space == "" && n.Local == "xmlns":
281
		return
Russ Cox's avatar
Russ Cox committed
282
	}
283
	if v, ok := d.ns[n.Space]; ok {
284
		n.Space = v
Russ Cox's avatar
Russ Cox committed
285 286 287
	}
}

288
func (d *Decoder) switchToReader(r io.Reader) {
289 290 291 292 293
	// Get efficient byte at a time reader.
	// Assume that if reader has its own
	// ReadByte, it's efficient enough.
	// Otherwise, use bufio.
	if rb, ok := r.(io.ByteReader); ok {
294
		d.r = rb
295
	} else {
296
		d.r = bufio.NewReader(r)
297 298 299
	}
}

Russ Cox's avatar
Russ Cox committed
300 301 302 303 304
// Parsing state - stack holds old name space translations
// and the current set of open elements.  The translations to pop when
// ending a given tag are *below* it on the stack, which is
// more work but forced on us by XML.
//
// Entries are linked through next; popped entries are kept on the
// Decoder's free list and reused by push.
type stack struct {
	next *stack // entry below this one (or the next entry on the free list)
	kind int    // stkStart or stkNs
	name Name   // stkStart: element name; stkNs: Local is the prefix, Space the previous URL
	ok   bool   // stkNs only: whether the prefix had a translation before the change
}
Russ Cox's avatar
Russ Cox committed
310 311

// Values for stack.kind.
const (
	stkStart = iota // entry records an open element (see pushElement)
	stkNs           // entry records a replaced name space translation (see pushNs)
)

316 317
func (d *Decoder) push(kind int) *stack {
	s := d.free
Russ Cox's avatar
Russ Cox committed
318
	if s != nil {
319
		d.free = s.next
Russ Cox's avatar
Russ Cox committed
320
	} else {
321
		s = new(stack)
Russ Cox's avatar
Russ Cox committed
322
	}
323
	s.next = d.stk
324
	s.kind = kind
325
	d.stk = s
326
	return s
Russ Cox's avatar
Russ Cox committed
327 328
}

329 330
func (d *Decoder) pop() *stack {
	s := d.stk
Russ Cox's avatar
Russ Cox committed
331
	if s != nil {
332 333 334
		d.stk = s.next
		s.next = d.free
		d.free = s
Russ Cox's avatar
Russ Cox committed
335
	}
336
	return s
Russ Cox's avatar
Russ Cox committed
337
}
Russ Cox's avatar
Russ Cox committed
338

Russ Cox's avatar
Russ Cox committed
339
// Record that we are starting an element with the given name.
340 341
func (d *Decoder) pushElement(name Name) {
	s := d.push(stkStart)
342
	s.name = name
Russ Cox's avatar
Russ Cox committed
343 344
}

Russ Cox's avatar
Russ Cox committed
345 346
// Record that we are changing the value of ns[local].
// The old value is url, ok.
347 348
func (d *Decoder) pushNs(local string, url string, ok bool) {
	s := d.push(stkNs)
349 350 351
	s.name.Local = local
	s.name.Space = url
	s.ok = ok
Russ Cox's avatar
Russ Cox committed
352 353
}

354
// Creates a SyntaxError with the current line number.
355 356
func (d *Decoder) syntaxError(msg string) error {
	return &SyntaxError{Msg: msg, Line: d.line}
357 358
}

Russ Cox's avatar
Russ Cox committed
359 360 361 362 363 364
// Record that we are ending an element with the given name.
// The name must match the record at the top of the stack,
// which must be a pushElement record.
// After popping the element, apply any undo records from
// the stack to restore the name translations that existed
// before we saw this element.
365 366
func (d *Decoder) popElement(t *EndElement) bool {
	s := d.pop()
367
	name := t.Name
Russ Cox's avatar
Russ Cox committed
368 369
	switch {
	case s == nil || s.kind != stkStart:
370
		d.err = d.syntaxError("unexpected end element </" + name.Local + ">")
371
		return false
Russ Cox's avatar
Russ Cox committed
372
	case s.name.Local != name.Local:
373 374 375
		if !d.Strict {
			d.needClose = true
			d.toClose = t.Name
376 377
			t.Name = s.name
			return true
378
		}
379
		d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">")
380
		return false
Russ Cox's avatar
Russ Cox committed
381
	case s.name.Space != name.Space:
382
		d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
Russ Cox's avatar
Russ Cox committed
383
			"closed by </" + name.Local + "> in space " + name.Space)
384
		return false
Russ Cox's avatar
Russ Cox committed
385 386 387 388
	}

	// Pop stack until a Start is on the top, undoing the
	// translations that were associated with the element we just closed.
389 390
	for d.stk != nil && d.stk.kind != stkStart {
		s := d.pop()
Russ Cox's avatar
Russ Cox committed
391
		if s.ok {
392
			d.ns[s.name.Local] = s.name.Space
Russ Cox's avatar
Russ Cox committed
393
		} else {
394
			delete(d.ns, s.name.Local)
Russ Cox's avatar
Russ Cox committed
395
		}
Russ Cox's avatar
Russ Cox committed
396 397
	}

398
	return true
Russ Cox's avatar
Russ Cox committed
399 400
}

401 402
// If the top element on the stack is autoclosing and
// t is not the end tag, invent the end tag.
403 404
func (d *Decoder) autoClose(t Token) (Token, bool) {
	if d.stk == nil || d.stk.kind != stkStart {
405
		return nil, false
406
	}
407 408
	name := strings.ToLower(d.stk.name.Local)
	for _, s := range d.AutoClose {
409 410
		if strings.ToLower(s) == name {
			// This one should be auto closed if t doesn't close it.
411
			et, ok := t.(EndElement)
412
			if !ok || et.Name.Local != name {
413
				return EndElement{d.stk.name}, true
414
			}
415
			break
416 417
		}
	}
418
	return nil, false
419 420
}

Russ Cox's avatar
Russ Cox committed
421 422 423
// RawToken is like Token but does not verify that
// start and end elements match and does not translate
// name space prefixes to their corresponding URLs.
//
// Once an error has been recorded in d.err, every later call returns
// nil and that same error. Byte slices in the returned CharData,
// Comment, ProcInst and Directive tokens come from d.buf, which is
// reset while reading later tokens.
func (d *Decoder) RawToken() (Token, error) {
	if d.err != nil {
		return nil, d.err
	}
	if d.needClose {
		// The last element we read was self-closing and
		// we returned just the StartElement half.
		// Return the EndElement half now.
		// (popElement also sets needClose, in non-strict mode, to
		// replay an end element whose name did not match.)
		d.needClose = false
		return EndElement{d.toClose}, nil
	}

	b, ok := d.getc()
	if !ok {
		return nil, d.err
	}

	if b != '<' {
		// Text section.
		d.ungetc(b)
		data := d.text(-1, false)
		if data == nil {
			return nil, d.err
		}
		return CharData(data), nil
	}

	if b, ok = d.mustgetc(); !ok {
		return nil, d.err
	}
	switch b {
	case '/':
		// </: End element
		var name Name
		if name, ok = d.nsname(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected element name after </")
			}
			return nil, d.err
		}
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b != '>' {
			d.err = d.syntaxError("invalid characters between </" + name.Local + " and >")
			return nil, d.err
		}
		return EndElement{name}, nil

	case '?':
		// <?: Processing instruction.
		// TODO(rsc): Should parse the <?xml declaration to make sure
		// the version is 1.0 and the encoding is UTF-8.
		var target string
		if target, ok = d.name(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected target name after <?")
			}
			return nil, d.err
		}
		d.space()
		d.buf.Reset()
		// Accumulate bytes until the two-byte terminator ?> is seen;
		// b0 remembers the previous byte.
		var b0 byte
		for {
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			d.buf.WriteByte(b)
			if b0 == '?' && b == '>' {
				break
			}
			b0 = b
		}
		data := d.buf.Bytes()
		data = data[0 : len(data)-2] // chop ?>

		if target == "xml" {
			// An <?xml ...?> declaration naming an encoding other than
			// UTF-8 switches the input to a reader from CharsetReader.
			enc := procInstEncoding(string(data))
			if enc != "" && enc != "utf-8" && enc != "UTF-8" {
				if d.CharsetReader == nil {
					d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc)
					return nil, d.err
				}
				// d.r is installed by switchToReader, which stores either
				// the original io.Reader or a bufio.Reader wrapping it.
				newr, err := d.CharsetReader(enc, d.r.(io.Reader))
				if err != nil {
					d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
					return nil, d.err
				}
				if newr == nil {
					panic("CharsetReader returned a nil Reader for charset " + enc)
				}
				d.switchToReader(newr)
			}
		}
		return ProcInst{target, data}, nil

	case '!':
		// <!: Maybe comment, maybe CDATA.
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		switch b {
		case '-': // <!-
			// Probably <!-- for a comment.
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			if b != '-' {
				d.err = d.syntaxError("invalid sequence <!- not part of <!--")
				return nil, d.err
			}
			// Look for terminator.
			d.buf.Reset()
			var b0, b1 byte
			for {
				if b, ok = d.mustgetc(); !ok {
					return nil, d.err
				}
				d.buf.WriteByte(b)
				if b0 == '-' && b1 == '-' && b == '>' {
					break
				}
				b0, b1 = b1, b
			}
			data := d.buf.Bytes()
			data = data[0 : len(data)-3] // chop -->
			return Comment(data), nil

		case '[': // <![
			// Probably <![CDATA[.
			for i := 0; i < 6; i++ {
				if b, ok = d.mustgetc(); !ok {
					return nil, d.err
				}
				if b != "CDATA["[i] {
					d.err = d.syntaxError("invalid <![ sequence")
					return nil, d.err
				}
			}
			// Have <![CDATA[.  Read text until ]]>.
			data := d.text(-1, true)
			if data == nil {
				return nil, d.err
			}
			return CharData(data), nil
		}

		// Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
		// We don't care, but accumulate for caller. Quoted angle
		// brackets do not count for nesting.
		d.buf.Reset()
		d.buf.WriteByte(b)
		inquote := uint8(0) // the open quote character, or 0 when outside quotes
		depth := 0          // nesting level of unquoted < ... > inside the directive
		for {
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			if inquote == 0 && b == '>' && depth == 0 {
				break
			}
		HandleB:
			// Also reached by goto from the "<" case below, with b holding
			// the byte that showed the "<" was not the start of a comment.
			d.buf.WriteByte(b)
			switch {
			case b == inquote:
				inquote = 0

			case inquote != 0:
				// in quotes, no special action

			case b == '\'' || b == '"':
				inquote = b

			case b == '>' && inquote == 0:
				depth--

			case b == '<' && inquote == 0:
				// Look for <!-- to begin comment.
				s := "!--"
				for i := 0; i < len(s); i++ {
					if b, ok = d.mustgetc(); !ok {
						return nil, d.err
					}
					if b != s[i] {
						// Not a comment: keep the part of "!--" that did
						// match, count the "<" as nesting, and process b.
						for j := 0; j < i; j++ {
							d.buf.WriteByte(s[j])
						}
						depth++
						goto HandleB
					}
				}

				// Remove < that was written above.
				d.buf.Truncate(d.buf.Len() - 1)

				// Look for terminator.
				// The comment's bytes are not written to d.buf.
				var b0, b1 byte
				for {
					if b, ok = d.mustgetc(); !ok {
						return nil, d.err
					}
					if b0 == '-' && b1 == '-' && b == '>' {
						break
					}
					b0, b1 = b1, b
				}
			}
		}
		return Directive(d.buf.Bytes()), nil
	}

	// Must be an open element like <a href="foo">
	d.ungetc(b)

	var (
		name  Name
		empty bool
		attr  []Attr
	)
	if name, ok = d.nsname(); !ok {
		if d.err == nil {
			d.err = d.syntaxError("expected element name after <")
		}
		return nil, d.err
	}

	attr = make([]Attr, 0, 4)
	for {
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b == '/' {
			empty = true
			if b, ok = d.mustgetc(); !ok {
				return nil, d.err
			}
			if b != '>' {
				d.err = d.syntaxError("expected /> in element")
				return nil, d.err
			}
			break
		}
		if b == '>' {
			break
		}
		d.ungetc(b)

		// Extend attr by one element, doubling its capacity when full.
		n := len(attr)
		if n >= cap(attr) {
			nattr := make([]Attr, n, 2*cap(attr))
			copy(nattr, attr)
			attr = nattr
		}
		attr = attr[0 : n+1]
		a := &attr[n]
		if a.Name, ok = d.nsname(); !ok {
			if d.err == nil {
				d.err = d.syntaxError("expected attribute name in element")
			}
			return nil, d.err
		}
		d.space()
		if b, ok = d.mustgetc(); !ok {
			return nil, d.err
		}
		if b != '=' {
			if d.Strict {
				d.err = d.syntaxError("attribute name without = in element")
				return nil, d.err
			} else {
				// Non-strict: an attribute without a value gets its own
				// local name as the value.
				d.ungetc(b)
				a.Value = a.Name.Local
			}
		} else {
			d.space()
			data := d.attrval()
			if data == nil {
				return nil, d.err
			}
			a.Value = string(data)
		}
	}
	if empty {
		// Self-closing element: the next call returns the EndElement.
		d.needClose = true
		d.toClose = name
	}
	return StartElement{name, attr}, nil
}

715 716
func (d *Decoder) attrval() []byte {
	b, ok := d.mustgetc()
717 718 719 720 721
	if !ok {
		return nil
	}
	// Handle quoted attribute values
	if b == '"' || b == '\'' {
722
		return d.text(int(b), false)
723 724
	}
	// Handle unquoted attribute values for strict parsers
725 726
	if d.Strict {
		d.err = d.syntaxError("unquoted or missing attribute value in element")
727 728 729
		return nil
	}
	// Handle unquoted attribute values for unstrict parsers
730 731
	d.ungetc(b)
	d.buf.Reset()
732
	for {
733
		b, ok = d.mustgetc()
734 735 736 737 738 739
		if !ok {
			return nil
		}
		// http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.2
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' ||
			'0' <= b && b <= '9' || b == '_' || b == ':' || b == '-' {
740
			d.buf.WriteByte(b)
741
		} else {
742
			d.ungetc(b)
743 744 745
			break
		}
	}
746
	return d.buf.Bytes()
747 748
}

Russ Cox's avatar
Russ Cox committed
749
// Skip spaces if any
750
func (d *Decoder) space() {
Russ Cox's avatar
Russ Cox committed
751
	for {
752
		b, ok := d.getc()
Russ Cox's avatar
Russ Cox committed
753
		if !ok {
754
			return
Russ Cox's avatar
Russ Cox committed
755 756 757 758
		}
		switch b {
		case ' ', '\r', '\n', '\t':
		default:
759
			d.ungetc(b)
760
			return
Russ Cox's avatar
Russ Cox committed
761 762
		}
	}
Russ Cox's avatar
Russ Cox committed
763 764
}

Russ Cox's avatar
Russ Cox committed
765 766
// Read a single byte.
// If there is no byte to read, return ok==false
767
// and leave the error in d.err.
Russ Cox's avatar
Russ Cox committed
768
// Maintain line number.
769 770
func (d *Decoder) getc() (b byte, ok bool) {
	if d.err != nil {
771
		return 0, false
Russ Cox's avatar
Russ Cox committed
772
	}
773 774 775
	if d.nextByte >= 0 {
		b = byte(d.nextByte)
		d.nextByte = -1
Russ Cox's avatar
Russ Cox committed
776
	} else {
777 778
		b, d.err = d.r.ReadByte()
		if d.err != nil {
779
			return 0, false
Russ Cox's avatar
Russ Cox committed
780
		}
781 782
		if d.saved != nil {
			d.saved.WriteByte(b)
783
		}
Russ Cox's avatar
Russ Cox committed
784 785
	}
	if b == '\n' {
786
		d.line++
Russ Cox's avatar
Russ Cox committed
787
	}
788
	return b, true
Russ Cox's avatar
Russ Cox committed
789 790
}

791 792
// Return saved offset.
// If we did ungetc (nextByte >= 0), have to back up one.
793 794 795
func (d *Decoder) savedOffset() int {
	n := d.saved.Len()
	if d.nextByte >= 0 {
796 797 798 799 800
		n--
	}
	return n
}

801 802
// Must read a single byte.
// If there is no byte to read,
803
// set d.err to SyntaxError("unexpected EOF")
804
// and return ok==false
805 806 807 808
func (d *Decoder) mustgetc() (b byte, ok bool) {
	if b, ok = d.getc(); !ok {
		if d.err == io.EOF {
			d.err = d.syntaxError("unexpected EOF")
809 810
		}
	}
811
	return
812 813
}

Russ Cox's avatar
Russ Cox committed
814
// Unread a single byte.
815
func (d *Decoder) ungetc(b byte) {
Russ Cox's avatar
Russ Cox committed
816
	if b == '\n' {
817
		d.line--
Russ Cox's avatar
Russ Cox committed
818
	}
819
	d.nextByte = int(b)
Russ Cox's avatar
Russ Cox committed
820
}
Russ Cox's avatar
Russ Cox committed
821

Russ Cox's avatar
Russ Cox committed
822
// entity maps the five predefined XML entity names to the code points
// they stand for. Decoder.text looks a name up here first and falls
// back to Decoder.Entity only when the name is not found.
var entity = map[string]int{
	"lt":   '<',
	"gt":   '>',
	"amp":  '&',
	"apos": '\'',
	"quot": '"',
}
Russ Cox's avatar
Russ Cox committed
829

Russ Cox's avatar
Russ Cox committed
830 831 832
// Read plain text section (XML calls it character data).
// If quote >= 0, we are in a quoted string and need to find the matching quote.
// If cdata == true, we are in a <![CDATA[ section and need to find ]]>.
833 834
// On failure return nil and leave the error in d.err.
func (d *Decoder) text(quote int, cdata bool) []byte {
835 836
	var b0, b1 byte
	var trunc int
837
	d.buf.Reset()
Russ Cox's avatar
Russ Cox committed
838 839
Input:
	for {
840
		b, ok := d.getc()
Russ Cox's avatar
Russ Cox committed
841
		if !ok {
842
			if cdata {
843 844
				if d.err == io.EOF {
					d.err = d.syntaxError("unexpected EOF in CDATA section")
845 846 847 848
				}
				return nil
			}
			break Input
Russ Cox's avatar
Russ Cox committed
849 850 851 852 853 854
		}

		// <![CDATA[ section ends with ]]>.
		// It is an error for ]]> to appear in ordinary text.
		if b0 == ']' && b1 == ']' && b == '>' {
			if cdata {
855 856
				trunc = 2
				break Input
Russ Cox's avatar
Russ Cox committed
857
			}
858
			d.err = d.syntaxError("unescaped ]]> not in CDATA section")
859
			return nil
Russ Cox's avatar
Russ Cox committed
860 861 862 863 864
		}

		// Stop reading text if we see a <.
		if b == '<' && !cdata {
			if quote >= 0 {
865
				d.err = d.syntaxError("unescaped < inside quoted string")
866
				return nil
Russ Cox's avatar
Russ Cox committed
867
			}
868
			d.ungetc('<')
869
			break Input
Russ Cox's avatar
Russ Cox committed
870 871
		}
		if quote >= 0 && b == byte(quote) {
872
			break Input
Russ Cox's avatar
Russ Cox committed
873
		}
874
		if b == '&' && !cdata {
Russ Cox's avatar
Russ Cox committed
875 876 877 878
			// Read escaped character expression up to semicolon.
			// XML in all its glory allows a document to define and use
			// its own character names with <!ENTITY ...> directives.
			// Parsers are required to recognize lt, gt, amp, apos, and quot
879 880 881 882 883 884 885 886 887 888 889 890
			// even if they have not been declared.
			before := d.buf.Len()
			d.buf.WriteByte('&')
			var ok bool
			var text string
			var haveText bool
			if b, ok = d.mustgetc(); !ok {
				return nil
			}
			if b == '#' {
				d.buf.WriteByte(b)
				if b, ok = d.mustgetc(); !ok {
891
					return nil
Russ Cox's avatar
Russ Cox committed
892
				}
893 894 895 896 897 898
				base := 10
				if b == 'x' {
					base = 16
					d.buf.WriteByte(b)
					if b, ok = d.mustgetc(); !ok {
						return nil
899
					}
900
				}
901 902 903 904 905 906 907 908
				start := d.buf.Len()
				for '0' <= b && b <= '9' ||
					base == 16 && 'a' <= b && b <= 'f' ||
					base == 16 && 'A' <= b && b <= 'F' {
					d.buf.WriteByte(b)
					if b, ok = d.mustgetc(); !ok {
						return nil
					}
909
				}
910 911
				if b != ';' {
					d.ungetc(b)
912
				} else {
913 914 915 916 917 918 919
					s := string(d.buf.Bytes()[start:])
					d.buf.WriteByte(';')
					n, err := strconv.ParseUint(s, base, 64)
					if err == nil && n <= unicode.MaxRune {
						text = string(n)
						haveText = true
					}
Russ Cox's avatar
Russ Cox committed
920 921
				}
			} else {
922 923 924 925 926 927
				d.ungetc(b)
				if !d.readName() {
					if d.err != nil {
						return nil
					}
					ok = false
Russ Cox's avatar
Russ Cox committed
928
				}
929 930 931 932 933 934 935
				if b, ok = d.mustgetc(); !ok {
					return nil
				}
				if b != ';' {
					d.ungetc(b)
				} else {
					name := d.buf.Bytes()[before+1:]
936
					d.buf.WriteByte(';')
937 938 939 940 941 942 943 944 945
					if isName(name) {
						s := string(name)
						if r, ok := entity[s]; ok {
							text = string(r)
							haveText = true
						} else if d.Entity != nil {
							text, haveText = d.Entity[s]
						}
					}
946
				}
Russ Cox's avatar
Russ Cox committed
947
			}
948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964

			if haveText {
				d.buf.Truncate(before)
				d.buf.Write([]byte(text))
				b0, b1 = 0, 0
				continue Input
			}
			if !d.Strict {
				b0, b1 = 0, 0
				continue Input
			}
			ent := string(d.buf.Bytes()[before])
			if ent[len(ent)-1] != ';' {
				ent += " (no semicolon)"
			}
			d.err = d.syntaxError("invalid character entity " + ent)
			return nil
Russ Cox's avatar
Russ Cox committed
965
		}
966 967 968 969 970 971 972 973 974 975

		// We must rewrite unescaped \r and \r\n into \n.
		if b == '\r' {
			d.buf.WriteByte('\n')
		} else if b1 == '\r' && b == '\n' {
			// Skip \r\n--we already wrote \n.
		} else {
			d.buf.WriteByte(b)
		}

976
		b0, b1 = b1, b
Russ Cox's avatar
Russ Cox committed
977
	}
978
	data := d.buf.Bytes()
979
	data = data[0 : len(data)-trunc]
Russ Cox's avatar
Russ Cox committed
980

981 982 983 984 985
	// Inspect each rune for being a disallowed character.
	buf := data
	for len(buf) > 0 {
		r, size := utf8.DecodeRune(buf)
		if r == utf8.RuneError && size == 1 {
986
			d.err = d.syntaxError("invalid UTF-8")
987 988 989 990
			return nil
		}
		buf = buf[size:]
		if !isInCharacterRange(r) {
991
			d.err = d.syntaxError(fmt.Sprintf("illegal character code %U", r))
992 993 994 995
			return nil
		}
	}

996
	return data
Russ Cox's avatar
Russ Cox committed
997 998
}

999 1000 1001
// isInCharacterRange reports whether r is in the XML Character Range, per
// the Char production of http://www.xml.com/axml/testaxml.htm,
// Section 2.2 Characters:
//
//	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
//
// The surrogate block U+D800-U+DFFF and the code points U+FFFE and U+FFFF
// fall outside every range of the production and are therefore rejected.
func isInCharacterRange(r rune) (inrange bool) {
	return r == 0x09 ||
		r == 0x0A ||
		r == 0x0D ||
		r >= 0x20 && r <= 0xD7FF || // stops just below the first surrogate, U+D800
		r >= 0xE000 && r <= 0xFFFD ||
		r >= 0x10000 && r <= 0x10FFFF
}

Russ Cox's avatar
Russ Cox committed
1011 1012
// Get name space name: name with a : stuck in the middle.
// The part before the : is the name space identifier.
1013 1014
func (d *Decoder) nsname() (name Name, ok bool) {
	s, ok := d.name()
Russ Cox's avatar
Russ Cox committed
1015
	if !ok {
1016
		return
Russ Cox's avatar
Russ Cox committed
1017
	}
1018
	i := strings.Index(s, ":")
Russ Cox's avatar
Russ Cox committed
1019
	if i < 0 {
1020
		name.Local = s
Russ Cox's avatar
Russ Cox committed
1021
	} else {
1022 1023
		name.Space = s[0:i]
		name.Local = s[i+1:]
Russ Cox's avatar
Russ Cox committed
1024
	}
1025
	return name, true
Russ Cox's avatar
Russ Cox committed
1026 1027 1028
}

// Get name: /first(first|second)*/
1029
// Do not set d.err if the name is missing (unless unexpected EOF is received):
1030
// let the caller provide better context.
1031
func (d *Decoder) name() (s string, ok bool) {
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049
	d.buf.Reset()
	if !d.readName() {
		return "", false
	}

	// Now we check the characters.
	s = d.buf.String()
	if !isName([]byte(s)) {
		d.err = d.syntaxError("invalid XML name: " + s)
		return "", false
	}
	return s, true
}

// Read a name and append its bytes to d.buf.
// The name is delimited by any single-byte character not valid in names.
// All multi-byte characters are accepted; the caller must check their validity.
func (d *Decoder) readName() (ok bool) {
1050
	var b byte
1051
	if b, ok = d.mustgetc(); !ok {
1052
		return
Russ Cox's avatar
Russ Cox committed
1053
	}
1054
	if b < utf8.RuneSelf && !isNameByte(b) {
1055
		d.ungetc(b)
1056
		return false
Russ Cox's avatar
Russ Cox committed
1057
	}
1058
	d.buf.WriteByte(b)
1059

Russ Cox's avatar
Russ Cox committed
1060
	for {
1061
		if b, ok = d.mustgetc(); !ok {
1062
			return
Russ Cox's avatar
Russ Cox committed
1063
		}
1064
		if b < utf8.RuneSelf && !isNameByte(b) {
1065
			d.ungetc(b)
1066
			break
Russ Cox's avatar
Russ Cox committed
1067
		}
1068
		d.buf.WriteByte(b)
Russ Cox's avatar
Russ Cox committed
1069
	}
1070
	return true
Russ Cox's avatar
Russ Cox committed
1071 1072
}

1073
// isNameByte reports whether the single byte c is an ASCII letter, an
// ASCII digit, or one of '_', ':', '.', '-'.
func isNameByte(c byte) bool {
	return 'A' <= c && c <= 'Z' ||
		'a' <= c && c <= 'z' ||
		'0' <= c && c <= '9' ||
		c == '_' || c == ':' || c == '.' || c == '-'
}

1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103
// isName reports whether s is a non-empty, validly encoded UTF-8 byte
// sequence whose first rune is in the first table and whose remaining
// runes are each in either the first or the second table.
func isName(s []byte) bool {
	if len(s) == 0 {
		return false
	}
	for pos := 0; pos < len(s); {
		r, width := utf8.DecodeRune(s[pos:])
		if r == utf8.RuneError && width == 1 {
			// Malformed encoding.
			return false
		}
		if !unicode.Is(first, r) {
			// Only runes after the initial one may fall back to second.
			if pos == 0 || !unicode.Is(second, r) {
				return false
			}
		}
		pos += width
	}
	return true
}

1104 1105 1106 1107
// These tables were generated by cut and paste from Appendix B of
// the XML spec at http://www.xml.com/axml/testaxml.htm
// and then reformatting.  First corresponds to (Letter | '_' | ':')
// and second corresponds to NameChar.

// first is the set of runes accepted as the initial rune of an XML name
// (Letter | '_' | ':'). isName also accepts these runes in later positions.
// Each entry is {Lo, Hi, Stride}; entries are listed in increasing order.
var first = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x003A, 0x003A, 1},
		{0x0041, 0x005A, 1},
		{0x005F, 0x005F, 1},
		{0x0061, 0x007A, 1},
		{0x00C0, 0x00D6, 1},
		{0x00D8, 0x00F6, 1},
		{0x00F8, 0x00FF, 1},
		{0x0100, 0x0131, 1},
		{0x0134, 0x013E, 1},
		{0x0141, 0x0148, 1},
		{0x014A, 0x017E, 1},
		{0x0180, 0x01C3, 1},
		{0x01CD, 0x01F0, 1},
		{0x01F4, 0x01F5, 1},
		{0x01FA, 0x0217, 1},
		{0x0250, 0x02A8, 1},
		{0x02BB, 0x02C1, 1},
		{0x0386, 0x0386, 1},
		{0x0388, 0x038A, 1},
		{0x038C, 0x038C, 1},
		{0x038E, 0x03A1, 1},
		{0x03A3, 0x03CE, 1},
		{0x03D0, 0x03D6, 1},
		{0x03DA, 0x03E0, 2},
		{0x03E2, 0x03F3, 1},
		{0x0401, 0x040C, 1},
		{0x040E, 0x044F, 1},
		{0x0451, 0x045C, 1},
		{0x045E, 0x0481, 1},
		{0x0490, 0x04C4, 1},
		{0x04C7, 0x04C8, 1},
		{0x04CB, 0x04CC, 1},
		{0x04D0, 0x04EB, 1},
		{0x04EE, 0x04F5, 1},
		{0x04F8, 0x04F9, 1},
		{0x0531, 0x0556, 1},
		{0x0559, 0x0559, 1},
		{0x0561, 0x0586, 1},
		{0x05D0, 0x05EA, 1},
		{0x05F0, 0x05F2, 1},
		{0x0621, 0x063A, 1},
		{0x0641, 0x064A, 1},
		{0x0671, 0x06B7, 1},
		{0x06BA, 0x06BE, 1},
		{0x06C0, 0x06CE, 1},
		{0x06D0, 0x06D3, 1},
		{0x06D5, 0x06D5, 1},
		{0x06E5, 0x06E6, 1},
		{0x0905, 0x0939, 1},
		{0x093D, 0x093D, 1},
		{0x0958, 0x0961, 1},
		{0x0985, 0x098C, 1},
		{0x098F, 0x0990, 1},
		{0x0993, 0x09A8, 1},
		{0x09AA, 0x09B0, 1},
		{0x09B2, 0x09B2, 1},
		{0x09B6, 0x09B9, 1},
		{0x09DC, 0x09DD, 1},
		{0x09DF, 0x09E1, 1},
		{0x09F0, 0x09F1, 1},
		{0x0A05, 0x0A0A, 1},
		{0x0A0F, 0x0A10, 1},
		{0x0A13, 0x0A28, 1},
		{0x0A2A, 0x0A30, 1},
		{0x0A32, 0x0A33, 1},
		{0x0A35, 0x0A36, 1},
		{0x0A38, 0x0A39, 1},
		{0x0A59, 0x0A5C, 1},
		{0x0A5E, 0x0A5E, 1},
		{0x0A72, 0x0A74, 1},
		{0x0A85, 0x0A8B, 1},
		{0x0A8D, 0x0A8D, 1},
		{0x0A8F, 0x0A91, 1},
		{0x0A93, 0x0AA8, 1},
		{0x0AAA, 0x0AB0, 1},
		{0x0AB2, 0x0AB3, 1},
		{0x0AB5, 0x0AB9, 1},
		{0x0ABD, 0x0AE0, 0x23},
		{0x0B05, 0x0B0C, 1},
		{0x0B0F, 0x0B10, 1},
		{0x0B13, 0x0B28, 1},
		{0x0B2A, 0x0B30, 1},
		{0x0B32, 0x0B33, 1},
		{0x0B36, 0x0B39, 1},
		{0x0B3D, 0x0B3D, 1},
		{0x0B5C, 0x0B5D, 1},
		{0x0B5F, 0x0B61, 1},
		{0x0B85, 0x0B8A, 1},
		{0x0B8E, 0x0B90, 1},
		{0x0B92, 0x0B95, 1},
		{0x0B99, 0x0B9A, 1},
		{0x0B9C, 0x0B9C, 1},
		{0x0B9E, 0x0B9F, 1},
		{0x0BA3, 0x0BA4, 1},
		{0x0BA8, 0x0BAA, 1},
		{0x0BAE, 0x0BB5, 1},
		{0x0BB7, 0x0BB9, 1},
		{0x0C05, 0x0C0C, 1},
		{0x0C0E, 0x0C10, 1},
		{0x0C12, 0x0C28, 1},
		{0x0C2A, 0x0C33, 1},
		{0x0C35, 0x0C39, 1},
		{0x0C60, 0x0C61, 1},
		{0x0C85, 0x0C8C, 1},
		{0x0C8E, 0x0C90, 1},
		{0x0C92, 0x0CA8, 1},
		{0x0CAA, 0x0CB3, 1},
		{0x0CB5, 0x0CB9, 1},
		{0x0CDE, 0x0CDE, 1},
		{0x0CE0, 0x0CE1, 1},
		{0x0D05, 0x0D0C, 1},
		{0x0D0E, 0x0D10, 1},
		{0x0D12, 0x0D28, 1},
		{0x0D2A, 0x0D39, 1},
		{0x0D60, 0x0D61, 1},
		{0x0E01, 0x0E2E, 1},
		{0x0E30, 0x0E30, 1},
		{0x0E32, 0x0E33, 1},
		{0x0E40, 0x0E45, 1},
		{0x0E81, 0x0E82, 1},
		{0x0E84, 0x0E84, 1},
		{0x0E87, 0x0E88, 1},
		{0x0E8A, 0x0E8D, 3},
		{0x0E94, 0x0E97, 1},
		{0x0E99, 0x0E9F, 1},
		{0x0EA1, 0x0EA3, 1},
		{0x0EA5, 0x0EA7, 2},
		{0x0EAA, 0x0EAB, 1},
		{0x0EAD, 0x0EAE, 1},
		{0x0EB0, 0x0EB0, 1},
		{0x0EB2, 0x0EB3, 1},
		{0x0EBD, 0x0EBD, 1},
		{0x0EC0, 0x0EC4, 1},
		{0x0F40, 0x0F47, 1},
		{0x0F49, 0x0F69, 1},
		{0x10A0, 0x10C5, 1},
		{0x10D0, 0x10F6, 1},
		{0x1100, 0x1100, 1},
		{0x1102, 0x1103, 1},
		{0x1105, 0x1107, 1},
		{0x1109, 0x1109, 1},
		{0x110B, 0x110C, 1},
		{0x110E, 0x1112, 1},
		{0x113C, 0x1140, 2},
		{0x114C, 0x1150, 2},
		{0x1154, 0x1155, 1},
		{0x1159, 0x1159, 1},
		{0x115F, 0x1161, 1},
		{0x1163, 0x1169, 2},
		{0x116D, 0x116E, 1},
		{0x1172, 0x1173, 1},
		{0x1175, 0x119E, 0x119E - 0x1175},
		{0x11A8, 0x11AB, 0x11AB - 0x11A8},
		{0x11AE, 0x11AF, 1},
		{0x11B7, 0x11B8, 1},
		{0x11BA, 0x11BA, 1},
		{0x11BC, 0x11C2, 1},
		{0x11EB, 0x11F0, 0x11F0 - 0x11EB},
		{0x11F9, 0x11F9, 1},
		{0x1E00, 0x1E9B, 1},
		{0x1EA0, 0x1EF9, 1},
		{0x1F00, 0x1F15, 1},
		{0x1F18, 0x1F1D, 1},
		{0x1F20, 0x1F45, 1},
		{0x1F48, 0x1F4D, 1},
		{0x1F50, 0x1F57, 1},
		{0x1F59, 0x1F5B, 0x1F5B - 0x1F59},
		{0x1F5D, 0x1F5D, 1},
		{0x1F5F, 0x1F7D, 1},
		{0x1F80, 0x1FB4, 1},
		{0x1FB6, 0x1FBC, 1},
		{0x1FBE, 0x1FBE, 1},
		{0x1FC2, 0x1FC4, 1},
		{0x1FC6, 0x1FCC, 1},
		{0x1FD0, 0x1FD3, 1},
		{0x1FD6, 0x1FDB, 1},
		{0x1FE0, 0x1FEC, 1},
		{0x1FF2, 0x1FF4, 1},
		{0x1FF6, 0x1FFC, 1},
		{0x2126, 0x2126, 1},
		{0x212A, 0x212B, 1},
		{0x212E, 0x212E, 1},
		{0x2180, 0x2182, 1},
		{0x3007, 0x3007, 1},
		{0x3021, 0x3029, 1},
		{0x3041, 0x3094, 1},
		{0x30A1, 0x30FA, 1},
		{0x3105, 0x312C, 1},
		{0x4E00, 0x9FA5, 1},
		{0xAC00, 0xD7A3, 1},
	},
}
Russ Cox's avatar
Russ Cox committed
1303

Rob Pike's avatar
Rob Pike committed
1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418
// second holds the runes that isName accepts after the initial rune of an
// XML name in addition to those in the first table, for example '-', '.'
// and the ASCII digits. Each entry is {Lo, Hi, Stride}; entries are listed
// in increasing order.
var second = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x002D, 0x002E, 1},
		{0x0030, 0x0039, 1},
		{0x00B7, 0x00B7, 1},
		{0x02D0, 0x02D1, 1},
		{0x0300, 0x0345, 1},
		{0x0360, 0x0361, 1},
		{0x0387, 0x0387, 1},
		{0x0483, 0x0486, 1},
		{0x0591, 0x05A1, 1},
		{0x05A3, 0x05B9, 1},
		{0x05BB, 0x05BD, 1},
		{0x05BF, 0x05BF, 1},
		{0x05C1, 0x05C2, 1},
		{0x05C4, 0x0640, 0x0640 - 0x05C4},
		{0x064B, 0x0652, 1},
		{0x0660, 0x0669, 1},
		{0x0670, 0x0670, 1},
		{0x06D6, 0x06DC, 1},
		{0x06DD, 0x06DF, 1},
		{0x06E0, 0x06E4, 1},
		{0x06E7, 0x06E8, 1},
		{0x06EA, 0x06ED, 1},
		{0x06F0, 0x06F9, 1},
		{0x0901, 0x0903, 1},
		{0x093C, 0x093C, 1},
		{0x093E, 0x094C, 1},
		{0x094D, 0x094D, 1},
		{0x0951, 0x0954, 1},
		{0x0962, 0x0963, 1},
		{0x0966, 0x096F, 1},
		{0x0981, 0x0983, 1},
		{0x09BC, 0x09BC, 1},
		{0x09BE, 0x09BF, 1},
		{0x09C0, 0x09C4, 1},
		{0x09C7, 0x09C8, 1},
		{0x09CB, 0x09CD, 1},
		{0x09D7, 0x09D7, 1},
		{0x09E2, 0x09E3, 1},
		{0x09E6, 0x09EF, 1},
		{0x0A02, 0x0A3C, 0x3A},
		{0x0A3E, 0x0A3F, 1},
		{0x0A40, 0x0A42, 1},
		{0x0A47, 0x0A48, 1},
		{0x0A4B, 0x0A4D, 1},
		{0x0A66, 0x0A6F, 1},
		{0x0A70, 0x0A71, 1},
		{0x0A81, 0x0A83, 1},
		{0x0ABC, 0x0ABC, 1},
		{0x0ABE, 0x0AC5, 1},
		{0x0AC7, 0x0AC9, 1},
		{0x0ACB, 0x0ACD, 1},
		{0x0AE6, 0x0AEF, 1},
		{0x0B01, 0x0B03, 1},
		{0x0B3C, 0x0B3C, 1},
		{0x0B3E, 0x0B43, 1},
		{0x0B47, 0x0B48, 1},
		{0x0B4B, 0x0B4D, 1},
		{0x0B56, 0x0B57, 1},
		{0x0B66, 0x0B6F, 1},
		{0x0B82, 0x0B83, 1},
		{0x0BBE, 0x0BC2, 1},
		{0x0BC6, 0x0BC8, 1},
		{0x0BCA, 0x0BCD, 1},
		{0x0BD7, 0x0BD7, 1},
		{0x0BE7, 0x0BEF, 1},
		{0x0C01, 0x0C03, 1},
		{0x0C3E, 0x0C44, 1},
		{0x0C46, 0x0C48, 1},
		{0x0C4A, 0x0C4D, 1},
		{0x0C55, 0x0C56, 1},
		{0x0C66, 0x0C6F, 1},
		{0x0C82, 0x0C83, 1},
		{0x0CBE, 0x0CC4, 1},
		{0x0CC6, 0x0CC8, 1},
		{0x0CCA, 0x0CCD, 1},
		{0x0CD5, 0x0CD6, 1},
		{0x0CE6, 0x0CEF, 1},
		{0x0D02, 0x0D03, 1},
		{0x0D3E, 0x0D43, 1},
		{0x0D46, 0x0D48, 1},
		{0x0D4A, 0x0D4D, 1},
		{0x0D57, 0x0D57, 1},
		{0x0D66, 0x0D6F, 1},
		{0x0E31, 0x0E31, 1},
		{0x0E34, 0x0E3A, 1},
		{0x0E46, 0x0E46, 1},
		{0x0E47, 0x0E4E, 1},
		{0x0E50, 0x0E59, 1},
		{0x0EB1, 0x0EB1, 1},
		{0x0EB4, 0x0EB9, 1},
		{0x0EBB, 0x0EBC, 1},
		{0x0EC6, 0x0EC6, 1},
		{0x0EC8, 0x0ECD, 1},
		{0x0ED0, 0x0ED9, 1},
		{0x0F18, 0x0F19, 1},
		{0x0F20, 0x0F29, 1},
		{0x0F35, 0x0F39, 2},
		{0x0F3E, 0x0F3F, 1},
		{0x0F71, 0x0F84, 1},
		{0x0F86, 0x0F8B, 1},
		{0x0F90, 0x0F95, 1},
		{0x0F97, 0x0F97, 1},
		{0x0F99, 0x0FAD, 1},
		{0x0FB1, 0x0FB7, 1},
		{0x0FB9, 0x0FB9, 1},
		{0x20D0, 0x20DC, 1},
		{0x20E1, 0x3005, 0x3005 - 0x20E1},
		{0x302A, 0x302F, 1},
		{0x3031, 0x3035, 1},
		{0x3099, 0x309A, 1},
		{0x309D, 0x309E, 1},
		{0x30FC, 0x30FE, 1},
	},
}
1420 1421 1422 1423 1424

// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters.
//
// HTMLEntity and htmlEntity refer to the same underlying map, so a
// modification made through either name is visible through the other.
var HTMLEntity = htmlEntity

// htmlEntity maps each HTML 4 entity name, written without the leading
// '&' and the trailing ';', to its replacement text.
var htmlEntity = map[string]string{
	/*
		hget http://www.w3.org/TR/html4/sgml/entities.html |
		ssam '
			,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
			,x v/^\&lt;!ENTITY/d
			,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/	"\1": "\\u\2",/g
		'
	*/
	"nbsp":     "\u00A0",
	"iexcl":    "\u00A1",
	"cent":     "\u00A2",
	"pound":    "\u00A3",
	"curren":   "\u00A4",
	"yen":      "\u00A5",
	"brvbar":   "\u00A6",
	"sect":     "\u00A7",
	"uml":      "\u00A8",
	"copy":     "\u00A9",
	"ordf":     "\u00AA",
	"laquo":    "\u00AB",
	"not":      "\u00AC",
	"shy":      "\u00AD",
	"reg":      "\u00AE",
	"macr":     "\u00AF",
	"deg":      "\u00B0",
	"plusmn":   "\u00B1",
	"sup2":     "\u00B2",
	"sup3":     "\u00B3",
	"acute":    "\u00B4",
	"micro":    "\u00B5",
	"para":     "\u00B6",
	"middot":   "\u00B7",
	"cedil":    "\u00B8",
	"sup1":     "\u00B9",
	"ordm":     "\u00BA",
	"raquo":    "\u00BB",
	"frac14":   "\u00BC",
	"frac12":   "\u00BD",
	"frac34":   "\u00BE",
	"iquest":   "\u00BF",
	"Agrave":   "\u00C0",
	"Aacute":   "\u00C1",
	"Acirc":    "\u00C2",
	"Atilde":   "\u00C3",
	"Auml":     "\u00C4",
	"Aring":    "\u00C5",
	"AElig":    "\u00C6",
	"Ccedil":   "\u00C7",
	"Egrave":   "\u00C8",
	"Eacute":   "\u00C9",
	"Ecirc":    "\u00CA",
	"Euml":     "\u00CB",
	"Igrave":   "\u00CC",
	"Iacute":   "\u00CD",
	"Icirc":    "\u00CE",
	"Iuml":     "\u00CF",
	"ETH":      "\u00D0",
	"Ntilde":   "\u00D1",
	"Ograve":   "\u00D2",
	"Oacute":   "\u00D3",
	"Ocirc":    "\u00D4",
	"Otilde":   "\u00D5",
	"Ouml":     "\u00D6",
	"times":    "\u00D7",
	"Oslash":   "\u00D8",
	"Ugrave":   "\u00D9",
	"Uacute":   "\u00DA",
	"Ucirc":    "\u00DB",
	"Uuml":     "\u00DC",
	"Yacute":   "\u00DD",
	"THORN":    "\u00DE",
	"szlig":    "\u00DF",
	"agrave":   "\u00E0",
	"aacute":   "\u00E1",
	"acirc":    "\u00E2",
	"atilde":   "\u00E3",
	"auml":     "\u00E4",
	"aring":    "\u00E5",
	"aelig":    "\u00E6",
	"ccedil":   "\u00E7",
	"egrave":   "\u00E8",
	"eacute":   "\u00E9",
	"ecirc":    "\u00EA",
	"euml":     "\u00EB",
	"igrave":   "\u00EC",
	"iacute":   "\u00ED",
	"icirc":    "\u00EE",
	"iuml":     "\u00EF",
	"eth":      "\u00F0",
	"ntilde":   "\u00F1",
	"ograve":   "\u00F2",
	"oacute":   "\u00F3",
	"ocirc":    "\u00F4",
	"otilde":   "\u00F5",
	"ouml":     "\u00F6",
	"divide":   "\u00F7",
	"oslash":   "\u00F8",
	"ugrave":   "\u00F9",
	"uacute":   "\u00FA",
	"ucirc":    "\u00FB",
	"uuml":     "\u00FC",
	"yacute":   "\u00FD",
	"thorn":    "\u00FE",
	"yuml":     "\u00FF",
	"fnof":     "\u0192",
	"Alpha":    "\u0391",
	"Beta":     "\u0392",
	"Gamma":    "\u0393",
	"Delta":    "\u0394",
	"Epsilon":  "\u0395",
	"Zeta":     "\u0396",
	"Eta":      "\u0397",
	"Theta":    "\u0398",
	"Iota":     "\u0399",
	"Kappa":    "\u039A",
	"Lambda":   "\u039B",
	"Mu":       "\u039C",
	"Nu":       "\u039D",
	"Xi":       "\u039E",
	"Omicron":  "\u039F",
	"Pi":       "\u03A0",
	"Rho":      "\u03A1",
	"Sigma":    "\u03A3",
	"Tau":      "\u03A4",
	"Upsilon":  "\u03A5",
	"Phi":      "\u03A6",
	"Chi":      "\u03A7",
	"Psi":      "\u03A8",
	"Omega":    "\u03A9",
	"alpha":    "\u03B1",
	"beta":     "\u03B2",
	"gamma":    "\u03B3",
	"delta":    "\u03B4",
	"epsilon":  "\u03B5",
	"zeta":     "\u03B6",
	"eta":      "\u03B7",
	"theta":    "\u03B8",
	"iota":     "\u03B9",
	"kappa":    "\u03BA",
	"lambda":   "\u03BB",
	"mu":       "\u03BC",
	"nu":       "\u03BD",
	"xi":       "\u03BE",
	"omicron":  "\u03BF",
	"pi":       "\u03C0",
	"rho":      "\u03C1",
	"sigmaf":   "\u03C2",
	"sigma":    "\u03C3",
	"tau":      "\u03C4",
	"upsilon":  "\u03C5",
	"phi":      "\u03C6",
	"chi":      "\u03C7",
	"psi":      "\u03C8",
	"omega":    "\u03C9",
	"thetasym": "\u03D1",
	"upsih":    "\u03D2",
	"piv":      "\u03D6",
	"bull":     "\u2022",
	"hellip":   "\u2026",
	"prime":    "\u2032",
	"Prime":    "\u2033",
	"oline":    "\u203E",
	"frasl":    "\u2044",
	"weierp":   "\u2118",
	"image":    "\u2111",
	"real":     "\u211C",
	"trade":    "\u2122",
	"alefsym":  "\u2135",
	"larr":     "\u2190",
	"uarr":     "\u2191",
	"rarr":     "\u2192",
	"darr":     "\u2193",
	"harr":     "\u2194",
	"crarr":    "\u21B5",
	"lArr":     "\u21D0",
	"uArr":     "\u21D1",
	"rArr":     "\u21D2",
	"dArr":     "\u21D3",
	"hArr":     "\u21D4",
	"forall":   "\u2200",
	"part":     "\u2202",
	"exist":    "\u2203",
	"empty":    "\u2205",
	"nabla":    "\u2207",
	"isin":     "\u2208",
	"notin":    "\u2209",
	"ni":       "\u220B",
	"prod":     "\u220F",
	"sum":      "\u2211",
	"minus":    "\u2212",
	"lowast":   "\u2217",
	"radic":    "\u221A",
	"prop":     "\u221D",
	"infin":    "\u221E",
	"ang":      "\u2220",
	"and":      "\u2227",
	"or":       "\u2228",
	"cap":      "\u2229",
	"cup":      "\u222A",
	"int":      "\u222B",
	"there4":   "\u2234",
	"sim":      "\u223C",
	"cong":     "\u2245",
	"asymp":    "\u2248",
	"ne":       "\u2260",
	"equiv":    "\u2261",
	"le":       "\u2264",
	"ge":       "\u2265",
	"sub":      "\u2282",
	"sup":      "\u2283",
	"nsub":     "\u2284",
	"sube":     "\u2286",
	"supe":     "\u2287",
	"oplus":    "\u2295",
	"otimes":   "\u2297",
	"perp":     "\u22A5",
	"sdot":     "\u22C5",
	"lceil":    "\u2308",
	"rceil":    "\u2309",
	"lfloor":   "\u230A",
	"rfloor":   "\u230B",
	"lang":     "\u2329",
	"rang":     "\u232A",
	"loz":      "\u25CA",
	"spades":   "\u2660",
	"clubs":    "\u2663",
	"hearts":   "\u2665",
	"diams":    "\u2666",
	"quot":     "\u0022",
	"amp":      "\u0026",
	"lt":       "\u003C",
	"gt":       "\u003E",
	"OElig":    "\u0152",
	"oelig":    "\u0153",
	"Scaron":   "\u0160",
	"scaron":   "\u0161",
	"Yuml":     "\u0178",
	"circ":     "\u02C6",
	"tilde":    "\u02DC",
	"ensp":     "\u2002",
	"emsp":     "\u2003",
	"thinsp":   "\u2009",
	"zwnj":     "\u200C",
	"zwj":      "\u200D",
	"lrm":      "\u200E",
	"rlm":      "\u200F",
	"ndash":    "\u2013",
	"mdash":    "\u2014",
	"lsquo":    "\u2018",
	"rsquo":    "\u2019",
	"sbquo":    "\u201A",
	"ldquo":    "\u201C",
	"rdquo":    "\u201D",
	"bdquo":    "\u201E",
	"dagger":   "\u2020",
	"Dagger":   "\u2021",
	"permil":   "\u2030",
	"lsaquo":   "\u2039",
	"rsaquo":   "\u203A",
	"euro":     "\u20AC",
}

// HTMLAutoClose is the set of HTML elements that
// should be considered to close automatically.
//
// HTMLAutoClose and htmlAutoClose share one backing array, so an element
// overwritten through either name changes for both.
var HTMLAutoClose = htmlAutoClose

// htmlAutoClose lists, in lower case, the element names produced by the
// command below from the HTML 4 loose DTD.
var htmlAutoClose = []string{
	/*
		hget http://www.w3.org/TR/html4/loose.dtd |
		9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/	"\1",/p' | tr A-Z a-z
	*/
	"basefont",
	"br",
	"area",
	"link",
	"img",
	"param",
	"hr",
	"input",
	"col",
	"frame",
	"isindex",
	"base",
	"meta",
}
1711 1712

// Replacement sequences written by EscapeText for the bytes it escapes.
var (
	esc_quot = []byte("&#34;") // shorter than "&quot;"
	esc_apos = []byte("&#39;") // shorter than "&apos;"
	esc_amp  = []byte("&amp;")
	esc_lt   = []byte("&lt;")
	esc_gt   = []byte("&gt;")
	esc_tab  = []byte("&#x9;")
	esc_nl   = []byte("&#xA;")
	esc_cr   = []byte("&#xD;")
)

// EscapeText writes to w the properly escaped XML equivalent
// of the plain text data s.
//
// The bytes '"', '\'', '&', '<', '>', '\t', '\n' and '\r' are replaced by
// character or entity references; every other byte is copied unchanged.
// The first error returned by w.Write is returned and stops the output.
func EscapeText(w io.Writer, s []byte) error {
	var esc []byte
	last := 0 // start of the run of bytes not yet written to w
	for i, c := range s {
		switch c {
		case '"':
			esc = esc_quot
		case '\'':
			esc = esc_apos
		case '&':
			esc = esc_amp
		case '<':
			esc = esc_lt
		case '>':
			esc = esc_gt
		case '\t':
			esc = esc_tab
		case '\n':
			esc = esc_nl
		case '\r':
			esc = esc_cr
		default:
			continue
		}
		// Flush the unescaped run that precedes this byte, then its escape.
		if _, err := w.Write(s[last:i]); err != nil {
			return err
		}
		if _, err := w.Write(esc); err != nil {
			return err
		}
		last = i + 1
	}
	if _, err := w.Write(s[last:]); err != nil {
		return err
	}
	return nil
}

// Escape is like EscapeText but omits the error return value.
// It is provided for backwards compatibility with Go 1.0.
// Code targeting Go 1.1 or later should use EscapeText.
func Escape(w io.Writer, s []byte) {
	// This signature has no way to report a failure, so the error from
	// EscapeText is deliberately discarded.
	_ = EscapeText(w, s)
}
1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785

// procInstEncoding parses the `encoding="..."` or `encoding='...'`
// value out of the provided string, returning "" if not found.
//
// Only the first occurrence of "encoding=" is examined. It must be
// followed immediately by a single or double quote, and the value extends
// to the next occurrence of that same quote; if either condition fails the
// result is "".
func procInstEncoding(s string) string {
	// TODO: this parsing is somewhat lame and not exact.
	// It works for all actual cases, though.
	const key = "encoding="
	idx := strings.Index(s, key)
	if idx == -1 {
		return ""
	}
	v := s[idx+len(key):]
	if v == "" {
		return ""
	}
	quote := v[0]
	if quote != '\'' && quote != '"' {
		return ""
	}
	// end is relative to v[1:], i.e. to the first byte of the value.
	end := strings.IndexRune(v[1:], rune(quote))
	if end == -1 {
		return ""
	}
	return v[1 : end+1]
}