From f17e3d2288f8f076ff2b08d1ec31b04e1b65f237 Mon Sep 17 00:00:00 2001
From: Mike Samuel <mikesamuel@gmail.com>
Date: Wed, 28 Sep 2011 14:07:48 -0700
Subject: [PATCH] exp/template/html: handle custom attrs and HTML5 embedded
 elements.

HTML5 allows embedded SVG and MathML.
Code searches show SVG is used for graphing.

This changes transition to deal with constructs like
   <svg xmlns:xlink="http://www.w3.org/1999/xlink">
It changes attr and clients to call a single function that combines
the name lookup and "on" prefix check to determine an attribute
value type given an attribute name.

That function uses heuristics to recognize that
     xlink:href and svg:href
have URL content, and that data-url is likely contains URL content,
since "javascript:" injection is such a problem.

I did a code search over a closure templates codebase to determine
patterns of custom attribute usage.  I did something like

$ find . -name \*.soy | \
    xargs egrep perl -ne 'while (s/\b((data-|\w+:)\w+)\s*=//) { print "$1\n"; }' | \
    sort | uniq

to produce the list at the bottom.

Filtering that by egrep -i 'src|url|uri' produces

data-docConsumptionUri
data-docIconUrl
data-launchUrl
data-lazySrc
data-pageUrl
data-shareurl
data-suggestServerUrl
data-tweetUrl
g:secondaryurls
g:url

which seem to match all the ones that are likely URL content.
There are some short words that match that heuristic, but I still think it decent since
any custom attribute that has a numeric or enumerated keyword value will be unaffected by
the URL assumption.
Counterexamples from /usr/share/dict:
during, hourly, maturity, nourish, purloin, security, surly

Custom attributes present in existing closure templates codebase:
buzz:aid
data-a
data-action
data-actor
data-allowEqualityOps
data-analyticsId
data-bid
data-c
data-cartId
data-categoryId
data-cid
data-command
data-count
data-country
data-creativeId
data-cssToken
data-dest
data-docAttribution
data-docConsumptionUri
data-docCurrencyCode
data-docIconUrl
data-docId
data-docPrice
data-docPriceMicros
data-docTitle
data-docType
data-docid
data-email
data-entityid
data-errorindex
data-f
data-feature
data-fgid
data-filter
data-fireEvent
data-followable
data-followed
data-hashChange
data-height
data-hover
data-href
data-id
data-index
data-invitable
data-isFree
data-isPurchased
data-jid
data-jumpid
data-launchUrl
data-lazySrc
data-listType
data-maxVisiblePages
data-name
data-nid
data-nodeid
data-numItems
data-numPerPage
data-offerType
data-oid
data-opUsesEquality
data-overflowclass
data-packageName
data-pageId
data-pageUrl
data-pos
data-priceBrief
data-profileIds
data-query
data-rating
data-ref
data-rentalGrantPeriodDays
data-rentalactivePeriodHours
data-reviewId
data-role
data-score
data-shareurl
data-showGeLe
data-showLineInclude
data-size
data-sortval
data-suggestServerType
data-suggestServerUrl
data-suggestionIndex
data-tabBarId
data-tabBarIndex
data-tags
data-target
data-textColor
data-theme
data-title
data-toggletarget
data-tooltip
data-trailerId
data-transactionId
data-transition
data-ts
data-tweetContent
data-tweetUrl
data-type
data-useAjax
data-value
data-width
data-x
dm:index
dm:type
g:aspects
g:decorateusingsecondary
g:em
g:entity
g:groups
g:id
g:istoplevel
g:li
g:numresults
g:oid
g:parentId
g:pl
g:pt
g:rating_override
g:secondaryurls
g:sortby
g:startindex
g:target
g:type
g:url
g:value
ga:barsize
ga:css
ga:expandAfterCharsExceed
ga:initialNumRows
ga:nocancelicon
ga:numRowsToExpandTo
ga:type
ga:unlockwhenrated
gw:address
gw:businessname
gw:comment
gw:phone
gw:source
ng:controller
xlink:href
xml:lang
xmlns:atom
xmlns:dc
xmlns:jstd
xmlns:ng
xmlns:og
xmlns:webstore
xmlns:xlink

R=nigeltao
CC=golang-dev
https://golang.org/cl/5119041
---
 src/pkg/exp/template/html/attr.go        | 331 +++++++++++------------
 src/pkg/exp/template/html/escape_test.go |  60 ++++
 src/pkg/exp/template/html/html.go        |   2 +-
 src/pkg/exp/template/html/transition.go  |  37 ++-
 4 files changed, 247 insertions(+), 183 deletions(-)

diff --git a/src/pkg/exp/template/html/attr.go b/src/pkg/exp/template/html/attr.go
index cc57f8bd8a..6a36c7b718 100644
--- a/src/pkg/exp/template/html/attr.go
+++ b/src/pkg/exp/template/html/attr.go
@@ -4,181 +4,172 @@
 
 package html
 
-// attrType[n] describes the value of the given attribute.
+import (
+	"strings"
+)
+
+// attrTypeMap[n] describes the value of the given attribute.
 // If an attribute affects (or can mask) the encoding or interpretation of
 // other content, or affects the contents, idempotency, or credentials of a
 // network message, then the value in this map is contentTypeUnsafe.
 // This map is derived from HTML5, specifically
-// http://www.w3.org/TR/html5/Overview.html#attributes-1 and
-// http://www.w3.org/TR/html5/Overview.html#event-handlers-on-elements-document-objects-and-window-objects
+// http://www.w3.org/TR/html5/Overview.html#attributes-1
 // as well as "%URI"-typed attributes from
 // http://www.w3.org/TR/html4/index/attributes.html
-var attrType = map[string]contentType{
-	"accept":             contentTypePlain,
-	"accept-charset":     contentTypeUnsafe,
-	"action":             contentTypeURL,
-	"alt":                contentTypePlain,
-	"archive":            contentTypeURL,
-	"async":              contentTypeUnsafe,
-	"autocomplete":       contentTypePlain,
-	"autofocus":          contentTypePlain,
-	"autoplay":           contentTypePlain,
-	"background":         contentTypeURL,
-	"border":             contentTypePlain,
-	"checked":            contentTypePlain,
-	"cite":               contentTypeURL,
-	"challenge":          contentTypeUnsafe,
-	"charset":            contentTypeUnsafe,
-	"class":              contentTypePlain,
-	"classid":            contentTypeURL,
-	"codebase":           contentTypeURL,
-	"cols":               contentTypePlain,
-	"colspan":            contentTypePlain,
-	"content":            contentTypeUnsafe,
-	"contenteditable":    contentTypePlain,
-	"contextmenu":        contentTypePlain,
-	"controls":           contentTypePlain,
-	"coords":             contentTypePlain,
-	"crossorigin":        contentTypeUnsafe,
-	"data":               contentTypeURL,
-	"datetime":           contentTypePlain,
-	"default":            contentTypePlain,
-	"defer":              contentTypeUnsafe,
-	"dir":                contentTypePlain,
-	"dirname":            contentTypePlain,
-	"disabled":           contentTypePlain,
-	"draggable":          contentTypePlain,
-	"dropzone":           contentTypePlain,
-	"enctype":            contentTypeUnsafe,
-	"for":                contentTypePlain,
-	"form":               contentTypeUnsafe,
-	"formaction":         contentTypeURL,
-	"formenctype":        contentTypeUnsafe,
-	"formmethod":         contentTypeUnsafe,
-	"formnovalidate":     contentTypeUnsafe,
-	"formtarget":         contentTypePlain,
-	"headers":            contentTypePlain,
-	"height":             contentTypePlain,
-	"hidden":             contentTypePlain,
-	"high":               contentTypePlain,
-	"href":               contentTypeURL,
-	"hreflang":           contentTypePlain,
-	"http-equiv":         contentTypeUnsafe,
-	"icon":               contentTypeURL,
-	"id":                 contentTypePlain,
-	"ismap":              contentTypePlain,
-	"keytype":            contentTypeUnsafe,
-	"kind":               contentTypePlain,
-	"label":              contentTypePlain,
-	"lang":               contentTypePlain,
-	"language":           contentTypeUnsafe,
-	"list":               contentTypePlain,
-	"longdesc":           contentTypeURL,
-	"loop":               contentTypePlain,
-	"low":                contentTypePlain,
-	"manifest":           contentTypeURL,
-	"max":                contentTypePlain,
-	"maxlength":          contentTypePlain,
-	"media":              contentTypePlain,
-	"mediagroup":         contentTypePlain,
-	"method":             contentTypeUnsafe,
-	"min":                contentTypePlain,
-	"multiple":           contentTypePlain,
-	"name":               contentTypePlain,
-	"novalidate":         contentTypeUnsafe,
-	"onabort":            contentTypeJS,
-	"onblur":             contentTypeJS,
-	"oncanplay":          contentTypeJS,
-	"oncanplaythrough":   contentTypeJS,
-	"onchange":           contentTypeJS,
-	"onclick":            contentTypeJS,
-	"oncontextmenu":      contentTypeJS,
-	"oncuechange":        contentTypeJS,
-	"ondblclick":         contentTypeJS,
-	"ondrag":             contentTypeJS,
-	"ondragend":          contentTypeJS,
-	"ondragenter":        contentTypeJS,
-	"ondragleave":        contentTypeJS,
-	"ondragover":         contentTypeJS,
-	"ondragstart":        contentTypeJS,
-	"ondrop":             contentTypeJS,
-	"ondurationchange":   contentTypeJS,
-	"onemptied":          contentTypeJS,
-	"onended":            contentTypeJS,
-	"onerror":            contentTypeJS,
-	"onfocus":            contentTypeJS,
-	"oninput":            contentTypeJS,
-	"oninvalid":          contentTypeJS,
-	"onkeydown":          contentTypeJS,
-	"onkeypress":         contentTypeJS,
-	"onkeyup":            contentTypeJS,
-	"onload":             contentTypeJS,
-	"onloadeddata":       contentTypeJS,
-	"onloadedmetadata":   contentTypeJS,
-	"onloadstart":        contentTypeJS,
-	"onmousedown":        contentTypeJS,
-	"onmousemove":        contentTypeJS,
-	"onmouseout":         contentTypeJS,
-	"onmouseover":        contentTypeJS,
-	"onmouseup":          contentTypeJS,
-	"onmousewheel":       contentTypeJS,
-	"onpause":            contentTypeJS,
-	"onplay":             contentTypeJS,
-	"onplaying":          contentTypeJS,
-	"onprogress":         contentTypeJS,
-	"onratechange":       contentTypeJS,
-	"onreadystatechange": contentTypeJS,
-	"onreset":            contentTypeJS,
-	"onscroll":           contentTypeJS,
-	"onseeked":           contentTypeJS,
-	"onseeking":          contentTypeJS,
-	"onselect":           contentTypeJS,
-	"onshow":             contentTypeJS,
-	"onstalled":          contentTypeJS,
-	"onsubmit":           contentTypeJS,
-	"onsuspend":          contentTypeJS,
-	"ontimeupdate":       contentTypeJS,
-	"onvolumechange":     contentTypeJS,
-	"onwaiting":          contentTypeJS,
-	"open":               contentTypePlain,
-	"optimum":            contentTypePlain,
-	"pattern":            contentTypeUnsafe,
-	"placeholder":        contentTypePlain,
-	"poster":             contentTypeURL,
-	"profile":            contentTypeURL,
-	"preload":            contentTypePlain,
-	"pubdate":            contentTypePlain,
-	"radiogroup":         contentTypePlain,
-	"readonly":           contentTypePlain,
-	"rel":                contentTypeUnsafe,
-	"required":           contentTypePlain,
-	"reversed":           contentTypePlain,
-	"rows":               contentTypePlain,
-	"rowspan":            contentTypePlain,
-	"sandbox":            contentTypeUnsafe,
-	"spellcheck":         contentTypePlain,
-	"scope":              contentTypePlain,
-	"scoped":             contentTypePlain,
-	"seamless":           contentTypePlain,
-	"selected":           contentTypePlain,
-	"shape":              contentTypePlain,
-	"size":               contentTypePlain,
-	"sizes":              contentTypePlain,
-	"span":               contentTypePlain,
-	"src":                contentTypeURL,
-	"srcdoc":             contentTypeHTML,
-	"srclang":            contentTypePlain,
-	"start":              contentTypePlain,
-	"step":               contentTypePlain,
-	"style":              contentTypeCSS,
-	"tabindex":           contentTypePlain,
-	"target":             contentTypePlain,
-	"title":              contentTypePlain,
-	"type":               contentTypeUnsafe,
-	"usemap":             contentTypeURL,
-	"value":              contentTypeUnsafe,
-	"width":              contentTypePlain,
-	"wrap":               contentTypePlain,
+var attrTypeMap = map[string]contentType{
+	"accept":          contentTypePlain,
+	"accept-charset":  contentTypeUnsafe,
+	"action":          contentTypeURL,
+	"alt":             contentTypePlain,
+	"archive":         contentTypeURL,
+	"async":           contentTypeUnsafe,
+	"autocomplete":    contentTypePlain,
+	"autofocus":       contentTypePlain,
+	"autoplay":        contentTypePlain,
+	"background":      contentTypeURL,
+	"border":          contentTypePlain,
+	"checked":         contentTypePlain,
+	"cite":            contentTypeURL,
+	"challenge":       contentTypeUnsafe,
+	"charset":         contentTypeUnsafe,
+	"class":           contentTypePlain,
+	"classid":         contentTypeURL,
+	"codebase":        contentTypeURL,
+	"cols":            contentTypePlain,
+	"colspan":         contentTypePlain,
+	"content":         contentTypeUnsafe,
+	"contenteditable": contentTypePlain,
+	"contextmenu":     contentTypePlain,
+	"controls":        contentTypePlain,
+	"coords":          contentTypePlain,
+	"crossorigin":     contentTypeUnsafe,
+	"data":            contentTypeURL,
+	"datetime":        contentTypePlain,
+	"default":         contentTypePlain,
+	"defer":           contentTypeUnsafe,
+	"dir":             contentTypePlain,
+	"dirname":         contentTypePlain,
+	"disabled":        contentTypePlain,
+	"draggable":       contentTypePlain,
+	"dropzone":        contentTypePlain,
+	"enctype":         contentTypeUnsafe,
+	"for":             contentTypePlain,
+	"form":            contentTypeUnsafe,
+	"formaction":      contentTypeURL,
+	"formenctype":     contentTypeUnsafe,
+	"formmethod":      contentTypeUnsafe,
+	"formnovalidate":  contentTypeUnsafe,
+	"formtarget":      contentTypePlain,
+	"headers":         contentTypePlain,
+	"height":          contentTypePlain,
+	"hidden":          contentTypePlain,
+	"high":            contentTypePlain,
+	"href":            contentTypeURL,
+	"hreflang":        contentTypePlain,
+	"http-equiv":      contentTypeUnsafe,
+	"icon":            contentTypeURL,
+	"id":              contentTypePlain,
+	"ismap":           contentTypePlain,
+	"keytype":         contentTypeUnsafe,
+	"kind":            contentTypePlain,
+	"label":           contentTypePlain,
+	"lang":            contentTypePlain,
+	"language":        contentTypeUnsafe,
+	"list":            contentTypePlain,
+	"longdesc":        contentTypeURL,
+	"loop":            contentTypePlain,
+	"low":             contentTypePlain,
+	"manifest":        contentTypeURL,
+	"max":             contentTypePlain,
+	"maxlength":       contentTypePlain,
+	"media":           contentTypePlain,
+	"mediagroup":      contentTypePlain,
+	"method":          contentTypeUnsafe,
+	"min":             contentTypePlain,
+	"multiple":        contentTypePlain,
+	"name":            contentTypePlain,
+	"novalidate":      contentTypeUnsafe,
+	// Skip handler names from
+	// http://www.w3.org/TR/html5/Overview.html#event-handlers-on-elements-document-objects-and-window-objects
+	// since we have special handling in attrType.
+	"open":        contentTypePlain,
+	"optimum":     contentTypePlain,
+	"pattern":     contentTypeUnsafe,
+	"placeholder": contentTypePlain,
+	"poster":      contentTypeURL,
+	"profile":     contentTypeURL,
+	"preload":     contentTypePlain,
+	"pubdate":     contentTypePlain,
+	"radiogroup":  contentTypePlain,
+	"readonly":    contentTypePlain,
+	"rel":         contentTypeUnsafe,
+	"required":    contentTypePlain,
+	"reversed":    contentTypePlain,
+	"rows":        contentTypePlain,
+	"rowspan":     contentTypePlain,
+	"sandbox":     contentTypeUnsafe,
+	"spellcheck":  contentTypePlain,
+	"scope":       contentTypePlain,
+	"scoped":      contentTypePlain,
+	"seamless":    contentTypePlain,
+	"selected":    contentTypePlain,
+	"shape":       contentTypePlain,
+	"size":        contentTypePlain,
+	"sizes":       contentTypePlain,
+	"span":        contentTypePlain,
+	"src":         contentTypeURL,
+	"srcdoc":      contentTypeHTML,
+	"srclang":     contentTypePlain,
+	"start":       contentTypePlain,
+	"step":        contentTypePlain,
+	"style":       contentTypeCSS,
+	"tabindex":    contentTypePlain,
+	"target":      contentTypePlain,
+	"title":       contentTypePlain,
+	"type":        contentTypeUnsafe,
+	"usemap":      contentTypeURL,
+	"value":       contentTypeUnsafe,
+	"width":       contentTypePlain,
+	"wrap":        contentTypePlain,
+	"xmlns":       contentTypeURL,
+}
+
+// attrType returns a conservative (upper-bound on authority) guess at the
+// type of the named attribute.
+func attrType(name string) contentType {
+	name = strings.ToLower(name)
+	if strings.HasPrefix(name, "data-") {
+		// Strip data- so that custom attribute heuristics below are
+		// widely applied.
+		// Treat data-action as URL below.
+		name = name[5:]
+	} else if colon := strings.IndexRune(name, ':'); colon != -1 {
+		if name[:colon] == "xmlns" {
+			return contentTypeURL
+		}
+		// Treat svg:href and xlink:href as href below.
+		name = name[colon+1:]
+	}
+	if t, ok := attrTypeMap[name]; ok {
+		return t
+	}
+	// Treat partial event handler names as script.
+	if strings.HasPrefix(name, "on") {
+		return contentTypeJS
+	}
 
-	// TODO: data-* attrs?  Recognize data-foo-url and similar.
+	// Heuristics to prevent "javascript:..." injection in custom
+	// data attributes and custom attributes like g:tweetUrl.
+	// http://www.w3.org/TR/html5/elements.html#embedding-custom-non-visible-data-with-the-data-attributes:
+	// "Custom data attributes are intended to store custom data
+	//  private to the page or application, for which there are no
+	//  more appropriate attributes or elements."
+	// Developers seem to store URL content in data URLs that start
+	// or end with "URI" or "URL".
+	if strings.Contains(name, "src") ||
+		strings.Contains(name, "uri") ||
+		strings.Contains(name, "url") {
+		return contentTypeURL
+	}
+	return contentTypePlain
 }
diff --git a/src/pkg/exp/template/html/escape_test.go b/src/pkg/exp/template/html/escape_test.go
index 0ca3c56619..169cb76267 100644
--- a/src/pkg/exp/template/html/escape_test.go
+++ b/src/pkg/exp/template/html/escape_test.go
@@ -1400,6 +1400,66 @@ func TestEscapeText(t *testing.T) {
 			`<style>value`,
 			context{state: stateCSS, element: elementStyle},
 		},
+		{
+			`<a xlink:href`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a xmlns`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a xmlns:foo`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a xmlnsxyz`,
+			context{state: stateAttrName},
+		},
+		{
+			`<a data-url`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a data-iconUri`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a data-urlItem`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a g:`,
+			context{state: stateAttrName},
+		},
+		{
+			`<a g:url`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a g:iconUri`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a g:urlItem`,
+			context{state: stateAttrName, attr: attrURL},
+		},
+		{
+			`<a g:value`,
+			context{state: stateAttrName},
+		},
+		{
+			`<a svg:style='`,
+			context{state: stateCSS, delim: delimSingleQuote},
+		},
+		{
+			`<svg:font-face`,
+			context{state: stateTag},
+		},
+		{
+			`<svg:a svg:onclick="`,
+			context{state: stateJS, delim: delimDoubleQuote},
+		},
 	}
 
 	for _, test := range tests {
diff --git a/src/pkg/exp/template/html/html.go b/src/pkg/exp/template/html/html.go
index 6ef66dd6c3..91bb1b1704 100644
--- a/src/pkg/exp/template/html/html.go
+++ b/src/pkg/exp/template/html/html.go
@@ -230,7 +230,7 @@ func htmlNameFilter(args ...interface{}) string {
 		return filterFailsafe
 	}
 	s = strings.ToLower(s)
-	if t := attrType[s]; t != contentTypePlain && attrType["on"+s] != contentTypeJS {
+	if t := attrType(s); t != contentTypePlain {
 		// TODO: Split attr and element name part filters so we can whitelist
 		// attributes.
 		return filterFailsafe
diff --git a/src/pkg/exp/template/html/transition.go b/src/pkg/exp/template/html/transition.go
index d3c8a05291..49a1451174 100644
--- a/src/pkg/exp/template/html/transition.go
+++ b/src/pkg/exp/template/html/transition.go
@@ -106,18 +106,13 @@ func tTag(c context, s []byte) (context, int) {
 			err:   errorf(ErrBadHTML, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
 		}, len(s)
 	}
-	canonAttrName := strings.ToLower(string(s[i:j]))
-	switch attrType[canonAttrName] {
+	switch attrType(string(s[i:j])) {
 	case contentTypeURL:
 		attr = attrURL
 	case contentTypeCSS:
 		attr = attrStyle
 	case contentTypeJS:
 		attr = attrScript
-	default:
-		if strings.HasPrefix(canonAttrName, "on") {
-			attr = attrScript
-		}
 	}
 	if j == len(s) {
 		state = stateAttrName
@@ -512,16 +507,34 @@ var elementNameMap = map[string]element{
 	"title":    elementTitle,
 }
 
+// asciiAlpha returns whether c is an ASCII letter.
+func asciiAlpha(c byte) bool {
+	return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
+}
+
+// asciiAlphaNum returns whether c is an ASCII letter or digit.
+func asciiAlphaNum(c byte) bool {
+	return asciiAlpha(c) || '0' <= c && c <= '9'
+}
+
 // eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
 func eatTagName(s []byte, i int) (int, element) {
-	j := i
-	for ; j < len(s); j++ {
+	if i == len(s) || !asciiAlpha(s[i]) {
+		return i, elementNone
+	}
+	j := i + 1
+	for j < len(s) {
 		x := s[j]
-		if !(('a' <= x && x <= 'z') ||
-			('A' <= x && x <= 'Z') ||
-			('0' <= x && x <= '9' && i != j)) {
-			break
+		if asciiAlphaNum(x) {
+			j++
+			continue
+		}
+		// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
+		if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
+			j += 2
+			continue
 		}
+		break
 	}
 	return j, elementNameMap[strings.ToLower(string(s[i:j]))]
 }
-- 
2.30.9