diagnostics: Implemented collection functions and create first metrics

- Also implemented robust error handling and failovers - Vendored klauspost/cpuid

diagnostics: Implemented collection functions and create first metrics
- Also implemented robust error handling and failovers - Vendored klauspost/cpuid
388ff6bc · Matthew Holt · 8f0b44b8 · 388ff6bc · 388ff6bc · 388ff6bc
Commit 388ff6bc authored Feb 08, 2018 by Matthew Holt
20 changed files
--- a/caddy/caddymain/run.go
+++ b/caddy/caddymain/run.go
@@ -27,6 +27,7 @@ import (
 	"strings"

 	"github.com/google/uuid"
+	"github.com/klauspost/cpuid"
 	"github.com/mholt/caddy"
 	"github.com/mholt/caddy/caddytls"
 	"github.com/mholt/caddy/diagnostics"
@@ -51,6 +52,7 @@ func init() {
 	flag.StringVar(&caddytls.DefaultEmail, "email", "", "Default ACME CA account email address")
 	flag.DurationVar(&acme.HTTPClient.Timeout, "catimeout", acme.HTTPClient.Timeout, "Default ACME CA HTTP timeout")
 	flag.StringVar(&logfile, "log", "", "Process log file")
+	flag.BoolVar(&noDiag, "no-diagnostics", false, "Disable diagnostic reporting")
 	flag.StringVar(&caddy.PidFile, "pidfile", "", "Path to write pid file")
 	flag.BoolVar(&caddy.Quiet, "quiet", false, "Quiet mode (no initialization output)")
 	flag.StringVar(&revoke, "revoke", "", "Hostname for which to revoke the certificate")
@@ -88,7 +90,9 @@ func Run() {
 	}

 	// initialize diagnostics client
-	initDiagnostics()
+	if !noDiag {
+		initDiagnostics()
+	}

 	// Check for one-time actions
 	if revoke != "" {
@@ -146,6 +150,23 @@ func Run() {
 	// Execute instantiation events
 	caddy.EmitEvent(caddy.InstanceStartupEvent, instance)

+	// Begin diagnostics (these are no-ops if diagnostics disabled)
+	diagnostics.Set("caddy_version", appVersion)
+	// TODO: plugins
+	diagnostics.Set("num_listeners", len(instance.Servers()))
+	diagnostics.Set("os", runtime.GOOS)
+	diagnostics.Set("arch", runtime.GOARCH)
+	diagnostics.Set("cpu", struct {
+		NumLogical int    `json:"num_logical"`
+		AESNI      bool   `json:"aes_ni"`
+		BrandName  string `json:"brand_name"`
+	}{
+		NumLogical: runtime.NumCPU(),
+		AESNI:      cpuid.CPU.AesNi(),
+		BrandName:  cpuid.CPU.BrandName,
+	})
+	diagnostics.StartEmitting()
+
 	// Twiddle your thumbs
 	instance.Wait()
 }
@@ -321,6 +342,7 @@ var (
 	version    bool
 	plugins    bool
 	validate   bool
+	noDiag     bool
 )

 // Build information obtained with the help of -ldflags

--- a/caddyhttp/httpserver/plugin.go
+++ b/caddyhttp/httpserver/plugin.go
@@ -29,6 +29,7 @@ import (
 	"github.com/mholt/caddy/caddyfile"
 	"github.com/mholt/caddy/caddyhttp/staticfiles"
 	"github.com/mholt/caddy/caddytls"
+	"github.com/mholt/caddy/diagnostics"
 )

 const serverType = "http"
@@ -205,6 +206,8 @@ func (h *httpContext) MakeServers() ([]caddy.Server, error) {
 		}
 	}

+	diagnostics.Set("num_sites", len(h.siteConfigs))
+
 	// we must map (group) each config to a bind address
 	groups, err := groupSiteConfigsByListenAddr(h.siteConfigs)
 	if err != nil {

--- a/caddyhttp/httpserver/server.go
+++ b/caddyhttp/httpserver/server.go
@@ -36,6 +36,7 @@ import (
 	"github.com/mholt/caddy"
 	"github.com/mholt/caddy/caddyhttp/staticfiles"
 	"github.com/mholt/caddy/caddytls"
+	"github.com/mholt/caddy/diagnostics"
 )

 // Server is the HTTP server implementation.
@@ -345,6 +346,8 @@ func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		}
 	}()

+	go diagnostics.AppendUniqueString("user_agent", r.Header.Get("User-Agent"))
+
 	// copy the original, unchanged URL into the context
 	// so it can be referenced by middlewares
 	urlCopy := *r.URL

--- a/caddytls/client.go
+++ b/caddytls/client.go
@@ -26,6 +26,7 @@ import (
 	"time"

 	"github.com/mholt/caddy"
+	"github.com/mholt/caddy/diagnostics"
 	"github.com/xenolf/lego/acme"
 )

@@ -276,6 +277,8 @@ Attempts:
 		break
 	}

+	go diagnostics.Increment("acme_certificates_obtained")
+
 	return nil
 }

@@ -350,8 +353,9 @@ func (c *ACMEClient) Renew(name string) error {
 		return errors.New("too many renewal attempts; last error: " + err.Error())
 	}

-	// Executes Cert renew events
 	caddy.EmitEvent(caddy.CertRenewEvent, name)
+	go diagnostics.Increment("acme_certificates_obtained")
+	go diagnostics.Increment("acme_certificates_renewed")

 	return saveCertResource(storage, newCertMeta)
 }

--- a/diagnostics/collection.go
+++ b/diagnostics/collection.go
+// Copyright 2015 Light Code Labs, LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package diagnostics
+
+import (
+	"log"
+
+	"github.com/google/uuid"
+)
+
+// Init initializes this package so that it may
+// be used. Do not call this function more than
+// once. Init panics if it is called more than
+// once or if instanceID is the nil (all-zero)
+// UUID. Once this function is called, the rest
+// of the package may safely be used. If this
+// function is not called, the collector functions
+// may still be invoked, but they will be no-ops.
+func Init(instanceID uuid.UUID) {
+	if enabled {
+		panic("already initialized")
+	}
+	// A UUID always renders as a non-empty string, so an
+	// unset ID has to be detected by comparing to uuid.Nil.
+	if instanceID == uuid.Nil {
+		panic("empty UUID")
+	}
+	instanceUUID = instanceID
+	enabled = true
+}
+
+// StartEmitting kicks off the transmission cycle: it
+// sends the current payload as the first update, after
+// which further updates keep being sent until
+// StopEmitting is called.
+//
+// The first update is sent from a new goroutine, so
+// this function does not block on the network.
+//
+// It panics if updates have already been started or if
+// an update is currently in progress. It is a no-op if
+// this package was not initialized.
+func StartEmitting() {
+	if !enabled {
+		return
+	}
+
+	// a non-nil timer means the cycle is already running
+	updateTimerMu.Lock()
+	alreadyStarted := updateTimer != nil
+	updateTimerMu.Unlock()
+	if alreadyStarted {
+		panic("updates already started")
+	}
+
+	updateMu.Lock()
+	inProgress := updating
+	updateMu.Unlock()
+	if inProgress {
+		panic("update already in progress")
+	}
+
+	go logEmit(false)
+}
+
+// StopEmitting transmits the current payload one last
+// time and ends the update cycle, so that no further
+// updates are sent. Unlike StartEmitting, the final
+// update is sent synchronously.
+//
+// It is a no-op if the package was never initialized
+// or if no update is currently scheduled.
+func StopEmitting() {
+	if !enabled {
+		return
+	}
+
+	// only send a final update if one is actually scheduled
+	updateTimerMu.Lock()
+	scheduled := updateTimer != nil
+	updateTimerMu.Unlock()
+
+	if scheduled {
+		logEmit(true)
+	}
+}
+
+// Set puts a value in the buffer to be included
+// in the next emission. It overwrites any
+// previous value stored under key.
+//
+// If key is new and the buffer already holds
+// maxBufferItems items, the value is dropped.
+// Overwriting an existing key is always allowed
+// since it does not add an item to the buffer.
+// Set is a no-op if the package was not initialized.
+//
+// This function is safe for multiple goroutines,
+// and it is recommended to call this using the
+// go keyword after the call to StartEmitting so
+// it doesn't block crucial code.
+func Set(key string, val interface{}) {
+	if !enabled {
+		return
+	}
+	bufferMu.Lock()
+	if _, ok := buffer[key]; !ok {
+		// only a new key adds an item, so only
+		// a new key is subject to the size limit
+		if bufferItemCount >= maxBufferItems {
+			bufferMu.Unlock()
+			return
+		}
+		bufferItemCount++
+	}
+	buffer[key] = val
+	bufferMu.Unlock()
+}
+
+// Append adds value to the end of the list stored
+// under key, creating the list if key is new.
+// If key already holds something that is not a
+// list, an error is logged and nothing is changed.
+// Every appended value counts as one buffer item;
+// values are dropped once the buffer is full.
+//
+// TODO: is this function needed/useful?
+func Append(key string, value interface{}) {
+	if !enabled {
+		return
+	}
+	bufferMu.Lock()
+	if bufferItemCount >= maxBufferItems {
+		bufferMu.Unlock()
+		return
+	}
+	// TODO: Test this...
+	existing, found := buffer[key]
+	list, isList := existing.([]interface{})
+	if found && !isList {
+		bufferMu.Unlock()
+		log.Printf("[PANIC] Diagnostics: key %s already used for non-slice value", key)
+		return
+	}
+	// at this point list is either nil (no usable
+	// list yet) or the list already in the buffer
+	if list == nil {
+		list = []interface{}{value}
+	} else {
+		list = append(list, value)
+	}
+	buffer[key] = list
+	bufferItemCount++
+	bufferMu.Unlock()
+}
+
+// AppendUniqueString adds value to a set named key.
+// Set items are unordered. Values in the set
+// are unique, but repeat values are counted.
+//
+// If key is new, a new set will be created.
+// If key maps to a type that is not a string
+// set, an error is logged, and this is a no-op.
+//
+// Each distinct value counts as one buffer item.
+// Once the buffer is full, new distinct values are
+// dropped, but occurrences of values already in the
+// set are still counted since they use no extra memory.
+func AppendUniqueString(key, value string) {
+	if !enabled {
+		return
+	}
+	bufferMu.Lock()
+	bufVal, inBuffer := buffer[key]
+	mapVal, mapOk := bufVal.(map[string]int)
+	if inBuffer && !mapOk {
+		bufferMu.Unlock()
+		log.Printf("[PANIC] Diagnostics: key %s already used for non-map value", key)
+		return
+	}
+	// repeat value: just bump its counter (reading
+	// from a nil map is safe and reports "not seen")
+	if _, seen := mapVal[value]; seen {
+		mapVal[value]++
+		bufferMu.Unlock()
+		return
+	}
+	// new distinct value: it grows the buffer, so
+	// it must respect the size limit
+	if bufferItemCount >= maxBufferItems {
+		bufferMu.Unlock()
+		return
+	}
+	if mapVal == nil {
+		buffer[key] = map[string]int{value: 1}
+	} else {
+		mapVal[value] = 1
+	}
+	bufferItemCount++
+	bufferMu.Unlock()
+}
+
+// AppendUniqueInt adds value to a set named key.
+// Set items are unordered. Values in the set
+// are unique, but repeat values are counted.
+//
+// If key is new, a new set will be created.
+// If key maps to a type that is not an integer
+// set, an error is logged, and this is a no-op.
+//
+// Each distinct value counts as one buffer item.
+// Once the buffer is full, new distinct values are
+// dropped, but occurrences of values already in the
+// set are still counted since they use no extra memory.
+func AppendUniqueInt(key string, value int) {
+	if !enabled {
+		return
+	}
+	bufferMu.Lock()
+	bufVal, inBuffer := buffer[key]
+	mapVal, mapOk := bufVal.(map[int]int)
+	if inBuffer && !mapOk {
+		bufferMu.Unlock()
+		log.Printf("[PANIC] Diagnostics: key %s already used for non-map value", key)
+		return
+	}
+	// repeat value: just bump its counter (reading
+	// from a nil map is safe and reports "not seen")
+	if _, seen := mapVal[value]; seen {
+		mapVal[value]++
+		bufferMu.Unlock()
+		return
+	}
+	// new distinct value: it grows the buffer, so
+	// it must respect the size limit
+	if bufferItemCount >= maxBufferItems {
+		bufferMu.Unlock()
+		return
+	}
+	if mapVal == nil {
+		buffer[key] = map[int]int{value: 1}
+	} else {
+		mapVal[value] = 1
+	}
+	bufferItemCount++
+	bufferMu.Unlock()
+}
+
+// Increment adds 1 to the integer value named key.
+// If key does not exist, it is created with
+// a value of 1. If key maps to a type that
+// is not an integer, an error is logged,
+// and this is a no-op.
+//
+// It delegates to incrementOrDecrement, so it is
+// also a no-op if the package was not initialized
+// or if key is new and the buffer is already full.
+func Increment(key string) {
+	incrementOrDecrement(key, true)
+}
+
+// Decrement is the same as Increment except
+// it subtracts 1; a key that does not exist
+// yet is therefore created with a value of -1.
+func Decrement(key string) {
+	incrementOrDecrement(key, false)
+}
+
+// incrementOrDecrement adjusts the integer stored under
+// key by one: up when inc is true, down when inc is false.
+// A missing key starts from zero and counts as a new buffer
+// item, so it is dropped if the buffer is full. If key holds
+// a non-integer value, an error is logged and nothing changes.
+func incrementOrDecrement(key string, inc bool) {
+	if !enabled {
+		return
+	}
+
+	delta := -1
+	if inc {
+		delta = 1
+	}
+
+	bufferMu.Lock()
+	current, exists := buffer[key]
+	count, isInt := current.(int)
+	switch {
+	case exists && !isInt:
+		bufferMu.Unlock()
+		log.Printf("[PANIC] Diagnostics: key %s already used for non-integer value", key)
+		return
+	case !exists && bufferItemCount >= maxBufferItems:
+		// no room left for a new key
+		bufferMu.Unlock()
+		return
+	case !exists:
+		bufferItemCount++
+	}
+	buffer[key] = count + delta
+	bufferMu.Unlock()
+}
--- a/diagnostics/diagnostics.go
+++ b/diagnostics/diagnostics.go
@@ -12,14 +12,252 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+// Package diagnostics implements the client for server-side diagnostics
+// of the network. Functions in this package are synchronous and blocking
+// unless otherwise specified. For convenience, most functions here do
+// not return errors, but errors are logged to the standard logger.
+//
+// To use this package, first call Init(). You can then call any of the
+// collection/aggregation functions. Call StartEmitting() when you are
+// ready to begin sending diagnostic updates.
+//
+// When collecting metrics (functions like Set, Append*, or Increment),
+// it may be desirable and even recommended to invoke them in a new
+// goroutine (use the go keyword) in case there is lock contention;
+// they are thread-safe (unless noted), and you may not want them to
+// block the main thread of execution. However, sometimes blocking
+// may be necessary too; for example, adding startup metrics to the
+// buffer before the call to StartEmitting().
 package diagnostics

 import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+
 	"github.com/google/uuid"
 )

-func Init(uuid uuid.UUID) {
-	instanceUUID = uuid
+// logEmit calls emit with the given final flag and
+// writes the returned error, if any, to the standard
+// logger. It is the entry point used when there is no
+// caller to hand the error back to (goroutines, timers).
+func logEmit(final bool) {
+	if err := emit(final); err != nil {
+		log.Printf("[ERROR] Sending diagnostics: %v", err)
+	}
+}
+
+// emit sends an update to the diagnostics server.
+// If final is true, any pending update is cancelled and
+// no future updates will be scheduled. Otherwise, the
+// next update is scheduled using the interval from the
+// server's reply, or defaultUpdateInterval if no usable
+// interval was received.
+//
+// It returns an error if diagnostics are not enabled, if
+// the payload cannot be encoded, or if the payload could
+// not be delivered after several attempts. If another
+// update is still in progress, this one is skipped and
+// nil is returned.
+func emit(final bool) error {
+	if !enabled {
+		return fmt.Errorf("diagnostics not enabled")
+	}
+
+	// ensure only one update happens at a time;
+	// skip update if previous one still in progress
+	// NOTE(review): a final update skipped here does not
+	// cancel the cycle; the in-progress update will still
+	// schedule its successor.
+	updateMu.Lock()
+	if updating {
+		updateMu.Unlock()
+		log.Println("[NOTICE] Skipping this diagnostics update because previous one is still working")
+		return nil
+	}
+	updating = true
+	updateMu.Unlock()
+	defer func() {
+		updateMu.Lock()
+		updating = false
+		updateMu.Unlock()
+	}()
+
+	// terminate any pending update if this is the last one;
+	// the timer is nil when nothing is scheduled, so guard
+	// against dereferencing it
+	if final {
+		updateTimerMu.Lock()
+		if updateTimer != nil {
+			updateTimer.Stop()
+			updateTimer = nil
+		}
+		updateTimerMu.Unlock()
+	}
+
+	payloadBytes, err := makePayloadAndResetBuffer()
+	if err != nil {
+		return err
+	}
+
+	// this will hold the server's reply
+	var reply Response
+
+	// delivered records whether the server accepted the payload;
+	// err alone cannot tell us, because a "too many requests"
+	// reply is retried without setting err
+	var delivered bool
+
+	// transmit the payload - use a loop to retry in case of failure
+	for i := 0; i < 4; i++ {
+		if i > 0 && err != nil {
+			// don't hammer the server; first failure might have been
+			// a fluke, but back off more after that
+			log.Printf("[WARNING] Sending diagnostics (attempt %d): %v - waiting and retrying", i, err)
+			time.Sleep(time.Duration(i*i*i) * time.Second)
+		}
+
+		// send it
+		var resp *http.Response
+		resp, err = httpClient.Post(endpoint+instanceUUID.String(), "application/json", bytes.NewReader(payloadBytes))
+		if err != nil {
+			continue
+		}
+
+		// ensure we can read the response
+		if ct := resp.Header.Get("Content-Type"); (resp.StatusCode < 300 || resp.StatusCode >= 400) &&
+			!strings.Contains(ct, "json") {
+			err = fmt.Errorf("diagnostics server replied with unknown content-type: %s", ct)
+			resp.Body.Close()
+			continue
+		}
+
+		// read the response body
+		err = json.NewDecoder(resp.Body).Decode(&reply)
+		resp.Body.Close() // close response body as soon as we're done with it
+		if err != nil {
+			continue
+		}
+
+		// ensure we won't slam the diagnostics server
+		if reply.NextUpdate < 1*time.Second {
+			reply.NextUpdate = defaultUpdateInterval
+		}
+
+		// make sure we didn't send the update too soon; if so,
+		// just wait and try again -- this is a special case of
+		// error that we handle differently, as you can see
+		if resp.StatusCode == http.StatusTooManyRequests {
+			log.Printf("[NOTICE] Sending diagnostics: we were too early; waiting %s before trying again", reply.NextUpdate)
+			time.Sleep(reply.NextUpdate)
+			continue
+		} else if resp.StatusCode >= 400 {
+			err = fmt.Errorf("diagnostics server returned status code %d", resp.StatusCode)
+			continue
+		}
+
+		delivered = true
+		break
+	}
+	if err == nil && !delivered {
+		// every attempt was answered with "too many requests";
+		// that is a failure, not a success
+		err = fmt.Errorf("diagnostics server kept replying with status code %d; giving up", http.StatusTooManyRequests)
+	}
+	if err == nil {
+		// (remember, if there was an error, we return it
+		// below, so it will get logged if it's supposed to)
+		log.Println("[INFO] Sending diagnostics: success")
+	}
+
+	// even if there was an error after retrying, we should
+	// schedule the next update using our default update
+	// interval because the server might be healthy later;
+	// without this, a failed attempt leaves the interval at
+	// zero and the next update would fire immediately
+	if reply.NextUpdate < 1*time.Second {
+		reply.NextUpdate = defaultUpdateInterval
+	}
+
+	// schedule the next update (if this wasn't the last one and
+	// if the remote server didn't tell us to stop sending)
+	if !final && !reply.Stop {
+		updateTimerMu.Lock()
+		updateTimer = time.AfterFunc(reply.NextUpdate, func() {
+			logEmit(false)
+		})
+		updateTimerMu.Unlock()
+	}
+
+	return err
 }

+// makePayloadAndResetBuffer drains the collection
+// buffer and returns its contents, together with the
+// instance ID and the current UTC time, encoded as the
+// JSON payload to send to the server. Because the
+// buffer is emptied here, the returned bytes are the
+// only copy of the data: if the caller loses them,
+// the metrics are lost too.
+func makePayloadAndResetBuffer() ([]byte, error) {
+	// take ownership of the current buffer and leave
+	// a fresh, empty one in its place
+	bufferMu.Lock()
+	drained := buffer
+	buffer = make(map[string]interface{})
+	bufferItemCount = 0
+	bufferMu.Unlock()
+
+	// encode outside the lock so collectors aren't blocked
+	return json.Marshal(Payload{
+		InstanceID: instanceUUID.String(),
+		Timestamp:  time.Now().UTC(),
+		Data:       drained,
+	})
+}
+
+// Response contains the body of a response from the
+// diagnostics server. It is decoded from JSON by emit.
+type Response struct {
+	// NextUpdate is how long to wait before the next update.
+	// As a time.Duration it is decoded from a JSON number of
+	// nanoseconds. emit replaces values under one second
+	// with defaultUpdateInterval.
+	NextUpdate time.Duration `json:"next_update"`
+
+	// Stop instructs this client to stop sending diagnostics
+	// to the server. This would only be done under extenuating
+	// circumstances, but we are prepared for it nonetheless.
+	Stop bool `json:"stop,omitempty"`
+
+	// Error will be populated with an error message, if any.
+	// This field should be empty if the status code is < 400.
+	Error string `json:"error,omitempty"`
+}
+
+// Payload is the data that gets sent to the diagnostics server.
+// It is built by makePayloadAndResetBuffer and encoded as JSON.
+type Payload struct {
+	// The universally unique ID of the instance
+	InstanceID string `json:"instance_id"`
+
+	// The UTC timestamp of the transmission
+	Timestamp time.Time `json:"timestamp"`
+
+	// The metrics, keyed by metric name; this is the
+	// collection buffer as it was when the payload was
+	// made, and is omitted from the JSON when empty
+	Data map[string]interface{} `json:"data,omitempty"`
+}
+
+// httpClient should be used for HTTP requests. It
+// is configured with a timeout for reliability.
+var httpClient = http.Client{Timeout: 1 * time.Minute}
+
+// buffer holds the data that we are building up to send.
+// bufferItemCount tracks how many items it holds so that
+// collectors can stop adding once maxBufferItems is reached.
+// NOTE(review): bufferMu is an RWMutex, but the code visible
+// here only ever takes the write lock - confirm whether a
+// plain Mutex would do.
+var buffer = make(map[string]interface{})
+var bufferItemCount = 0
+var bufferMu sync.RWMutex // protects both the buffer and its count
+
+// updating is used to ensure only one
+// update happens at a time. It is
+// protected by updateMu.
+var updating bool
+var updateMu sync.Mutex
+
+// updateTimer fires off the next update.
+// If no update is scheduled, this is nil.
+// It is protected by updateTimerMu.
+var updateTimer *time.Timer
+var updateTimerMu sync.Mutex
+
+// instanceUUID is the ID of the current instance.
+// This MUST be set to emit diagnostics.
 var instanceUUID uuid.UUID
+
+// enabled indicates whether the package has
+// been initialized and can be actively used.
+// It is set by Init and read without locking.
+var enabled bool
+
+const (
+	// endpoint is the base URL to remote diagnostics server;
+	// the instance ID will be appended to it.
+	// NOTE(review): this points at localhost over plain HTTP,
+	// presumably a development placeholder - confirm before
+	// shipping.
+	endpoint = "http://localhost:8081/update/"
+
+	// defaultUpdateInterval is how long to wait before emitting
+	// more diagnostic data. This value is only used if the
+	// client receives a nonsensical value from the server, or
+	// if the server doesn't send one at all, indicating a likely
+	// problem with the server. Thus, this value should be a long
+	// duration to help alleviate extra load on the server.
+	defaultUpdateInterval = 1 * time.Hour
+
+	// maxBufferItems is the maximum number of items we'll allow
+	// in the buffer before we start dropping new ones, in a
+	// rough (simple) attempt to keep memory use under control.
+	maxBufferItems = 100000
+)
--- a/vendor/github.com/klauspost/cpuid/LICENSE
+++ b/vendor/github.com/klauspost/cpuid/LICENSE
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- a/vendor/github.com/klauspost/cpuid/cpuid.go
+++ b/vendor/github.com/klauspost/cpuid/cpuid.go
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// Package cpuid provides information about the CPU running the current program.
+//
+// CPU features are detected on startup, and kept for fast access through the life of the application.
+// Currently x86 / x64 (AMD64) is supported.
+//
+// You can access the CPU information by accessing the shared CPU variable of the cpuid library.
+//
+// Package home: https://github.com/klauspost/cpuid
+package cpuid
+
+import "strings"
+
+// Vendor is a representation of a CPU vendor.
+type Vendor int
+
+const (
+	Other Vendor = iota
+	Intel
+	AMD
+	VIA
+	Transmeta
+	NSC
+	KVM  // Kernel-based Virtual Machine
+	MSVM // Microsoft Hyper-V or Windows Virtual PC
+	VMware
+	XenHVM
+)
+
+const (
+	CMOV        = 1 << iota // i686 CMOV
+	NX                      // NX (No-Execute) bit
+	AMD3DNOW                // AMD 3DNOW
+	AMD3DNOWEXT             // AMD 3DNowExt
+	MMX                     // standard MMX
+	MMXEXT                  // SSE integer functions or AMD MMX ext
+	SSE                     // SSE functions
+	SSE2                    // P4 SSE functions
+	SSE3                    // Prescott SSE3 functions
+	SSSE3                   // Conroe SSSE3 functions
+	SSE4                    // Penryn SSE4.1 functions
+	SSE4A                   // AMD Barcelona microarchitecture SSE4a instructions
+	SSE42                   // Nehalem SSE4.2 functions
+	AVX                     // AVX functions
+	AVX2                    // AVX2 functions
+	FMA3                    // Intel FMA 3
+	FMA4                    // Bulldozer FMA4 functions
+	XOP                     // Bulldozer XOP functions
+	F16C                    // Half-precision floating-point conversion
+	BMI1                    // Bit Manipulation Instruction Set 1
+	BMI2                    // Bit Manipulation Instruction Set 2
+	TBM                     // AMD Trailing Bit Manipulation
+	LZCNT                   // LZCNT instruction
+	POPCNT                  // POPCNT instruction
+	AESNI                   // Advanced Encryption Standard New Instructions
+	CLMUL                   // Carry-less Multiplication
+	HTT                     // Hyperthreading (enabled)
+	HLE                     // Hardware Lock Elision
+	RTM                     // Restricted Transactional Memory
+	RDRAND                  // RDRAND instruction is available
+	RDSEED                  // RDSEED instruction is available
+	ADX                     // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+	SHA                     // Intel SHA Extensions
+	AVX512F                 // AVX-512 Foundation
+	AVX512DQ                // AVX-512 Doubleword and Quadword Instructions
+	AVX512IFMA              // AVX-512 Integer Fused Multiply-Add Instructions
+	AVX512PF                // AVX-512 Prefetch Instructions
+	AVX512ER                // AVX-512 Exponential and Reciprocal Instructions
+	AVX512CD                // AVX-512 Conflict Detection Instructions
+	AVX512BW                // AVX-512 Byte and Word Instructions
+	AVX512VL                // AVX-512 Vector Length Extensions
+	AVX512VBMI              // AVX-512 Vector Bit Manipulation Instructions
+	MPX                     // Intel MPX (Memory Protection Extensions)
+	ERMS                    // Enhanced REP MOVSB/STOSB
+	RDTSCP                  // RDTSCP Instruction
+	CX16                    // CMPXCHG16B Instruction
+	SGX                     // Software Guard Extensions
+
+	// Performance indicators
+	SSE2SLOW // SSE2 is supported, but usually not faster
+	SSE3SLOW // SSE3 is supported, but usually not faster
+	ATOM     // Atom processor, some SSSE3 instructions are slower
+)
+
+var flagNames = map[Flags]string{
+	CMOV:        "CMOV",        // i686 CMOV
+	NX:          "NX",          // NX (No-Execute) bit
+	AMD3DNOW:    "AMD3DNOW",    // AMD 3DNOW
+	AMD3DNOWEXT: "AMD3DNOWEXT", // AMD 3DNowExt
+	MMX:         "MMX",         // Standard MMX
+	MMXEXT:      "MMXEXT",      // SSE integer functions or AMD MMX ext
+	SSE:         "SSE",         // SSE functions
+	SSE2:        "SSE2",        // P4 SSE2 functions
+	SSE3:        "SSE3",        // Prescott SSE3 functions
+	SSSE3:       "SSSE3",       // Conroe SSSE3 functions
+	SSE4:        "SSE4.1",      // Penryn SSE4.1 functions
+	SSE4A:       "SSE4A",       // AMD Barcelona microarchitecture SSE4a instructions
+	SSE42:       "SSE4.2",      // Nehalem SSE4.2 functions
+	AVX:         "AVX",         // AVX functions
+	AVX2:        "AVX2",        // AVX functions
+	FMA3:        "FMA3",        // Intel FMA 3
+	FMA4:        "FMA4",        // Bulldozer FMA4 functions
+	XOP:         "XOP",         // Bulldozer XOP functions
+	F16C:        "F16C",        // Half-precision floating-point conversion
+	BMI1:        "BMI1",        // Bit Manipulation Instruction Set 1
+	BMI2:        "BMI2",        // Bit Manipulation Instruction Set 2
+	TBM:         "TBM",         // AMD Trailing Bit Manipulation
+	LZCNT:       "LZCNT",       // LZCNT instruction
+	POPCNT:      "POPCNT",      // POPCNT instruction
+	AESNI:       "AESNI",       // Advanced Encryption Standard New Instructions
+	CLMUL:       "CLMUL",       // Carry-less Multiplication
+	HTT:         "HTT",         // Hyperthreading (enabled)
+	HLE:         "HLE",         // Hardware Lock Elision
+	RTM:         "RTM",         // Restricted Transactional Memory
+	RDRAND:      "RDRAND",      // RDRAND instruction is available
+	RDSEED:      "RDSEED",      // RDSEED instruction is available
+	ADX:         "ADX",         // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+	SHA:         "SHA",         // Intel SHA Extensions
+	AVX512F:     "AVX512F",     // AVX-512 Foundation
+	AVX512DQ:    "AVX512DQ",    // AVX-512 Doubleword and Quadword Instructions
+	AVX512IFMA:  "AVX512IFMA",  // AVX-512 Integer Fused Multiply-Add Instructions
+	AVX512PF:    "AVX512PF",    // AVX-512 Prefetch Instructions
+	AVX512ER:    "AVX512ER",    // AVX-512 Exponential and Reciprocal Instructions
+	AVX512CD:    "AVX512CD",    // AVX-512 Conflict Detection Instructions
+	AVX512BW:    "AVX512BW",    // AVX-512 Byte and Word Instructions
+	AVX512VL:    "AVX512VL",    // AVX-512 Vector Length Extensions
+	AVX512VBMI:  "AVX512VBMI",  // AVX-512 Vector Bit Manipulation Instructions
+	MPX:         "MPX",         // Intel MPX (Memory Protection Extensions)
+	ERMS:        "ERMS",        // Enhanced REP MOVSB/STOSB
+	RDTSCP:      "RDTSCP",      // RDTSCP Instruction
+	CX16:        "CX16",        // CMPXCHG16B Instruction
+	SGX:         "SGX",         // Software Guard Extensions
+
+	// Performance indicators
+	SSE2SLOW: "SSE2SLOW", // SSE2 supported, but usually not faster
+	SSE3SLOW: "SSE3SLOW", // SSE3 supported, but usually not faster
+	ATOM:     "ATOM",     // Atom processor, some SSSE3 instructions are slower
+
+}
+
+// CPUInfo contains information about the detected system CPU.
+type CPUInfo struct {
+	BrandName      string // Brand name reported by the CPU
+	VendorID       Vendor // Comparable CPU vendor ID
+	Features       Flags  // Features of the CPU
+	PhysicalCores  int    // Number of physical processor cores in your CPU. Will be 0 if undetectable.
+	ThreadsPerCore int    // Number of threads per physical core. Will be 1 if undetectable.
+	LogicalCores   int    // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable.
+	Family         int    // CPU family number
+	Model          int    // CPU model number
+	CacheLine      int    // Cache line size in bytes. Will be 0 if undetectable.
+	Cache          struct {
+		L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected
+		L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected
+		L2  int // L2 Cache (per core or shared). Will be -1 if undetected
+		L3  int // L3 Instruction Cache (per core or shared). Will be -1 if undetected
+	}
+	SGX       SGXSupport
+	maxFunc   uint32
+	maxExFunc uint32
+}
+
+var cpuid func(op uint32) (eax, ebx, ecx, edx uint32)
+var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+var xgetbv func(index uint32) (eax, edx uint32)
+var rdtscpAsm func() (eax, ebx, ecx, edx uint32)
+
+// CPU contains information about the CPU as detected on startup,
+// or when Detect last was called.
+//
+// Use this as the primary entry point to you data,
+// this way queries are
+var CPU CPUInfo
+
+func init() {
+	initCPU()
+	Detect()
+}
+
+// Detect will re-detect current CPU info.
+// This will replace the content of the exported CPU variable.
+//
+// Unless you expect the CPU to change while you are running your program
+// you should not need to call this function.
+// If you call this, you must ensure that no other goroutine is accessing the
+// exported CPU variable.
+func Detect() {
+	CPU.maxFunc = maxFunctionID()
+	CPU.maxExFunc = maxExtendedFunction()
+	CPU.BrandName = brandName()
+	CPU.CacheLine = cacheLine()
+	CPU.Family, CPU.Model = familyModel()
+	CPU.Features = support()
+	CPU.SGX = hasSGX(CPU.Features&SGX != 0)
+	CPU.ThreadsPerCore = threadsPerCore()
+	CPU.LogicalCores = logicalCores()
+	CPU.PhysicalCores = physicalCores()
+	CPU.VendorID = vendorID()
+	CPU.cacheSize()
+}
+
+// Generated here: http://play.golang.org/p/BxFH2Gdc0G
+
+// Cmov indicates support of CMOV instructions
+func (c CPUInfo) Cmov() bool {
+	return c.Features&CMOV != 0
+}
+
+// Amd3dnow indicates support of AMD 3DNOW! instructions
+func (c CPUInfo) Amd3dnow() bool {
+	return c.Features&AMD3DNOW != 0
+}
+
+// Amd3dnowExt indicates support of AMD 3DNOW! Extended instructions
+func (c CPUInfo) Amd3dnowExt() bool {
+	return c.Features&AMD3DNOWEXT != 0
+}
+
+// MMX indicates support of MMX instructions
+func (c CPUInfo) MMX() bool {
+	return c.Features&MMX != 0
+}
+
+// MMXExt indicates support of MMXEXT instructions
+// (SSE integer functions or AMD MMX ext)
+func (c CPUInfo) MMXExt() bool {
+	return c.Features&MMXEXT != 0
+}
+
+// SSE indicates support of SSE instructions
+func (c CPUInfo) SSE() bool {
+	return c.Features&SSE != 0
+}
+
+// SSE2 indicates support of SSE 2 instructions
+func (c CPUInfo) SSE2() bool {
+	return c.Features&SSE2 != 0
+}
+
+// SSE3 indicates support of SSE 3 instructions
+func (c CPUInfo) SSE3() bool {
+	return c.Features&SSE3 != 0
+}
+
+// SSSE3 indicates support of SSSE 3 instructions
+func (c CPUInfo) SSSE3() bool {
+	return c.Features&SSSE3 != 0
+}
+
+// SSE4 indicates support of SSE 4 (also called SSE 4.1) instructions
+func (c CPUInfo) SSE4() bool {
+	return c.Features&SSE4 != 0
+}
+
+// SSE42 indicates support of SSE4.2 instructions
+func (c CPUInfo) SSE42() bool {
+	return c.Features&SSE42 != 0
+}
+
+// AVX indicates support of AVX instructions
+// and operating system support of AVX instructions
+func (c CPUInfo) AVX() bool {
+	return c.Features&AVX != 0
+}
+
+// AVX2 indicates support of AVX2 instructions
+func (c CPUInfo) AVX2() bool {
+	return c.Features&AVX2 != 0
+}
+
+// FMA3 indicates support of FMA3 instructions
+func (c CPUInfo) FMA3() bool {
+	return c.Features&FMA3 != 0
+}
+
+// FMA4 indicates support of FMA4 instructions
+func (c CPUInfo) FMA4() bool {
+	return c.Features&FMA4 != 0
+}
+
+// XOP indicates support of XOP instructions
+func (c CPUInfo) XOP() bool {
+	return c.Features&XOP != 0
+}
+
+// F16C indicates support of F16C instructions
+func (c CPUInfo) F16C() bool {
+	return c.Features&F16C != 0
+}
+
+// BMI1 indicates support of BMI1 instructions
+func (c CPUInfo) BMI1() bool {
+	return c.Features&BMI1 != 0
+}
+
+// BMI2 indicates support of BMI2 instructions
+func (c CPUInfo) BMI2() bool {
+	return c.Features&BMI2 != 0
+}
+
+// TBM reports whether the CPU supports TBM instructions
+// (AMD Trailing Bit Manipulation).
+func (c CPUInfo) TBM() bool {
+	return (c.Features & TBM) != 0
+}
+
+// Lzcnt reports whether the CPU supports the LZCNT instruction.
+func (c CPUInfo) Lzcnt() bool {
+	return (c.Features & LZCNT) != 0
+}
+
+// Popcnt reports whether the CPU supports the POPCNT instruction.
+func (c CPUInfo) Popcnt() bool {
+	return (c.Features & POPCNT) != 0
+}
+
+// HTT reports whether the processor has Hyperthreading enabled.
+func (c CPUInfo) HTT() bool {
+	return (c.Features & HTT) != 0
+}
+
+// SSE2Slow reports whether SSE2 may be slow on this processor.
+func (c CPUInfo) SSE2Slow() bool {
+	return (c.Features & SSE2SLOW) != 0
+}
+
+// SSE3Slow reports whether SSE3 may be slow on this processor.
+func (c CPUInfo) SSE3Slow() bool {
+	return (c.Features & SSE3SLOW) != 0
+}
+
+// AesNi reports whether the CPU supports AES-NI instructions
+// (Advanced Encryption Standard New Instructions).
+func (c CPUInfo) AesNi() bool {
+	return (c.Features & AESNI) != 0
+}
+
+// Clmul reports whether the CPU supports CLMUL instructions
+// (Carry-less Multiplication).
+func (c CPUInfo) Clmul() bool {
+	return (c.Features & CLMUL) != 0
+}
+
+// NX reports whether the CPU supports the NX (No-Execute) bit.
+func (c CPUInfo) NX() bool {
+	return (c.Features & NX) != 0
+}
+
+// SSE4A reports whether the CPU supports the SSE4a instructions of the
+// AMD Barcelona microarchitecture.
+func (c CPUInfo) SSE4A() bool {
+	return (c.Features & SSE4A) != 0
+}
+
+// HLE reports whether the CPU supports Hardware Lock Elision.
+func (c CPUInfo) HLE() bool {
+	return (c.Features & HLE) != 0
+}
+
+// RTM reports whether the CPU supports Restricted Transactional Memory.
+func (c CPUInfo) RTM() bool {
+	return (c.Features & RTM) != 0
+}
+
+// Rdrand reports whether the RDRAND instruction is available.
+func (c CPUInfo) Rdrand() bool {
+	return (c.Features & RDRAND) != 0
+}
+
+// Rdseed reports whether the RDSEED instruction is available.
+func (c CPUInfo) Rdseed() bool {
+	return (c.Features & RDSEED) != 0
+}
+
+// ADX reports whether the CPU supports Intel ADX
+// (Multi-Precision Add-Carry Instruction Extensions).
+func (c CPUInfo) ADX() bool {
+	return (c.Features & ADX) != 0
+}
+
+// SHA reports whether the CPU supports the Intel SHA Extensions.
+func (c CPUInfo) SHA() bool {
+	return (c.Features & SHA) != 0
+}
+
+// AVX512F reports whether the CPU supports AVX-512 Foundation.
+func (c CPUInfo) AVX512F() bool {
+	return (c.Features & AVX512F) != 0
+}
+
+// AVX512DQ reports whether the CPU supports the AVX-512 Doubleword and
+// Quadword Instructions.
+func (c CPUInfo) AVX512DQ() bool {
+	return (c.Features & AVX512DQ) != 0
+}
+
+// AVX512IFMA reports whether the CPU supports the AVX-512 Integer Fused
+// Multiply-Add Instructions.
+func (c CPUInfo) AVX512IFMA() bool {
+	return (c.Features & AVX512IFMA) != 0
+}
+
+// AVX512PF reports whether the CPU supports the AVX-512 Prefetch Instructions.
+func (c CPUInfo) AVX512PF() bool {
+	return (c.Features & AVX512PF) != 0
+}
+
+// AVX512ER reports whether the CPU supports the AVX-512 Exponential and
+// Reciprocal Instructions.
+func (c CPUInfo) AVX512ER() bool {
+	return (c.Features & AVX512ER) != 0
+}
+
+// AVX512CD reports whether the CPU supports the AVX-512 Conflict Detection
+// Instructions.
+func (c CPUInfo) AVX512CD() bool {
+	return (c.Features & AVX512CD) != 0
+}
+
+// AVX512BW reports whether the CPU supports the AVX-512 Byte and Word
+// Instructions.
+func (c CPUInfo) AVX512BW() bool {
+	return (c.Features & AVX512BW) != 0
+}
+
+// AVX512VL reports whether the CPU supports the AVX-512 Vector Length
+// Extensions.
+func (c CPUInfo) AVX512VL() bool {
+	return (c.Features & AVX512VL) != 0
+}
+
+// AVX512VBMI reports whether the CPU supports the AVX-512 Vector Bit
+// Manipulation Instructions.
+func (c CPUInfo) AVX512VBMI() bool {
+	return (c.Features & AVX512VBMI) != 0
+}
+
+// MPX reports whether the CPU supports Intel MPX
+// (Memory Protection Extensions).
+func (c CPUInfo) MPX() bool {
+	return (c.Features & MPX) != 0
+}
+
+// ERMS reports whether the CPU supports Enhanced REP MOVSB/STOSB.
+func (c CPUInfo) ERMS() bool {
+	return (c.Features & ERMS) != 0
+}
+
+// RDTSCP reports whether the RDTSCP instruction is available.
+func (c CPUInfo) RDTSCP() bool {
+	return (c.Features & RDTSCP) != 0
+}
+
+// CX16 reports whether the CMPXCHG16B instruction is available.
+func (c CPUInfo) CX16() bool {
+	return (c.Features & CX16) != 0
+}
+
+// TSX reports whether Intel TSX is fully available. TSX is split into
+// HLE (Hardware Lock Elision) and RTM (Restricted Transactional Memory)
+// detection, so this returns true only when both of those flags are set.
+func (c CPUInfo) TSX() bool {
+	// Both halves are required; MPX is unrelated to TSX and must not be
+	// part of this mask.
+	return c.Features&(HLE|RTM) == HLE|RTM
+}
+
+// Atom reports whether the processor was detected as an Atom.
+func (c CPUInfo) Atom() bool {
+	return (c.Features & ATOM) != 0
+}
+
+// Intel reports whether the vendor was recognized as Intel.
+func (c CPUInfo) Intel() bool {
+	vendor := c.VendorID
+	return vendor == Intel
+}
+
+// AMD reports whether the vendor was recognized as AMD.
+func (c CPUInfo) AMD() bool {
+	vendor := c.VendorID
+	return vendor == AMD
+}
+
+// Transmeta reports whether the vendor was recognized as Transmeta.
+func (c CPUInfo) Transmeta() bool {
+	vendor := c.VendorID
+	return vendor == Transmeta
+}
+
+// NSC reports whether the vendor was recognized as National Semiconductor.
+func (c CPUInfo) NSC() bool {
+	vendor := c.VendorID
+	return vendor == NSC
+}
+
+// VIA reports whether the vendor was recognized as VIA.
+func (c CPUInfo) VIA() bool {
+	vendor := c.VendorID
+	return vendor == VIA
+}
+
+// RTCounter reads the 64-bit time-stamp counter using the RDTSCP
+// instruction. It returns 0 when the CPU does not support RDTSCP.
+func (c CPUInfo) RTCounter() uint64 {
+	if c.RDTSCP() {
+		// The counter is returned split across two 32-bit registers.
+		lo, _, _, hi := rdtscpAsm()
+		return uint64(hi)<<32 | uint64(lo)
+	}
+	return 0
+}
+
+// Ia32TscAux returns the IA32_TSC_AUX value delivered by RDTSCP.
+// The value is OS dependent; on Linux it contains information about the
+// cpu/core the code is currently running on.
+// It returns 0 when the CPU does not support the RDTSCP instruction.
+func (c CPUInfo) Ia32TscAux() uint32 {
+	if c.RDTSCP() {
+		_, _, aux, _ := rdtscpAsm()
+		return aux
+	}
+	return 0
+}
+
+// LogicalCPU returns the logical CPU the code is currently executing on.
+// The answer is likely to change whenever the OS re-schedules the running
+// thread onto another CPU.
+// It returns -1 when the current core cannot be detected.
+func (c CPUInfo) LogicalCPU() int {
+	if c.maxFunc >= 1 {
+		// The id is taken from the top byte of EBX of CPUID leaf 1.
+		_, regB, _, _ := cpuid(1)
+		return int(regB >> 24)
+	}
+	return -1
+}
+
+// VM reports whether the CPU vendor id is one of the known hypervisor ids
+// (MSVM, KVM, VMware, XenHVM), i.e. we appear to run inside a virtual
+// machine. This is only a hint and will very likely have many false
+// negatives.
+func (c CPUInfo) VM() bool {
+	v := c.VendorID
+	return v == MSVM || v == KVM || v == VMware || v == XenHVM
+}
+
+// Flags is a bit set of detected CPU features and characteristics.
+// Each feature occupies one bit of the 64-bit value.
+type Flags uint64
+
+// String returns the detected CPU features as a single comma-separated
+// string, built from the names returned by Strings.
+func (f Flags) String() string {
+	names := f.Strings()
+	return strings.Join(names, ",")
+}
+
+// Strings returns the names of the features set in f, ordered from the
+// lowest flag bit to the highest. Names are looked up in flagNames.
+func (f Flags) Strings() []string {
+	r := make([]string, 0, 20)
+	for i := uint(0); i < 64; i++ {
+		key := Flags(1 << i)
+		// Test the receiver itself: re-running support() here would make
+		// every Flags value print the host's features instead of its own.
+		if f&key != 0 {
+			r = append(r, flagNames[key])
+		}
+	}
+	return r
+}
+
+// maxExtendedFunction returns EAX of CPUID leaf 0x80000000, which is
+// compared against extended leaf numbers elsewhere in this file.
+func maxExtendedFunction() uint32 {
+	maxLeaf, _, _, _ := cpuid(0x80000000)
+	return maxLeaf
+}
+
+// maxFunctionID returns EAX of CPUID leaf 0, which is compared against
+// standard leaf numbers elsewhere in this file.
+func maxFunctionID() uint32 {
+	maxLeaf, _, _, _ := cpuid(0)
+	return maxLeaf
+}
+
+// brandName returns the processor brand string assembled from the CPUID
+// leaves 0x80000002 through 0x80000004, with surrounding spaces removed.
+// It returns "unknown" when those leaves are not available.
+func brandName() string {
+	if maxExtendedFunction() < 0x80000004 {
+		return "unknown"
+	}
+	regs := make([]uint32, 0, 48)
+	for leaf := uint32(0x80000002); leaf < 0x80000005; leaf++ {
+		eax, ebx, ecx, edx := cpuid(leaf)
+		regs = append(regs, eax, ebx, ecx, edx)
+	}
+	name := string(valAsString(regs...))
+	return strings.Trim(name, " ")
+}
+
+// threadsPerCore returns the number of hardware threads per physical core.
+// The result is always at least 1: non-Intel CPUs, CPUs without CPUID leaf 4
+// and CPUs that report no usable count all yield 1.
+func threadsPerCore() int {
+	mfi := maxFunctionID()
+	if mfi < 0x4 || vendorID() != Intel {
+		return 1
+	}
+
+	if mfi < 0xb {
+		// Legacy path: derive the value from leaf 1 and leaf 4.
+		_, b, _, d := cpuid(1)
+		if (d & (1 << 28)) != 0 {
+			// v will contain logical core count
+			v := (b >> 16) & 255
+			if v > 1 {
+				a4, _, _, _ := cpuid(4)
+				// physical cores (always >= 1 because of the +1)
+				v2 := (a4 >> 26) + 1
+				// Guard against a zero quotient (v2 > v): callers divide
+				// by this value, so it must never be reported as 0.
+				if n := int(v) / int(v2); n > 0 {
+					return n
+				}
+			}
+		}
+		return 1
+	}
+	_, b, _, _ := cpuidex(0xb, 0)
+	if b&0xffff == 0 {
+		return 1
+	}
+	return int(b & 0xffff)
+}
+
+// logicalCores returns the logical processor count reported through CPUID
+// for Intel and AMD CPUs, and 0 for every other vendor.
+func logicalCores() int {
+	maxLeaf := maxFunctionID()
+	vendor := vendorID()
+
+	if vendor == AMD {
+		_, ebx, _, _ := cpuid(1)
+		return int((ebx >> 16) & 0xff)
+	}
+	if vendor != Intel {
+		return 0
+	}
+
+	if maxLeaf >= 0xb {
+		_, ebx, _, _ := cpuidex(0xb, 1)
+		return int(ebx & 0xffff)
+	}
+	// Use this on old Intel processors
+	if maxLeaf < 1 {
+		return 0
+	}
+	// CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID)
+	// that can be assigned to logical processors in a physical package.
+	// The value may not be the same as the number of logical processors that are present in the hardware of a physical package.
+	_, ebx, _, _ := cpuid(1)
+	return int((ebx >> 16) & 0xff)
+}
+
+// familyModel returns the CPU family and model decoded from EAX of CPUID
+// leaf 1. Both are 0 when leaf 1 is not available.
+func familyModel() (int, int) {
+	if maxFunctionID() < 0x1 {
+		return 0, 0
+	}
+	sig, _, _, _ := cpuid(1)
+	baseFamily, extFamily := (sig>>8)&0xf, (sig>>20)&0xff
+	baseModel, extModel := (sig>>4)&0xf, (sig>>12)&0xf0
+	return int(baseFamily + extFamily), int(baseModel + extModel)
+}
+
+// physicalCores returns the number of physical cores for Intel and AMD
+// CPUs. It returns 0 for other vendors and for AMD CPUs without the
+// extended leaf 0x80000008.
+func physicalCores() int {
+	vendor := vendorID()
+	if vendor == Intel {
+		return logicalCores() / threadsPerCore()
+	}
+	if vendor == AMD && maxExtendedFunction() >= 0x80000008 {
+		_, _, ecx, _ := cpuid(0x80000008)
+		return int(ecx&0xff) + 1
+	}
+	return 0
+}
+
+// vendorMapping translates the 12-character vendor string returned by
+// CPUID leaf 0 into a Vendor value. Strings that are not listed here are
+// reported as Other by vendorID.
+//
+// Excerpt from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
+var vendorMapping = map[string]Vendor{
+	"AMDisbetter!": AMD,
+	"AuthenticAMD": AMD,
+	"CentaurHauls": VIA,
+	"GenuineIntel": Intel,
+	"TransmetaCPU": Transmeta,
+	"GenuineTMx86": Transmeta,
+	"Geode by NSC": NSC,
+	"VIA VIA VIA ": VIA,
+	"KVMKVMKVMKVM": KVM,
+	"Microsoft Hv": MSVM,
+	"VMwareVMware": VMware,
+	"XenVMMXenVMM": XenHVM,
+}
+
+// vendorID returns the Vendor matching the vendor string of CPUID leaf 0
+// (registers EBX, EDX, ECX in that order), or Other when the string is
+// not present in vendorMapping.
+func vendorID() Vendor {
+	_, ebx, ecx, edx := cpuid(0)
+	name := string(valAsString(ebx, edx, ecx))
+	if vend, known := vendorMapping[name]; known {
+		return vend
+	}
+	return Other
+}
+
+// cacheLine returns the cache line size. It prefers the CLFLUSH size from
+// CPUID leaf 1 and falls back to the line size of extended leaf 0x80000006
+// when the former is zero. It returns 0 when leaf 1 is not available.
+func cacheLine() int {
+	if maxFunctionID() < 0x1 {
+		return 0
+	}
+
+	_, ebx, _, _ := cpuid(1)
+	size := (ebx & 0xff00) >> 5 // cflush size
+	if size != 0 || maxExtendedFunction() < 0x80000006 {
+		// TODO: Read from Cache and TLB Information
+		return int(size)
+	}
+	_, _, ecx, _ := cpuid(0x80000006)
+	return int(ecx & 0xff) // cacheline size
+}
+
+// cacheSize fills c.Cache with the L1 data, L1 instruction, L2 and L3
+// cache sizes. Every size is first set to -1 and only overwritten for the
+// levels the CPU reports.
+//
+// Intel CPUs are queried through the sub-leaves of CPUID leaf 4, AMD CPUs
+// through the extended leaves 0x80000005 (L1) and 0x80000006 (L2). For any
+// other vendor all sizes stay -1.
+func (c *CPUInfo) cacheSize() {
+	c.Cache.L1D = -1
+	c.Cache.L1I = -1
+	c.Cache.L2 = -1
+	c.Cache.L3 = -1
+	vendor := vendorID()
+	switch vendor {
+	case Intel:
+		if maxFunctionID() < 4 {
+			return
+		}
+		// Walk the sub-leaves until one reports cache type 0.
+		for i := uint32(0); ; i++ {
+			eax, ebx, ecx, _ := cpuidex(4, i)
+			cacheType := eax & 15
+			if cacheType == 0 {
+				break
+			}
+			cacheLevel := (eax >> 5) & 7
+			coherency := int(ebx&0xfff) + 1
+			partitions := int((ebx>>12)&0x3ff) + 1
+			associativity := int((ebx>>22)&0x3ff) + 1
+			sets := int(ecx) + 1
+			size := associativity * partitions * coherency * sets
+			switch cacheLevel {
+			case 1:
+				if cacheType == 1 {
+					// 1 = Data Cache
+					c.Cache.L1D = size
+				} else if cacheType == 2 {
+					// 2 = Instruction Cache
+					c.Cache.L1I = size
+				} else {
+					// Any other type at level 1: use the size for
+					// whichever of L1D/L1I has not been reported yet.
+					if c.Cache.L1D < 0 {
+						c.Cache.L1D = size
+					}
+					if c.Cache.L1I < 0 {
+						c.Cache.L1I = size
+					}
+				}
+			case 2:
+				c.Cache.L2 = size
+			case 3:
+				c.Cache.L3 = size
+			}
+		}
+	case AMD:
+		// Untested.
+		if maxExtendedFunction() < 0x80000005 {
+			return
+		}
+		_, _, ecx, edx := cpuid(0x80000005)
+		c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024)
+		c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024)
+
+		if maxExtendedFunction() < 0x80000006 {
+			return
+		}
+		_, _, ecx, _ = cpuid(0x80000006)
+		c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
+	}
+}
+
+// SGXSupport describes the SGX capabilities reported by the CPU.
+// It is filled in by hasSGX from CPUID leaf 0x12, sub-leaf 0.
+type SGXSupport struct {
+	// Available mirrors the flag passed to hasSGX; the remaining fields
+	// are only populated when it is true.
+	Available           bool
+	// SGX1Supported is bit 0 of EAX.
+	SGX1Supported       bool
+	// SGX2Supported is bit 1 of EAX.
+	SGX2Supported       bool
+	// MaxEnclaveSizeNot64 is 2 to the power of the low byte of EDX
+	// (presumably the limit outside 64-bit mode; confirm against the SDM).
+	MaxEnclaveSizeNot64 int64
+	// MaxEnclaveSize64 is 2 to the power of the second byte of EDX
+	// (presumably the limit in 64-bit mode; confirm against the SDM).
+	MaxEnclaveSize64    int64
+}
+
+// hasSGX builds the SGXSupport description. When available is false only
+// the Available field is set; otherwise the details are read from CPUID
+// leaf 0x12, sub-leaf 0.
+func hasSGX(available bool) (rval SGXSupport) {
+	rval.Available = available
+	if available {
+		eax, _, _, edx := cpuidex(0x12, 0)
+		rval.SGX1Supported = eax&0x01 != 0
+		rval.SGX2Supported = eax&0x02 != 0
+		rval.MaxEnclaveSizeNot64 = 1 << (edx & 0xFF)     // pow 2
+		rval.MaxEnclaveSize64 = 1 << ((edx >> 8) & 0xFF) // pow 2
+	}
+	return rval
+}
+
+// support probes the CPU through CPUID (and XGETBV where needed) and
+// returns the set of detected feature flags.
+//
+// It returns 0 when CPUID leaf 1 is not available. Otherwise it reads, in
+// order: leaf 1 (basic features), XCR0 (OS support for AVX state), leaf 7
+// sub-leaf 0 (extended features, including AVX-512) and extended leaf
+// 0x80000001 (AMD-defined features plus a few vendor/model specific
+// "slow" hints). Flags that depend on AVX are only set when AVX itself
+// was detected earlier in the function, so the order of the sections
+// matters.
+func support() Flags {
+	mfi := maxFunctionID()
+	vend := vendorID()
+	if mfi < 0x1 {
+		return 0
+	}
+	rval := uint64(0)
+	// Leaf 1: c is ECX and d is EDX.
+	_, _, c, d := cpuid(1)
+	if (d & (1 << 15)) != 0 {
+		rval |= CMOV
+	}
+	if (d & (1 << 23)) != 0 {
+		rval |= MMX
+	}
+	// The same bit (EDX bit 25) sets both MMXEXT and SSE.
+	if (d & (1 << 25)) != 0 {
+		rval |= MMXEXT
+	}
+	if (d & (1 << 25)) != 0 {
+		rval |= SSE
+	}
+	if (d & (1 << 26)) != 0 {
+		rval |= SSE2
+	}
+	if (c & 1) != 0 {
+		rval |= SSE3
+	}
+	if (c & 0x00000200) != 0 {
+		rval |= SSSE3
+	}
+	if (c & 0x00080000) != 0 {
+		rval |= SSE4
+	}
+	if (c & 0x00100000) != 0 {
+		rval |= SSE42
+	}
+	if (c & (1 << 25)) != 0 {
+		rval |= AESNI
+	}
+	if (c & (1 << 1)) != 0 {
+		rval |= CLMUL
+	}
+	if c&(1<<23) != 0 {
+		rval |= POPCNT
+	}
+	if c&(1<<30) != 0 {
+		rval |= RDRAND
+	}
+	if c&(1<<29) != 0 {
+		rval |= F16C
+	}
+	if c&(1<<13) != 0 {
+		rval |= CX16
+	}
+	// HTT is only reported for Intel CPUs that actually run more than one
+	// thread per core.
+	if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 {
+		if threadsPerCore() > 1 {
+			rval |= HTT
+		}
+	}
+
+	// Check XGETBV, OXSAVE and AVX bits
+	if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
+		// Check for OS support
+		eax, _ := xgetbv(0)
+		if (eax & 0x6) == 0x6 {
+			rval |= AVX
+			if (c & 0x00001000) != 0 {
+				rval |= FMA3
+			}
+		}
+	}
+
+	// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
+	if mfi >= 7 {
+		_, ebx, ecx, _ := cpuidex(7, 0)
+		if (rval&AVX) != 0 && (ebx&0x00000020) != 0 {
+			rval |= AVX2
+		}
+		// BMI2 is only recorded when BMI1 is present as well.
+		if (ebx & 0x00000008) != 0 {
+			rval |= BMI1
+			if (ebx & 0x00000100) != 0 {
+				rval |= BMI2
+			}
+		}
+		if ebx&(1<<2) != 0 {
+			rval |= SGX
+		}
+		if ebx&(1<<4) != 0 {
+			rval |= HLE
+		}
+		if ebx&(1<<9) != 0 {
+			rval |= ERMS
+		}
+		if ebx&(1<<11) != 0 {
+			rval |= RTM
+		}
+		if ebx&(1<<14) != 0 {
+			rval |= MPX
+		}
+		if ebx&(1<<18) != 0 {
+			rval |= RDSEED
+		}
+		if ebx&(1<<19) != 0 {
+			rval |= ADX
+		}
+		if ebx&(1<<29) != 0 {
+			rval |= SHA
+		}
+
+		// Only detect AVX-512 features if XGETBV is supported
+		// (c still holds ECX of leaf 1 here).
+		if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
+			// Check for OS support
+			eax, _ := xgetbv(0)
+
+			// Verify that XCR0[7:5] = '111b' (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
+			// ZMM16-ZMM31 state are enabled by OS)
+			// and that XCR0[2:1] = '11b' (XMM state and YMM state are enabled by OS).
+			if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 {
+				if ebx&(1<<16) != 0 {
+					rval |= AVX512F
+				}
+				if ebx&(1<<17) != 0 {
+					rval |= AVX512DQ
+				}
+				if ebx&(1<<21) != 0 {
+					rval |= AVX512IFMA
+				}
+				if ebx&(1<<26) != 0 {
+					rval |= AVX512PF
+				}
+				if ebx&(1<<27) != 0 {
+					rval |= AVX512ER
+				}
+				if ebx&(1<<28) != 0 {
+					rval |= AVX512CD
+				}
+				if ebx&(1<<30) != 0 {
+					rval |= AVX512BW
+				}
+				if ebx&(1<<31) != 0 {
+					rval |= AVX512VL
+				}
+				// ecx
+				if ecx&(1<<1) != 0 {
+					rval |= AVX512VBMI
+				}
+			}
+		}
+	}
+
+	if maxExtendedFunction() >= 0x80000001 {
+		// From here on c and d shadow the leaf 1 values with ECX/EDX of
+		// extended leaf 0x80000001.
+		_, _, c, d := cpuid(0x80000001)
+		// This bit sets LZCNT and also POPCNT.
+		if (c & (1 << 5)) != 0 {
+			rval |= LZCNT
+			rval |= POPCNT
+		}
+		if (d & (1 << 31)) != 0 {
+			rval |= AMD3DNOW
+		}
+		if (d & (1 << 30)) != 0 {
+			rval |= AMD3DNOWEXT
+		}
+		if (d & (1 << 23)) != 0 {
+			rval |= MMX
+		}
+		if (d & (1 << 22)) != 0 {
+			rval |= MMXEXT
+		}
+		if (c & (1 << 6)) != 0 {
+			rval |= SSE4A
+		}
+		if d&(1<<20) != 0 {
+			rval |= NX
+		}
+		if d&(1<<27) != 0 {
+			rval |= RDTSCP
+		}
+
+		/* Allow for selectively disabling SSE2 functions on AMD processors
+		   with SSE2 support but not SSE4a. This includes Athlon64, some
+		   Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
+		   than SSE2 often enough to utilize this special-case flag.
+		   AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
+		   so that SSE2 is used unless explicitly disabled by checking
+		   AV_CPU_FLAG_SSE2SLOW. */
+		if vendorID() != Intel &&
+			rval&SSE2 != 0 && (c&0x00000040) == 0 {
+			rval |= SSE2SLOW
+		}
+
+		/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
+		 * used unless the OS has AVX support. */
+		if (rval & AVX) != 0 {
+			if (c & 0x00000800) != 0 {
+				rval |= XOP
+			}
+			if (c & 0x00010000) != 0 {
+				rval |= FMA4
+			}
+		}
+
+		if vendorID() == Intel {
+			family, model := familyModel()
+			if family == 6 && (model == 9 || model == 13 || model == 14) {
+				/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
+				 * 6/14 (core1 "yonah") theoretically support sse2, but it's
+				 * usually slower than mmx. */
+				if (rval & SSE2) != 0 {
+					rval |= SSE2SLOW
+				}
+				if (rval & SSE3) != 0 {
+					rval |= SSE3SLOW
+				}
+			}
+			/* The Atom processor has SSSE3 support, which is useful in many cases,
+			 * but sometimes the SSSE3 version is slower than the SSE2 equivalent
+			 * on the Atom, but is generally faster on other processors supporting
+			 * SSSE3. This flag allows for selectively disabling certain SSSE3
+			 * functions on the Atom. */
+			if family == 6 && model == 28 {
+				rval |= ATOM
+			}
+		}
+	}
+	return Flags(rval)
+}
+
+// valAsString unpacks the given 32-bit words into bytes, least significant
+// byte first, and returns them as one byte slice. The result stops right
+// before the first zero byte, so it can be used as a NUL-terminated string.
+func valAsString(values ...uint32) []byte {
+	out := make([]byte, 0, 4*len(values))
+	for _, word := range values {
+		for shift := uint(0); shift < 32; shift += 8 {
+			b := byte(word >> shift)
+			if b == 0 {
+				// Terminator found: everything collected so far is the string.
+				return out
+			}
+			out = append(out, b)
+		}
+	}
+	return out
+}
--- a/vendor/github.com/klauspost/cpuid/cpuid_386.s
+++ b/vendor/github.com/klauspost/cpuid/cpuid_386.s
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+//
+// asmCpuid executes CPUID with EAX = op and ECX = 0 and returns the four
+// result registers.
+TEXT ·asmCpuid(SB), 7, $0
+	XORL CX, CX // clear ECX before CPUID
+	MOVL op+0(FP), AX
+	CPUID
+	MOVL AX, eax+4(FP)
+	MOVL BX, ebx+8(FP)
+	MOVL CX, ecx+12(FP)
+	MOVL DX, edx+16(FP)
+	RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+//
+// asmCpuidex executes CPUID with EAX = op and ECX = op2 and returns the
+// four result registers.
+TEXT ·asmCpuidex(SB), 7, $0
+	MOVL op+0(FP), AX
+	MOVL op2+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32)
+//
+// asmXgetbv executes XGETBV with ECX = index and returns EAX and EDX.
+// The instruction is emitted as raw bytes.
+TEXT ·asmXgetbv(SB), 7, $0
+	MOVL index+0(FP), CX
+	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+	MOVL AX, eax+4(FP)
+	MOVL DX, edx+8(FP)
+	RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+//
+// asmRdtscpAsm executes RDTSCP (emitted as raw bytes) and returns the
+// contents of AX, BX, CX and DX afterwards.
+// NOTE(review): RDTSCP presumably does not write EBX, so the ebx result
+// would be whatever the register held before; confirm against the SDM.
+TEXT ·asmRdtscpAsm(SB), 7, $0
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
+	MOVL AX, eax+0(FP)
+	MOVL BX, ebx+4(FP)
+	MOVL CX, ecx+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
--- a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
+++ b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+//+build amd64,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+//
+// asmCpuid executes CPUID with EAX = op and RCX cleared and returns the
+// four result registers. The results start at offset 8 of the frame.
+TEXT ·asmCpuid(SB), 7, $0
+	XORQ CX, CX // clear RCX before CPUID
+	MOVL op+0(FP), AX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+//
+// asmCpuidex executes CPUID with EAX = op and ECX = op2 and returns the
+// four result registers.
+TEXT ·asmCpuidex(SB), 7, $0
+	MOVL op+0(FP), AX
+	MOVL op2+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32)
+//
+// asmXgetbv executes XGETBV with ECX = index and returns EAX and EDX.
+// The instruction is emitted as raw bytes.
+TEXT ·asmXgetbv(SB), 7, $0
+	MOVL index+0(FP), CX
+	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+	MOVL AX, eax+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+//
+// asmRdtscpAsm executes RDTSCP (emitted as raw bytes) and returns the
+// contents of AX, BX, CX and DX afterwards.
+// NOTE(review): RDTSCP presumably does not write EBX, so the ebx result
+// would be whatever the register held before; confirm against the SDM.
+TEXT ·asmRdtscpAsm(SB), 7, $0
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
+	MOVL AX, eax+0(FP)
+	MOVL BX, ebx+4(FP)
+	MOVL CX, ecx+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
--- a/vendor/github.com/klauspost/cpuid/detect_intel.go
+++ b/vendor/github.com/klauspost/cpuid/detect_intel.go
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo amd64,!gccgo
+
+package cpuid
+
+// The functions below have no Go body; they are implemented in the
+// cpuid_386.s and cpuid_amd64.s assembler files.
+
+// asmCpuid executes CPUID for leaf op with ECX cleared.
+func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+
+// asmCpuidex executes CPUID for leaf op with ECX set to op2.
+func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+
+// asmXgetbv executes XGETBV with ECX set to index.
+func asmXgetbv(index uint32) (eax, edx uint32)
+
+// asmRdtscpAsm executes RDTSCP and returns the register contents.
+func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+
+// initCPU points the package-level CPU access hooks at the assembler
+// implementations.
+func initCPU() {
+	cpuid, cpuidex = asmCpuid, asmCpuidex
+	xgetbv, rdtscpAsm = asmXgetbv, asmRdtscpAsm
+}
--- a/vendor/github.com/klauspost/cpuid/detect_ref.go
+++ b/vendor/github.com/klauspost/cpuid/detect_ref.go
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build !amd64,!386 gccgo
+
+package cpuid
+
+// initCPU installs fallback hooks for builds without the assembler
+// implementations (see the build constraint of this file). Every hook
+// returns zero for all registers.
+func initCPU() {
+	// The named results are left at their zero values on purpose.
+	cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { return }
+	cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { return }
+	xgetbv = func(index uint32) (eax, edx uint32) { return }
+	rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { return }
+}
--- a/vendor/github.com/klauspost/cpuid/generate.go
+++ b/vendor/github.com/klauspost/cpuid/generate.go
+package cpuid
+
+//go:generate go run private-gen.go
+//go:generate gofmt -w ./private
--- a/vendor/github.com/klauspost/cpuid/private-gen.go
+++ b/vendor/github.com/klauspost/cpuid/private-gen.go
+// +build ignore
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/printer"
+	"go/token"
+	"io"
+	"io/ioutil"
+	"log"
+	"os"
+	"reflect"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// inFiles are the Go sources that main parses, rewrites and writes into
+// the output package.
+var inFiles = []string{"cpuid.go", "cpuid_test.go"}
+
+// copyFiles are copied into the output package unchanged; names that do
+// not already start with "cpuid" get a "cpuid_" prefix.
+var copyFiles = []string{"cpuid_amd64.s", "cpuid_386.s", "detect_ref.go", "detect_intel.go"}
+
+// fileSet is shared by the parser, the printer and the comment map.
+var fileSet = token.NewFileSet()
+
+// reWrites are applied to every input file before the per-identifier
+// rewrites collected in main.
+var reWrites = []rewrite{
+	initRewrite("CPUInfo -> cpuInfo"),
+	initRewrite("Vendor -> vendor"),
+	initRewrite("Flags -> flags"),
+	initRewrite("Detect -> detect"),
+	initRewrite("CPU -> cpu"),
+}
+
+// excludeNames lists lower-cased identifier names that must not be
+// rewritten.
+var excludeNames = map[string]bool{"string": true, "join": true, "trim": true,
+	// cpuid_test.go
+	"t": true, "println": true, "logf": true, "log": true, "fatalf": true, "fatal": true,
+}
+
+// excludePrefixes lists lower-cased identifier prefixes that must not be
+// rewritten.
+var excludePrefixes = []string{"test", "benchmark"}
+
+// main generates the "private" copy of the package.
+//
+// For every file in inFiles it parses the source, applies the fixed
+// reWrites, lower-cases every exported identifier that is not excluded by
+// excludeNames or excludePrefixes, strips everything before the package
+// clause and writes the result with a "generated" header into the output
+// directory. Afterwards the files in copyFiles are copied there verbatim.
+// Any error terminates the program through log.Fatalf.
+func main() {
+	Package := "private"
+	parserMode := parser.ParseComments
+	// exported accumulates rewrites across all input files on purpose, so
+	// identifiers seen in an earlier file are also renamed in later ones.
+	exported := make(map[string]rewrite)
+	for _, file := range inFiles {
+		// ReadFile opens, reads and closes the input in a single step.
+		src, err := ioutil.ReadFile(file)
+		if err != nil {
+			log.Fatalf("reading input: %s", err)
+		}
+
+		astfile, err := parser.ParseFile(fileSet, file, src, parserMode)
+		if err != nil {
+			log.Fatalf("parsing input: %s", err)
+		}
+
+		for _, rw := range reWrites {
+			astfile = rw(astfile)
+		}
+
+		// Collect a lower-casing rewrite for every exported identifier.
+		ast.Inspect(astfile, func(n ast.Node) bool {
+			x, ok := n.(*ast.Ident)
+			if !ok || !x.IsExported() {
+				return true
+			}
+			t := strings.ToLower(x.Name)
+			for _, pre := range excludePrefixes {
+				if strings.HasPrefix(t, pre) {
+					return true
+				}
+			}
+			if !excludeNames[t] {
+				exported[x.Name] = initRewrite(x.Name + " -> " + t)
+			}
+			return true
+		})
+
+		for _, rw := range exported {
+			astfile = rw(astfile)
+		}
+
+		var buf bytes.Buffer
+		if err := printer.Fprint(&buf, fileSet, astfile); err != nil {
+			log.Fatalf("printing output: %s", err)
+		}
+
+		// Remove package documentation and insert information
+		s := buf.String()
+		ind := strings.Index(s, "\npackage cpuid")
+		if ind < 0 {
+			// Slicing with -1 would panic; report the real problem instead.
+			log.Fatalf("rewriting %s: package clause not found", file)
+		}
+		s = s[ind:]
+		s = "// Generated, DO NOT EDIT,\n" +
+			"// but copy it to your own project and rename the package.\n" +
+			"// See more at http://github.com/klauspost/cpuid\n" +
+			s
+
+		outputName := Package + string(os.PathSeparator) + file
+
+		err = ioutil.WriteFile(outputName, []byte(s), 0644)
+		if err != nil {
+			log.Fatalf("writing output: %s", err)
+		}
+		log.Println("Generated", outputName)
+	}
+
+	for _, file := range copyFiles {
+		dst := ""
+		if strings.HasPrefix(file, "cpuid") {
+			dst = Package + string(os.PathSeparator) + file
+		} else {
+			dst = Package + string(os.PathSeparator) + "cpuid_" + file
+		}
+		err := copyFile(file, dst)
+		if err != nil {
+			log.Fatalf("copying file: %s", err)
+		}
+		log.Println("Copied", dst)
+	}
+}
+
+// copyFile copies the file src to dst. Both must be regular files; when
+// dst already exists and is the same file as src, nothing is copied and
+// nil is returned. Otherwise the contents of src replace those of dst.
+func copyFile(src, dst string) (err error) {
+	srcInfo, err := os.Stat(src)
+	if err != nil {
+		return err
+	}
+	if !srcInfo.Mode().IsRegular() {
+		// cannot copy non-regular files (e.g., directories,
+		// symlinks, devices, etc.)
+		return fmt.Errorf("CopyFile: non-regular source file %s (%q)", srcInfo.Name(), srcInfo.Mode().String())
+	}
+
+	dstInfo, err := os.Stat(dst)
+	switch {
+	case err == nil:
+		if !dstInfo.Mode().IsRegular() {
+			return fmt.Errorf("CopyFile: non-regular destination file %s (%q)", dstInfo.Name(), dstInfo.Mode().String())
+		}
+		if os.SameFile(srcInfo, dstInfo) {
+			return nil
+		}
+	case !os.IsNotExist(err):
+		return err
+	}
+	return copyFileContents(src, dst)
+}
+
+// copyFileContents copies the contents of the file named src to the file
+// named dst. The destination is created if it does not exist yet; if it
+// does exist, all its contents are replaced by the contents of src.
+func copyFileContents(src, dst string) (err error) {
+	source, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer source.Close()
+
+	target, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	// Report a failing Close unless an earlier error is already pending.
+	defer func() {
+		if closeErr := target.Close(); err == nil {
+			err = closeErr
+		}
+	}()
+
+	if _, err = io.Copy(target, source); err != nil {
+		return err
+	}
+	return target.Sync()
+}
+
+// rewrite transforms a parsed file and returns the result.
+type rewrite func(*ast.File) *ast.File
+
+// initRewrite turns a rule of the form "pattern -> replacement" into a
+// rewrite function. A malformed rule is reported on stderr and terminates
+// the program with exit code 2.
+//
+// Mostly copied from gofmt
+func initRewrite(rewriteRule string) rewrite {
+	parts := strings.Split(rewriteRule, "->")
+	if len(parts) != 2 {
+		fmt.Fprintf(os.Stderr, "rewrite rule must be of the form 'pattern -> replacement'\n")
+		os.Exit(2)
+	}
+	pattern, replace := parseExpr(parts[0], "pattern"), parseExpr(parts[1], "replacement")
+	return func(p *ast.File) *ast.File {
+		return rewriteFile(pattern, replace, p)
+	}
+}
+
+// parseExpr parses s as an expression; what names the role of s in the
+// error message. A parse error is reported on stderr and terminates the
+// program with exit code 2.
+// It might make sense to expand this to allow statement patterns,
+// but there are problems with preserving formatting and also
+// with what a wildcard for a statement looks like.
+func parseExpr(s, what string) ast.Expr {
+	expr, err := parser.ParseExpr(s)
+	if err == nil {
+		return expr
+	}
+	fmt.Fprintf(os.Stderr, "parsing %s %s at %s\n", what, s, err)
+	os.Exit(2)
+	return expr
+}
+
+// Keep this function for debugging.
+/*
+func dump(msg string, val reflect.Value) {
+	fmt.Printf("%s:\n", msg)
+	ast.Print(fileSet, val.Interface())
+	fmt.Println()
+}
+*/
+
+// rewriteFile applies the rewrite rule 'pattern -> replace' to an entire file.
+// The tree is rewritten bottom-up: children are processed before the node
+// that contains them is matched against the pattern. Comments are carried
+// over through a comment map built before the rewrite.
+func rewriteFile(pattern, replace ast.Expr, p *ast.File) *ast.File {
+	cmap := ast.NewCommentMap(fileSet, p, p.Comments)
+	// m holds the wildcard bindings of the match currently being tried.
+	m := make(map[string]reflect.Value)
+	pat := reflect.ValueOf(pattern)
+	repl := reflect.ValueOf(replace)
+
+	var rewriteVal func(val reflect.Value) reflect.Value
+	rewriteVal = func(val reflect.Value) reflect.Value {
+		// don't bother if val is invalid to start with
+		if !val.IsValid() {
+			return reflect.Value{}
+		}
+		// reset the bindings left over from the previous attempt
+		for k := range m {
+			delete(m, k)
+		}
+		val = apply(rewriteVal, val)
+		if match(m, pat, val) {
+			// the replacement takes over the position of the matched node
+			val = subst(m, repl, reflect.ValueOf(val.Interface().(ast.Node).Pos()))
+		}
+		return val
+	}
+
+	r := apply(rewriteVal, reflect.ValueOf(p)).Interface().(*ast.File)
+	r.Comments = cmap.Filter(r).Comments() // recreate comments list
+	return r
+}
+
+// set assigns y to x like x.Set(y) but shields the caller from panics
+// caused by x not being assignable from y. It does nothing when x cannot
+// be set or y is invalid. Any other panic is passed on unchanged.
+func set(x, y reflect.Value) {
+	if !x.CanSet() || !y.IsValid() {
+		return
+	}
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+		msg, isString := r.(string)
+		if isString && (strings.Contains(msg, "type mismatch") || strings.Contains(msg, "not assignable")) {
+			// x cannot be set to y - ignore this rewrite
+			return
+		}
+		panic(r)
+	}()
+	x.Set(y)
+}
+
+// Values/types for special cases.
+var (
+	// Typed nil values that apply substitutes for object and scope pointers.
+	objectPtrNil = reflect.ValueOf((*ast.Object)(nil))
+	scopePtrNil  = reflect.ValueOf((*ast.Scope)(nil))
+
+	// Types that apply, match and subst compare against.
+	identType     = reflect.TypeOf((*ast.Ident)(nil))
+	objectPtrType = reflect.TypeOf((*ast.Object)(nil))
+	positionType  = reflect.TypeOf(token.NoPos)
+	callExprType  = reflect.TypeOf((*ast.CallExpr)(nil))
+	scopePtrType  = reflect.TypeOf((*ast.Scope)(nil))
+)
+
+// apply replaces each AST field x in val with f(x), returning val.
+// To avoid extra conversions, f operates on the reflect.Value form.
+// Object and scope pointers are not descended into; they are replaced by
+// typed nil values.
+func apply(f func(reflect.Value) reflect.Value, val reflect.Value) reflect.Value {
+	if !val.IsValid() {
+		return reflect.Value{}
+	}
+
+	switch val.Type() {
+	case objectPtrType:
+		// *ast.Objects introduce cycles and are likely incorrect after
+		// rewrite; don't follow them but replace with nil instead
+		return objectPtrNil
+	case scopePtrType:
+		// similarly for scopes: they are likely incorrect after a rewrite;
+		// replace them with nil
+		return scopePtrNil
+	}
+
+	target := reflect.Indirect(val)
+	switch target.Kind() {
+	case reflect.Slice:
+		for i := 0; i < target.Len(); i++ {
+			elem := target.Index(i)
+			set(elem, f(elem))
+		}
+	case reflect.Struct:
+		for i := 0; i < target.NumField(); i++ {
+			field := target.Field(i)
+			set(field, f(field))
+		}
+	case reflect.Interface:
+		inner := target.Elem()
+		set(target, f(inner))
+	}
+	return val
+}
+
+// isWildcard reports whether s consists of exactly one rune and that rune
+// is a lower-case letter.
+func isWildcard(s string) bool {
+	r, width := utf8.DecodeRuneInString(s)
+	if width != len(s) {
+		return false
+	}
+	return unicode.IsLower(r)
+}
+
+// match returns true if pattern matches val,
+// recording wildcard submatches in m.
+// If m == nil, match checks whether pattern == val.
+//
+// The comparison is structural: after the special cases (wildcards,
+// identifiers, object pointers, positions and call expressions) both
+// values are dereferenced and compared element by element via reflection.
+func match(m map[string]reflect.Value, pattern, val reflect.Value) bool {
+	// Wildcard matches any expression.  If it appears multiple
+	// times in the pattern, it must match the same expression
+	// each time.
+	if m != nil && pattern.IsValid() && pattern.Type() == identType {
+		name := pattern.Interface().(*ast.Ident).Name
+		if isWildcard(name) && val.IsValid() {
+			// wildcards only match valid (non-nil) expressions.
+			if _, ok := val.Interface().(ast.Expr); ok && !val.IsNil() {
+				if old, ok := m[name]; ok {
+					// already bound: the new occurrence must equal the old one
+					return match(nil, old, val)
+				}
+				m[name] = val
+				return true
+			}
+		}
+	}
+
+	// Otherwise, pattern and val must match recursively.
+	if !pattern.IsValid() || !val.IsValid() {
+		return !pattern.IsValid() && !val.IsValid()
+	}
+	if pattern.Type() != val.Type() {
+		return false
+	}
+
+	// Special cases.
+	switch pattern.Type() {
+	case identType:
+		// For identifiers, only the names need to match
+		// (and none of the other *ast.Object information).
+		// This is a common case, handle it all here instead
+		// of recursing down any further via reflection.
+		p := pattern.Interface().(*ast.Ident)
+		v := val.Interface().(*ast.Ident)
+		return p == nil && v == nil || p != nil && v != nil && p.Name == v.Name
+	case objectPtrType, positionType:
+		// object pointers and token positions always match
+		return true
+	case callExprType:
+		// For calls, the Ellipsis fields (token.Position) must
+		// match since that is how f(x) and f(x...) are different.
+		// Check them here but fall through for the remaining fields.
+		p := pattern.Interface().(*ast.CallExpr)
+		v := val.Interface().(*ast.CallExpr)
+		if p.Ellipsis.IsValid() != v.Ellipsis.IsValid() {
+			return false
+		}
+	}
+
+	p := reflect.Indirect(pattern)
+	v := reflect.Indirect(val)
+	if !p.IsValid() || !v.IsValid() {
+		return !p.IsValid() && !v.IsValid()
+	}
+
+	switch p.Kind() {
+	case reflect.Slice:
+		if p.Len() != v.Len() {
+			return false
+		}
+		for i := 0; i < p.Len(); i++ {
+			if !match(m, p.Index(i), v.Index(i)) {
+				return false
+			}
+		}
+		return true
+
+	case reflect.Struct:
+		for i := 0; i < p.NumField(); i++ {
+			if !match(m, p.Field(i), v.Field(i)) {
+				return false
+			}
+		}
+		return true
+
+	case reflect.Interface:
+		return match(m, p.Elem(), v.Elem())
+	}
+
+	// Handle token integers, etc.
+	return p.Interface() == v.Interface()
+}
+
+// subst returns a copy of pattern with values from m substituted in place
+// of wildcards and pos used as the position of tokens from the pattern.
+// If m == nil, subst returns a copy of pattern and doesn't change the line
+// number information.
+//
+// An invalid pattern yields the zero reflect.Value. Slices, structs,
+// pointers and interfaces are copied recursively; values of any other kind
+// are returned as is.
+func subst(m map[string]reflect.Value, pattern reflect.Value, pos reflect.Value) reflect.Value {
+	if !pattern.IsValid() {
+		return reflect.Value{}
+	}
+
+	// Wildcard gets replaced with map value.
+	if m != nil && pattern.Type() == identType {
+		name := pattern.Interface().(*ast.Ident).Name
+		if isWildcard(name) {
+			if old, ok := m[name]; ok {
+				// Copy the bound value with m == nil and an invalid pos so
+				// that nothing inside it is substituted or repositioned.
+				return subst(nil, old, reflect.Value{})
+			}
+		}
+	}
+
+	if pos.IsValid() && pattern.Type() == positionType {
+		// use new position only if old position was valid in the first place
+		if old := pattern.Interface().(token.Pos); !old.IsValid() {
+			return pattern
+		}
+		return pos
+	}
+
+	// Otherwise copy.
+	switch p := pattern; p.Kind() {
+	case reflect.Slice:
+		// New slice of the same length, each element substituted in turn.
+		v := reflect.MakeSlice(p.Type(), p.Len(), p.Len())
+		for i := 0; i < p.Len(); i++ {
+			v.Index(i).Set(subst(m, p.Index(i), pos))
+		}
+		return v
+
+	case reflect.Struct:
+		// Fresh zero struct, filled field by field.
+		v := reflect.New(p.Type()).Elem()
+		for i := 0; i < p.NumField(); i++ {
+			v.Field(i).Set(subst(m, p.Field(i), pos))
+		}
+		return v
+
+	case reflect.Ptr:
+		// A nil pointer stays nil; otherwise point at a copy of the pointee.
+		v := reflect.New(p.Type()).Elem()
+		if elem := p.Elem(); elem.IsValid() {
+			v.Set(subst(m, elem, pos).Addr())
+		}
+		return v
+
+	case reflect.Interface:
+		// A nil interface stays nil; otherwise wrap a copy of the dynamic value.
+		v := reflect.New(p.Type()).Elem()
+		if elem := p.Elem(); elem.IsValid() {
+			v.Set(subst(m, elem, pos))
+		}
+		return v
+	}
+
+	return pattern
+}
--- a/vendor/github.com/klauspost/cpuid/private/cpuid.go
+++ b/vendor/github.com/klauspost/cpuid/private/cpuid.go
+// Generated, DO NOT EDIT,
+// but copy it to your own project and rename the package.
+// See more at http://github.com/klauspost/cpuid
+
+package cpuid
+
+import "strings"
+
+// vendor is a representation of a CPU vendor.
+type vendor int
+
+// Known vendor values. other is the zero value and is what vendorID
+// returns for a vendor string that is not in vendorMapping.
+const (
+	other vendor = iota
+	intel
+	amd
+	via
+	transmeta
+	nsc
+	kvm  // Kernel-based Virtual Machine
+	msvm // Microsoft Hyper-V or Windows Virtual PC
+	vmware
+	xenhvm
+)
+
+// CPU feature bits. Each constant is a distinct single-bit mask
+// (1 << iota); masks are OR-ed together into a flags value.
+const (
+	cmov        = 1 << iota // i686 CMOV
+	nx                      // NX (No-Execute) bit
+	amd3dnow                // AMD 3DNOW
+	amd3dnowext             // AMD 3DNowExt
+	mmx                     // standard MMX
+	mmxext                  // SSE integer functions or AMD MMX ext
+	sse                     // SSE functions
+	sse2                    // P4 SSE2 functions
+	sse3                    // Prescott SSE3 functions
+	ssse3                   // Conroe SSSE3 functions
+	sse4                    // Penryn SSE4.1 functions
+	sse4a                   // AMD Barcelona microarchitecture SSE4a instructions
+	sse42                   // Nehalem SSE4.2 functions
+	avx                     // AVX functions
+	avx2                    // AVX2 functions
+	fma3                    // Intel FMA 3
+	fma4                    // Bulldozer FMA4 functions
+	xop                     // Bulldozer XOP functions
+	f16c                    // Half-precision floating-point conversion
+	bmi1                    // Bit Manipulation Instruction Set 1
+	bmi2                    // Bit Manipulation Instruction Set 2
+	tbm                     // AMD Trailing Bit Manipulation
+	lzcnt                   // LZCNT instruction
+	popcnt                  // POPCNT instruction
+	aesni                   // Advanced Encryption Standard New Instructions
+	clmul                   // Carry-less Multiplication
+	htt                     // Hyperthreading (enabled)
+	hle                     // Hardware Lock Elision
+	rtm                     // Restricted Transactional Memory
+	rdrand                  // RDRAND instruction is available
+	rdseed                  // RDSEED instruction is available
+	adx                     // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+	sha                     // Intel SHA Extensions
+	avx512f                 // AVX-512 Foundation
+	avx512dq                // AVX-512 Doubleword and Quadword Instructions
+	avx512ifma              // AVX-512 Integer Fused Multiply-Add Instructions
+	avx512pf                // AVX-512 Prefetch Instructions
+	avx512er                // AVX-512 Exponential and Reciprocal Instructions
+	avx512cd                // AVX-512 Conflict Detection Instructions
+	avx512bw                // AVX-512 Byte and Word Instructions
+	avx512vl                // AVX-512 Vector Length Extensions
+	avx512vbmi              // AVX-512 Vector Bit Manipulation Instructions
+	mpx                     // Intel MPX (Memory Protection Extensions)
+	erms                    // Enhanced REP MOVSB/STOSB
+	rdtscp                  // RDTSCP Instruction
+	cx16                    // CMPXCHG16B Instruction
+	sgx                     // Software Guard Extensions
+
+	// Performance indicators
+	sse2slow // SSE2 is supported, but usually not faster
+	sse3slow // SSE3 is supported, but usually not faster
+	atom     // Atom processor, some SSSE3 instructions are slower
+)
+
+// flagNames maps each single-bit feature flag to the name that
+// flags.strings emits for it.
+var flagNames = map[flags]string{
+	cmov:        "CMOV",        // i686 CMOV
+	nx:          "NX",          // NX (No-Execute) bit
+	amd3dnow:    "AMD3DNOW",    // AMD 3DNOW
+	amd3dnowext: "AMD3DNOWEXT", // AMD 3DNowExt
+	mmx:         "MMX",         // Standard MMX
+	mmxext:      "MMXEXT",      // SSE integer functions or AMD MMX ext
+	sse:         "SSE",         // SSE functions
+	sse2:        "SSE2",        // P4 SSE2 functions
+	sse3:        "SSE3",        // Prescott SSE3 functions
+	ssse3:       "SSSE3",       // Conroe SSSE3 functions
+	sse4:        "SSE4.1",      // Penryn SSE4.1 functions
+	sse4a:       "SSE4A",       // AMD Barcelona microarchitecture SSE4a instructions
+	sse42:       "SSE4.2",      // Nehalem SSE4.2 functions
+	avx:         "AVX",         // AVX functions
+	avx2:        "AVX2",        // AVX2 functions
+	fma3:        "FMA3",        // Intel FMA 3
+	fma4:        "FMA4",        // Bulldozer FMA4 functions
+	xop:         "XOP",         // Bulldozer XOP functions
+	f16c:        "F16C",        // Half-precision floating-point conversion
+	bmi1:        "BMI1",        // Bit Manipulation Instruction Set 1
+	bmi2:        "BMI2",        // Bit Manipulation Instruction Set 2
+	tbm:         "TBM",         // AMD Trailing Bit Manipulation
+	lzcnt:       "LZCNT",       // LZCNT instruction
+	popcnt:      "POPCNT",      // POPCNT instruction
+	aesni:       "AESNI",       // Advanced Encryption Standard New Instructions
+	clmul:       "CLMUL",       // Carry-less Multiplication
+	htt:         "HTT",         // Hyperthreading (enabled)
+	hle:         "HLE",         // Hardware Lock Elision
+	rtm:         "RTM",         // Restricted Transactional Memory
+	rdrand:      "RDRAND",      // RDRAND instruction is available
+	rdseed:      "RDSEED",      // RDSEED instruction is available
+	adx:         "ADX",         // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
+	sha:         "SHA",         // Intel SHA Extensions
+	avx512f:     "AVX512F",     // AVX-512 Foundation
+	avx512dq:    "AVX512DQ",    // AVX-512 Doubleword and Quadword Instructions
+	avx512ifma:  "AVX512IFMA",  // AVX-512 Integer Fused Multiply-Add Instructions
+	avx512pf:    "AVX512PF",    // AVX-512 Prefetch Instructions
+	avx512er:    "AVX512ER",    // AVX-512 Exponential and Reciprocal Instructions
+	avx512cd:    "AVX512CD",    // AVX-512 Conflict Detection Instructions
+	avx512bw:    "AVX512BW",    // AVX-512 Byte and Word Instructions
+	avx512vl:    "AVX512VL",    // AVX-512 Vector Length Extensions
+	avx512vbmi:  "AVX512VBMI",  // AVX-512 Vector Bit Manipulation Instructions
+	mpx:         "MPX",         // Intel MPX (Memory Protection Extensions)
+	erms:        "ERMS",        // Enhanced REP MOVSB/STOSB
+	rdtscp:      "RDTSCP",      // RDTSCP Instruction
+	cx16:        "CX16",        // CMPXCHG16B Instruction
+	sgx:         "SGX",         // Software Guard Extensions
+
+	// Performance indicators
+	sse2slow: "SSE2SLOW", // SSE2 supported, but usually not faster
+	sse3slow: "SSE3SLOW", // SSE3 supported, but usually not faster
+	atom:     "ATOM",     // Atom processor, some SSSE3 instructions are slower
+
+}
+
+// cpuInfo contains information about the detected system CPU.
+// All fields are filled in by detect.
+type cpuInfo struct {
+	brandname      string // Brand name reported by the CPU
+	vendorid       vendor // Comparable CPU vendor ID
+	features       flags  // Features of the CPU
+	physicalcores  int    // Number of physical processor cores in your CPU. Will be 0 if undetectable.
+	threadspercore int    // Number of threads per physical core. Will be 1 if undetectable.
+	logicalcores   int    // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable.
+	family         int    // CPU family number
+	model          int    // CPU model number
+	cacheline      int    // Cache line size in bytes. Will be 0 if undetectable.
+	cache          struct {
+		l1i int // L1 Instruction Cache (per core or shared). Will be -1 if undetected
+		l1d int // L1 Data Cache (per core or shared). Will be -1 if undetected
+		l2  int // L2 Cache (per core or shared). Will be -1 if undetected
+		l3  int // L3 Cache (per core or shared). Will be -1 if undetected
+	}
+	sgx       sgxsupport // SGX details as returned by hasSGX
+	maxFunc   uint32     // Result of maxFunctionID at detection time
+	maxExFunc uint32     // Result of maxExtendedFunction at detection time
+}
+
+// Low-level CPU query hooks. They are nil until initCPU assigns them,
+// which init does before calling detect.
+var cpuid func(op uint32) (eax, ebx, ecx, edx uint32)
+var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+var xgetbv func(index uint32) (eax, edx uint32)
+var rdtscpAsm func() (eax, ebx, ecx, edx uint32)
+
+// cpu contains information about the CPU as detected on startup,
+// or when detect was last called.
+//
+// Use this as the primary entry point to your data.
+var cpu cpuInfo
+
+// init installs the CPU query hooks and then fills in cpu.
+// initCPU must run first because detect calls the hooks it assigns.
+func init() {
+	initCPU()
+	detect()
+}
+
+// detect will re-detect current CPU info.
+// This will replace the content of the package-level cpu variable.
+//
+// Unless you expect the CPU to change while you are running your program
+// you should not need to call this function.
+// If you call this, you must ensure that no other goroutine is accessing the
+// cpu variable.
+func detect() {
+	cpu.maxFunc = maxFunctionID()
+	cpu.maxExFunc = maxExtendedFunction()
+	cpu.brandname = brandName()
+	cpu.cacheline = cacheLine()
+	cpu.family, cpu.model = familyModel()
+	cpu.features = support()
+	// Must follow the features assignment: SGX details are only queried
+	// when the sgx feature bit was detected.
+	cpu.sgx = hasSGX(cpu.features&sgx != 0)
+	cpu.threadspercore = threadsPerCore()
+	cpu.logicalcores = logicalCores()
+	cpu.physicalcores = physicalCores()
+	cpu.vendorid = vendorID()
+	cpu.cacheSize()
+}
+
+// Generated here: http://play.golang.org/p/BxFH2Gdc0G
+
+// cmov reports whether the CMOV feature bit is set.
+func (c cpuInfo) cmov() bool {
+	return c.features&cmov == cmov
+}
+
+// amd3dnow reports whether the AMD 3DNOW! feature bit is set.
+func (c cpuInfo) amd3dnow() bool {
+	return c.features&amd3dnow == amd3dnow
+}
+
+// amd3dnowext reports whether the AMD 3DNOW! Extended feature bit is set.
+func (c cpuInfo) amd3dnowext() bool {
+	return c.features&amd3dnowext == amd3dnowext
+}
+
+// mmx reports whether the MMX feature bit is set.
+func (c cpuInfo) mmx() bool {
+	return c.features&mmx == mmx
+}
+
+// mmxext reports whether the MMXEXT feature bit is set
+// (SSE integer functions or AMD MMX ext).
+func (c cpuInfo) mmxext() bool {
+	return c.features&mmxext == mmxext
+}
+
+// sse reports whether the SSE feature bit is set.
+func (c cpuInfo) sse() bool {
+	return c.features&sse == sse
+}
+
+// sse2 reports whether the SSE2 feature bit is set.
+func (c cpuInfo) sse2() bool {
+	return c.features&sse2 == sse2
+}
+
+// sse3 reports whether the SSE3 feature bit is set.
+func (c cpuInfo) sse3() bool {
+	return c.features&sse3 == sse3
+}
+
+// ssse3 reports whether the SSSE3 feature bit is set.
+func (c cpuInfo) ssse3() bool {
+	return c.features&ssse3 == ssse3
+}
+
+// sse4 reports whether the SSE4 (also called SSE4.1) feature bit is set.
+func (c cpuInfo) sse4() bool {
+	return c.features&sse4 == sse4
+}
+
+// sse42 reports whether the SSE4.2 feature bit is set.
+func (c cpuInfo) sse42() bool {
+	return c.features&sse42 == sse42
+}
+
+// avx reports whether the AVX feature bit is set. support only sets the
+// bit when its operating-system check for AVX also passes.
+func (c cpuInfo) avx() bool {
+	return c.features&avx == avx
+}
+
+// avx2 reports whether the AVX2 feature bit is set.
+func (c cpuInfo) avx2() bool {
+	return c.features&avx2 == avx2
+}
+
+// fma3 reports whether the FMA3 feature bit is set.
+func (c cpuInfo) fma3() bool {
+	return c.features&fma3 == fma3
+}
+
+// fma4 reports whether the FMA4 feature bit is set.
+func (c cpuInfo) fma4() bool {
+	return c.features&fma4 == fma4
+}
+
+// xop reports whether the XOP feature bit is set.
+func (c cpuInfo) xop() bool {
+	return c.features&xop == xop
+}
+
+// f16c reports whether the F16C feature bit is set.
+func (c cpuInfo) f16c() bool {
+	return c.features&f16c == f16c
+}
+
+// bmi1 reports whether the BMI1 feature bit is set.
+func (c cpuInfo) bmi1() bool {
+	return c.features&bmi1 == bmi1
+}
+
+// bmi2 reports whether the BMI2 feature bit is set.
+func (c cpuInfo) bmi2() bool {
+	return c.features&bmi2 == bmi2
+}
+
+// tbm reports whether the TBM feature bit is set
+// (AMD Trailing Bit Manipulation).
+func (c cpuInfo) tbm() bool {
+	return c.features&tbm == tbm
+}
+
+// lzcnt reports whether the LZCNT feature bit is set.
+func (c cpuInfo) lzcnt() bool {
+	return c.features&lzcnt == lzcnt
+}
+
+// popcnt reports whether the POPCNT feature bit is set.
+func (c cpuInfo) popcnt() bool {
+	return c.features&popcnt == popcnt
+}
+
+// htt reports whether the Hyperthreading (enabled) feature bit is set.
+func (c cpuInfo) htt() bool {
+	return c.features&htt == htt
+}
+
+// sse2slow reports whether SSE2 has been flagged as potentially slow on
+// this processor.
+func (c cpuInfo) sse2slow() bool {
+	return c.features&sse2slow == sse2slow
+}
+
+// sse3slow reports whether SSE3 has been flagged as potentially slow on
+// this processor.
+func (c cpuInfo) sse3slow() bool {
+	return c.features&sse3slow == sse3slow
+}
+
+// aesni reports whether the AES-NI feature bit is set
+// (Advanced Encryption Standard New Instructions).
+func (c cpuInfo) aesni() bool {
+	return c.features&aesni == aesni
+}
+
+// clmul reports whether the CLMUL feature bit is set
+// (Carry-less Multiplication).
+func (c cpuInfo) clmul() bool {
+	return c.features&clmul == clmul
+}
+
+// nx reports whether the NX (No-Execute) feature bit is set.
+func (c cpuInfo) nx() bool {
+	return c.features&nx == nx
+}
+
+// sse4a reports whether the SSE4a feature bit is set
+// (AMD Barcelona microarchitecture instructions).
+func (c cpuInfo) sse4a() bool {
+	return c.features&sse4a == sse4a
+}
+
+// hle reports whether the Hardware Lock Elision feature bit is set.
+func (c cpuInfo) hle() bool {
+	return c.features&hle == hle
+}
+
+// rtm reports whether the Restricted Transactional Memory feature bit is set.
+func (c cpuInfo) rtm() bool {
+	return c.features&rtm == rtm
+}
+
+// rdrand reports whether the RDRAND feature bit is set.
+func (c cpuInfo) rdrand() bool {
+	return c.features&rdrand == rdrand
+}
+
+// rdseed reports whether the RDSEED feature bit is set.
+func (c cpuInfo) rdseed() bool {
+	return c.features&rdseed == rdseed
+}
+
+// adx reports whether the Intel ADX feature bit is set
+// (Multi-Precision Add-Carry Instruction Extensions).
+func (c cpuInfo) adx() bool {
+	return c.features&adx == adx
+}
+
+// sha reports whether the Intel SHA Extensions feature bit is set.
+func (c cpuInfo) sha() bool {
+	return c.features&sha == sha
+}
+
+// avx512f reports whether the AVX-512 Foundation feature bit is set.
+func (c cpuInfo) avx512f() bool {
+	return c.features&avx512f == avx512f
+}
+
+// avx512dq reports whether the AVX-512 Doubleword and Quadword
+// Instructions feature bit is set.
+func (c cpuInfo) avx512dq() bool {
+	return c.features&avx512dq == avx512dq
+}
+
+// avx512ifma reports whether the AVX-512 Integer Fused Multiply-Add
+// Instructions feature bit is set.
+func (c cpuInfo) avx512ifma() bool {
+	return c.features&avx512ifma == avx512ifma
+}
+
+// avx512pf reports whether the AVX-512 Prefetch Instructions feature bit
+// is set.
+func (c cpuInfo) avx512pf() bool {
+	return c.features&avx512pf == avx512pf
+}
+
+// avx512er reports whether the AVX-512 Exponential and Reciprocal
+// Instructions feature bit is set.
+func (c cpuInfo) avx512er() bool {
+	return c.features&avx512er == avx512er
+}
+
+// avx512cd reports whether the AVX-512 Conflict Detection Instructions
+// feature bit is set.
+func (c cpuInfo) avx512cd() bool {
+	return c.features&avx512cd == avx512cd
+}
+
+// avx512bw reports whether the AVX-512 Byte and Word Instructions feature
+// bit is set.
+func (c cpuInfo) avx512bw() bool {
+	return c.features&avx512bw == avx512bw
+}
+
+// avx512vl reports whether the AVX-512 Vector Length Extensions feature
+// bit is set.
+func (c cpuInfo) avx512vl() bool {
+	return c.features&avx512vl == avx512vl
+}
+
+// avx512vbmi reports whether the AVX-512 Vector Bit Manipulation
+// Instructions feature bit is set.
+func (c cpuInfo) avx512vbmi() bool {
+	return c.features&avx512vbmi == avx512vbmi
+}
+
+// mpx reports whether the Intel MPX (Memory Protection Extensions)
+// feature bit is set.
+func (c cpuInfo) mpx() bool {
+	return c.features&mpx == mpx
+}
+
+// erms reports whether the Enhanced REP MOVSB/STOSB feature bit is set.
+func (c cpuInfo) erms() bool {
+	return c.features&erms == erms
+}
+
+// rdtscp reports whether the RDTSCP feature bit is set.
+func (c cpuInfo) rdtscp() bool {
+	return c.features&rdtscp == rdtscp
+}
+
+// cx16 reports whether the CMPXCHG16B feature bit is set.
+func (c cpuInfo) cx16() bool {
+	return c.features&cx16 == cx16
+}
+
+// tsx reports whether the CPU supports TSX. TSX is split into HLE
+// (Hardware Lock Elision) and RTM (Restricted Transactional Memory)
+// detection, so it is reported only when both of those feature bits are
+// set. (The mpx bit is unrelated to TSX and must not be tested here.)
+func (c cpuInfo) tsx() bool {
+	return c.features&(hle|rtm) == hle|rtm
+}
+
+// atom reports whether the Atom processor flag is set.
+func (c cpuInfo) atom() bool {
+	return c.features&atom == atom
+}
+
+// intel reports whether the detected vendor is Intel.
+func (c cpuInfo) intel() bool {
+	switch c.vendorid {
+	case intel:
+		return true
+	}
+	return false
+}
+
+// amd reports whether the detected vendor is AMD.
+func (c cpuInfo) amd() bool {
+	switch c.vendorid {
+	case amd:
+		return true
+	}
+	return false
+}
+
+// transmeta reports whether the detected vendor is Transmeta.
+func (c cpuInfo) transmeta() bool {
+	switch c.vendorid {
+	case transmeta:
+		return true
+	}
+	return false
+}
+
+// nsc reports whether the detected vendor is National Semiconductor.
+func (c cpuInfo) nsc() bool {
+	switch c.vendorid {
+	case nsc:
+		return true
+	}
+	return false
+}
+
+// via reports whether the detected vendor is VIA.
+func (c cpuInfo) via() bool {
+	switch c.vendorid {
+	case via:
+		return true
+	}
+	return false
+}
+
+// rtcounter reads the 64-bit time-stamp counter through the RDTSCP hook,
+// combining the edx result (high 32 bits) with the eax result (low 32
+// bits). It returns 0 when the rdtscp feature bit is not set.
+func (c cpuInfo) rtcounter() uint64 {
+	if c.rdtscp() {
+		lo, _, _, hi := rdtscpAsm()
+		return uint64(hi)<<32 | uint64(lo)
+	}
+	return 0
+}
+
+// ia32tscaux returns the IA32_TSC_AUX part of the RDTSCP result (the ecx
+// value). The content is OS dependent; on Linux it carries information
+// about the cpu/core the code is currently running on.
+// It returns 0 when the rdtscp feature bit is not set.
+func (c cpuInfo) ia32tscaux() uint32 {
+	if c.rdtscp() {
+		_, _, aux, _ := rdtscpAsm()
+		return aux
+	}
+	return 0
+}
+
+// logicalcpu returns the logical CPU the code is executing on right now,
+// taken from the top byte of EBX of CPUID leaf 1. The answer is likely to
+// change whenever the OS re-schedules the running thread onto another CPU.
+// It returns -1 when leaf 1 is not available (c.maxFunc < 1).
+func (c cpuInfo) logicalcpu() int {
+	if c.maxFunc >= 1 {
+		_, ebx, _, _ := cpuid(1)
+		return int(ebx >> 24)
+	}
+	return -1
+}
+
+// vm reports whether the vendor ID is one of the known hypervisor IDs
+// (msvm, kvm, vmware, xenhvm). This is only a hint and will very likely
+// have many false negatives.
+func (c cpuInfo) vm() bool {
+	id := c.vendorid
+	return id == msvm || id == kvm || id == vmware || id == xenhvm
+}
+
+// flags contains detected CPU features and characteristics as a bit set.
+type flags uint64
+
+// String renders the feature names produced by f.strings as a single
+// comma-separated string.
+func (f flags) String() string {
+	names := f.strings()
+	return strings.Join(names, ",")
+}
+
+// strings returns the names of the features set in f, ordered from the
+// lowest bit to the highest.
+//
+// The names come from flagNames; bits that have no entry there are
+// skipped. The result describes the receiver itself rather than
+// re-querying the CPU, so any flags value is rendered faithfully.
+func (f flags) strings() []string {
+	r := make([]string, 0, 20)
+	for i := uint(0); i < 64; i++ {
+		key := flags(1 << i)
+		if f&key == 0 {
+			continue
+		}
+		if name, ok := flagNames[key]; ok {
+			r = append(r, name)
+		}
+	}
+	return r
+}
+
+// maxExtendedFunction returns the EAX result of CPUID leaf 0x80000000.
+func maxExtendedFunction() uint32 {
+	highest, _, _, _ := cpuid(0x80000000)
+	return highest
+}
+
+// maxFunctionID returns the EAX result of CPUID leaf 0.
+func maxFunctionID() uint32 {
+	highest, _, _, _ := cpuid(0)
+	return highest
+}
+
+// brandName assembles the brand string from CPUID leaves 0x80000002
+// through 0x80000004 and trims surrounding spaces. It returns "unknown"
+// when those extended leaves are not available.
+func brandName() string {
+	if maxExtendedFunction() < 0x80000004 {
+		return "unknown"
+	}
+	regs := make([]uint32, 0, 48)
+	for leaf := uint32(0x80000002); leaf <= 0x80000004; leaf++ {
+		eax, ebx, ecx, edx := cpuid(leaf)
+		regs = append(regs, eax, ebx, ecx, edx)
+	}
+	return strings.Trim(string(valAsString(regs...)), " ")
+}
+
+// threadsPerCore returns the number of threads per physical core.
+//
+// The result is always at least 1: 1 is returned for non-Intel vendors,
+// when CPUID leaf 4 is unavailable, and whenever the reported values do
+// not yield a usable ratio. Callers (physicalCores) divide by this value,
+// so it must never be 0.
+func threadsPerCore() int {
+	mfi := maxFunctionID()
+	if mfi < 0x4 || vendorID() != intel {
+		return 1
+	}
+
+	if mfi < 0xb {
+		_, b, _, d := cpuid(1)
+		if (d & (1 << 28)) != 0 {
+			// v will contain logical core count
+			v := (b >> 16) & 255
+			if v > 1 {
+				a4, _, _, _ := cpuid(4)
+				// physical cores; always >= 1, so the division is safe
+				v2 := (a4 >> 26) + 1
+				// The quotient truncates to 0 when fewer logical than
+				// physical cores are reported; fall back to 1 then.
+				if t := int(v) / int(v2); t > 0 {
+					return t
+				}
+			}
+		}
+		return 1
+	}
+	_, b, _, _ := cpuidex(0xb, 0)
+	if b&0xffff == 0 {
+		return 1
+	}
+	return int(b & 0xffff)
+}
+
+// logicalCores returns the logical core count for Intel and AMD CPUs and
+// 0 for every other vendor.
+func logicalCores() int {
+	maxLeaf := maxFunctionID()
+	vend := vendorID()
+
+	if vend == amd {
+		_, ebx, _, _ := cpuid(1)
+		return int((ebx >> 16) & 0xff)
+	}
+	if vend != intel {
+		return 0
+	}
+
+	if maxLeaf >= 0xb {
+		_, ebx, _, _ := cpuidex(0xb, 1)
+		return int(ebx & 0xffff)
+	}
+
+	// Older Intel processors without leaf 0xb.
+	if maxLeaf < 1 {
+		return 0
+	}
+	// CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID)
+	// that can be assigned to logical processors in a physical package.
+	// The value may not be the same as the number of logical processors that are present in the hardware of a physical package.
+	_, ebx, _, _ := cpuid(1)
+	return int((ebx >> 16) & 0xff)
+}
+
+// familyModel decodes the CPU family and model from the EAX result of
+// CPUID leaf 1, adding the extended fields to the base fields. It returns
+// 0, 0 when leaf 1 is not available.
+func familyModel() (int, int) {
+	if maxFunctionID() < 0x1 {
+		return 0, 0
+	}
+	sig, _, _, _ := cpuid(1)
+	baseFamily, extFamily := (sig>>8)&0xf, (sig>>20)&0xff
+	baseModel, extModel := (sig>>4)&0xf, (sig>>12)&0xf0
+	return int(baseFamily + extFamily), int(baseModel + extModel)
+}
+
+// physicalCores returns the number of physical cores: logical cores
+// divided by threads per core on Intel, and the low byte of ECX of leaf
+// 0x80000008 plus one on AMD. It returns 0 when the count cannot be
+// determined.
+func physicalCores() int {
+	vend := vendorID()
+	if vend == intel {
+		return logicalCores() / threadsPerCore()
+	}
+	if vend == amd && maxExtendedFunction() >= 0x80000008 {
+		_, _, ecx, _ := cpuid(0x80000008)
+		return int(ecx&0xff) + 1
+	}
+	return 0
+}
+
+// vendorMapping translates the 12-byte vendor string read by vendorID
+// into a vendor value.
+// Excerpt from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
+var vendorMapping = map[string]vendor{
+	"AMDisbetter!": amd,
+	"AuthenticAMD": amd,
+	"CentaurHauls": via,
+	"GenuineIntel": intel,
+	"TransmetaCPU": transmeta,
+	"GenuineTMx86": transmeta,
+	"Geode by NSC": nsc,
+	"VIA VIA VIA ": via,
+	"KVMKVMKVMKVM": kvm,
+	"Microsoft Hv": msvm,
+	"VMwareVMware": vmware,
+	"XenVMMXenVMM": xenhvm,
+}
+
+// vendorID reads the vendor string from CPUID leaf 0 (EBX, EDX, ECX in
+// that order) and looks it up in vendorMapping. Unrecognized strings
+// yield other.
+func vendorID() vendor {
+	_, ebx, ecx, edx := cpuid(0)
+	name := string(valAsString(ebx, edx, ecx))
+	if id, known := vendorMapping[name]; known {
+		return id
+	}
+	return other
+}
+
+// cacheLine returns the cache line size in bytes, or 0 when it cannot be
+// determined. The CLFLUSH size from CPUID leaf 1 is preferred; when that
+// is zero, the line size from leaf 0x80000006 is used if available.
+func cacheLine() int {
+	if maxFunctionID() < 0x1 {
+		return 0
+	}
+
+	_, ebx, _, _ := cpuid(1)
+	if clflush := (ebx & 0xff00) >> 5; clflush != 0 { // cflush size
+		return int(clflush)
+	}
+	if maxExtendedFunction() < 0x80000006 {
+		return 0
+	}
+	_, _, ecx, _ := cpuid(0x80000006)
+	// TODO: Read from Cache and TLB Information
+	return int(ecx & 0xff) // cacheline size
+}
+
+// cacheSize fills c.cache with the L1 data, L1 instruction, L2 and L3
+// cache sizes in bytes. Every field is first reset to -1 and keeps that
+// value when the size cannot be determined (other vendors, or the needed
+// CPUID leaves are missing).
+func (c *cpuInfo) cacheSize() {
+	c.cache.l1d = -1
+	c.cache.l1i = -1
+	c.cache.l2 = -1
+	c.cache.l3 = -1
+
+	switch vendorID() {
+	case intel:
+		if maxFunctionID() < 4 {
+			return
+		}
+		// Walk the sub-leaves of CPUID leaf 4 until a cache type of 0 is
+		// reported.
+		for i := uint32(0); ; i++ {
+			eax, ebx, ecx, _ := cpuidex(4, i)
+			cacheType := eax & 15
+			if cacheType == 0 {
+				return
+			}
+			cacheLevel := (eax >> 5) & 7
+			coherency := int(ebx&0xfff) + 1
+			partitions := int((ebx>>12)&0x3ff) + 1
+			associativity := int((ebx>>22)&0x3ff) + 1
+			sets := int(ecx) + 1
+			size := associativity * partitions * coherency * sets
+
+			switch cacheLevel {
+			case 1:
+				switch cacheType {
+				case 1:
+					// 1 = Data Cache
+					c.cache.l1d = size
+				case 2:
+					// 2 = Instruction Cache
+					c.cache.l1i = size
+				default:
+					// Neither data-only nor instruction-only: the size
+					// applies to whichever L1 entry is still unset.
+					if c.cache.l1d < 0 {
+						c.cache.l1d = size
+					}
+					if c.cache.l1i < 0 {
+						c.cache.l1i = size
+					}
+				}
+			case 2:
+				c.cache.l2 = size
+			case 3:
+				c.cache.l3 = size
+			}
+		}
+	case amd:
+		// Untested.
+		if maxExtendedFunction() < 0x80000005 {
+			return
+		}
+		_, _, ecx, edx := cpuid(0x80000005)
+		c.cache.l1d = int(((ecx >> 24) & 0xFF) * 1024)
+		c.cache.l1i = int(((edx >> 24) & 0xFF) * 1024)
+
+		if maxExtendedFunction() < 0x80000006 {
+			return
+		}
+		_, _, ecx, _ = cpuid(0x80000006)
+		c.cache.l2 = int(((ecx >> 16) & 0xFFFF) * 1024)
+	}
+}
+
+// sgxsupport holds the SGX details gathered by hasSGX.
+type sgxsupport struct {
+	available           bool  // the value passed to hasSGX (whether the sgx feature bit was detected)
+	sgx1supported       bool  // bit 0 of EAX from CPUID leaf 0x12, sub-leaf 0
+	sgx2supported       bool  // bit 1 of EAX from CPUID leaf 0x12, sub-leaf 0
+	maxenclavesizenot64 int64 // 2 to the power of EDX[7:0] from CPUID leaf 0x12, sub-leaf 0
+	maxenclavesize64    int64 // 2 to the power of EDX[15:8] from CPUID leaf 0x12, sub-leaf 0
+}
+
+// hasSGX returns the SGX details of the CPU. When available is false only
+// the zero value is returned and CPUID is not queried; otherwise leaf
+// 0x12, sub-leaf 0 supplies the SGX1/SGX2 bits and the two maximum
+// enclave sizes (each a power of two).
+func hasSGX(available bool) (rval sgxsupport) {
+	if !available {
+		return sgxsupport{}
+	}
+
+	eax, _, _, edx := cpuidex(0x12, 0)
+	return sgxsupport{
+		available:           true,
+		sgx1supported:       eax&0x01 != 0,
+		sgx2supported:       eax&0x02 != 0,
+		maxenclavesizenot64: 1 << (edx & 0xFF),        // pow 2
+		maxenclavesize64:    1 << ((edx >> 8) & 0xFF), // pow 2
+	}
+}
+
+// support queries CPUID (and XGETBV where OS support has to be confirmed)
+// and returns the set of detected feature flags. It returns 0 when CPUID
+// leaf 1 is not available.
+func support() flags {
+	mfi := maxFunctionID()
+	vend := vendorID()
+	if mfi < 0x1 {
+		return 0
+	}
+	rval := uint64(0)
+	// Leaf 1: basic feature bits in ECX (c) and EDX (d).
+	_, _, c, d := cpuid(1)
+	if (d & (1 << 15)) != 0 {
+		rval |= cmov
+	}
+	if (d & (1 << 23)) != 0 {
+		rval |= mmx
+	}
+	// EDX bit 25 sets both mmxext and sse.
+	if (d & (1 << 25)) != 0 {
+		rval |= mmxext
+	}
+	if (d & (1 << 25)) != 0 {
+		rval |= sse
+	}
+	if (d & (1 << 26)) != 0 {
+		rval |= sse2
+	}
+	if (c & 1) != 0 {
+		rval |= sse3
+	}
+	if (c & 0x00000200) != 0 {
+		rval |= ssse3
+	}
+	if (c & 0x00080000) != 0 {
+		rval |= sse4
+	}
+	if (c & 0x00100000) != 0 {
+		rval |= sse42
+	}
+	if (c & (1 << 25)) != 0 {
+		rval |= aesni
+	}
+	if (c & (1 << 1)) != 0 {
+		rval |= clmul
+	}
+	if c&(1<<23) != 0 {
+		rval |= popcnt
+	}
+	if c&(1<<30) != 0 {
+		rval |= rdrand
+	}
+	if c&(1<<29) != 0 {
+		rval |= f16c
+	}
+	if c&(1<<13) != 0 {
+		rval |= cx16
+	}
+	// htt is only reported on Intel, and only when more than one thread
+	// per core is actually detected.
+	if vend == intel && (d&(1<<28)) != 0 && mfi >= 4 {
+		if threadsPerCore() > 1 {
+			rval |= htt
+		}
+	}
+
+	// Check XGETBV, OXSAVE and AVX bits
+	if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
+		// Check for OS support
+		eax, _ := xgetbv(0)
+		if (eax & 0x6) == 0x6 {
+			rval |= avx
+			if (c & 0x00001000) != 0 {
+				rval |= fma3
+			}
+		}
+	}
+
+	// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
+	if mfi >= 7 {
+		// Leaf 7, sub-leaf 0. Note that ebx/ecx here are the leaf-7
+		// results, while c still holds ECX of leaf 1.
+		_, ebx, ecx, _ := cpuidex(7, 0)
+		if (rval&avx) != 0 && (ebx&0x00000020) != 0 {
+			rval |= avx2
+		}
+		// bmi2 is only reported together with bmi1.
+		if (ebx & 0x00000008) != 0 {
+			rval |= bmi1
+			if (ebx & 0x00000100) != 0 {
+				rval |= bmi2
+			}
+		}
+		if ebx&(1<<2) != 0 {
+			rval |= sgx
+		}
+		if ebx&(1<<4) != 0 {
+			rval |= hle
+		}
+		if ebx&(1<<9) != 0 {
+			rval |= erms
+		}
+		if ebx&(1<<11) != 0 {
+			rval |= rtm
+		}
+		if ebx&(1<<14) != 0 {
+			rval |= mpx
+		}
+		if ebx&(1<<18) != 0 {
+			rval |= rdseed
+		}
+		if ebx&(1<<19) != 0 {
+			rval |= adx
+		}
+		if ebx&(1<<29) != 0 {
+			rval |= sha
+		}
+
+		// Only detect AVX-512 features if XGETBV is supported
+		if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
+			// Check for OS support
+			eax, _ := xgetbv(0)
+
+			// Verify that XCR0[7:5] = '111b' (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
+			// ZMM16-ZMM31 state are enabled by OS)
+			// and that XCR0[2:1] = '11b' (XMM state and YMM state are enabled by OS).
+			if (eax>>5)&7 == 7 && (eax>>1)&3 == 3 {
+				if ebx&(1<<16) != 0 {
+					rval |= avx512f
+				}
+				if ebx&(1<<17) != 0 {
+					rval |= avx512dq
+				}
+				if ebx&(1<<21) != 0 {
+					rval |= avx512ifma
+				}
+				if ebx&(1<<26) != 0 {
+					rval |= avx512pf
+				}
+				if ebx&(1<<27) != 0 {
+					rval |= avx512er
+				}
+				if ebx&(1<<28) != 0 {
+					rval |= avx512cd
+				}
+				if ebx&(1<<30) != 0 {
+					rval |= avx512bw
+				}
+				if ebx&(1<<31) != 0 {
+					rval |= avx512vl
+				}
+				// ecx
+				if ecx&(1<<1) != 0 {
+					rval |= avx512vbmi
+				}
+			}
+		}
+	}
+
+	if maxExtendedFunction() >= 0x80000001 {
+		// Extended leaf 0x80000001; c and d are shadowed by its ECX/EDX
+		// for the rest of this block.
+		_, _, c, d := cpuid(0x80000001)
+		// ECX bit 5 sets both lzcnt and popcnt.
+		if (c & (1 << 5)) != 0 {
+			rval |= lzcnt
+			rval |= popcnt
+		}
+		if (d & (1 << 31)) != 0 {
+			rval |= amd3dnow
+		}
+		if (d & (1 << 30)) != 0 {
+			rval |= amd3dnowext
+		}
+		if (d & (1 << 23)) != 0 {
+			rval |= mmx
+		}
+		if (d & (1 << 22)) != 0 {
+			rval |= mmxext
+		}
+		if (c & (1 << 6)) != 0 {
+			rval |= sse4a
+		}
+		if d&(1<<20) != 0 {
+			rval |= nx
+		}
+		if d&(1<<27) != 0 {
+			rval |= rdtscp
+		}
+
+		/* Allow for selectively disabling SSE2 functions on AMD processors
+		   with SSE2 support but not SSE4a. This includes Athlon64, some
+		   Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
+		   than SSE2 often enough to utilize this special-case flag.
+		   AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
+		   so that SSE2 is used unless explicitly disabled by checking
+		   AV_CPU_FLAG_SSE2SLOW. */
+		if vendorID() != intel &&
+			rval&sse2 != 0 && (c&0x00000040) == 0 {
+			rval |= sse2slow
+		}
+
+		/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
+		 * used unless the OS has AVX support. */
+		if (rval & avx) != 0 {
+			if (c & 0x00000800) != 0 {
+				rval |= xop
+			}
+			if (c & 0x00010000) != 0 {
+				rval |= fma4
+			}
+		}
+
+		if vendorID() == intel {
+			family, model := familyModel()
+			if family == 6 && (model == 9 || model == 13 || model == 14) {
+				/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
+				 * 6/14 (core1 "yonah") theoretically support sse2, but it's
+				 * usually slower than mmx. */
+				if (rval & sse2) != 0 {
+					rval |= sse2slow
+				}
+				if (rval & sse3) != 0 {
+					rval |= sse3slow
+				}
+			}
+			/* The Atom processor has SSSE3 support, which is useful in many cases,
+			 * but sometimes the SSSE3 version is slower than the SSE2 equivalent
+			 * on the Atom, but is generally faster on other processors supporting
+			 * SSSE3. This flag allows for selectively disabling certain SSSE3
+			 * functions on the Atom. */
+			if family == 6 && model == 28 {
+				rval |= atom
+			}
+		}
+	}
+	return flags(rval)
+}
+
+// valAsString unpacks each value into four bytes, least significant byte
+// first, and returns the bytes up to (but not including) the first zero
+// byte. Without a zero byte the full 4*len(values) bytes are returned.
+func valAsString(values ...uint32) []byte {
+	out := make([]byte, 4*len(values))
+	for idx, word := range values {
+		base := idx * 4
+		// Store the whole word before looking for a terminator.
+		for k := uint(0); k < 4; k++ {
+			out[base+int(k)] = byte(word >> (8 * k))
+		}
+		for k := 0; k < 4; k++ {
+			if out[base+k] == 0 {
+				return out[:base+k]
+			}
+		}
+	}
+	return out
+}
--- a/vendor/github.com/klauspost/cpuid/private/cpuid_386.s
+++ b/vendor/github.com/klauspost/cpuid/private/cpuid_386.s
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+// Runs CPUID with AX = op and CX cleared, then stores AX, BX, CX and DX
+// into the four results.
+TEXT ·asmCpuid(SB), 7, $0
+	XORL CX, CX
+	MOVL op+0(FP), AX
+	CPUID
+	MOVL AX, eax+4(FP)
+	MOVL BX, ebx+8(FP)
+	MOVL CX, ecx+12(FP)
+	MOVL DX, edx+16(FP)
+	RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+// Runs CPUID with AX = op and CX = op2, then stores AX, BX, CX and DX
+// into the four results.
+TEXT ·asmCpuidex(SB), 7, $0
+	MOVL op+0(FP), AX
+	MOVL op2+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32)
+// Loads index into CX, executes XGETBV (emitted as raw bytes) and stores
+// AX and DX into the two results.
+TEXT ·asmXgetbv(SB), 7, $0
+	MOVL index+0(FP), CX
+	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+	MOVL AX, eax+4(FP)
+	MOVL DX, edx+8(FP)
+	RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+// Executes RDTSCP (emitted as raw bytes) and stores AX, BX, CX and DX
+// into the four results.
+// NOTE(review): BX is not loaded before it is stored, so the ebx result is
+// presumably whatever the register held; the Go callers in this package
+// ignore it.
+TEXT ·asmRdtscpAsm(SB), 7, $0
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
+	MOVL AX, eax+0(FP)
+	MOVL BX, ebx+4(FP)
+	MOVL CX, ecx+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
--- a/vendor/github.com/klauspost/cpuid/private/cpuid_amd64.s
+++ b/vendor/github.com/klauspost/cpuid/private/cpuid_amd64.s
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+//+build amd64,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+// Runs CPUID with AX = op and CX cleared, then stores AX, BX, CX and DX
+// into the four results (which start at offset 8 of the frame).
+TEXT ·asmCpuid(SB), 7, $0
+	XORQ CX, CX
+	MOVL op+0(FP), AX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+// Runs CPUID with AX = op and CX = op2, then stores AX, BX, CX and DX
+// into the four results.
+TEXT ·asmCpuidex(SB), 7, $0
+	MOVL op+0(FP), AX
+	MOVL op2+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32)
+// Loads index into CX, executes XGETBV (emitted as raw bytes) and stores
+// AX and DX into the two results.
+TEXT ·asmXgetbv(SB), 7, $0
+	MOVL index+0(FP), CX
+	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+	MOVL AX, eax+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+// Executes RDTSCP (emitted as raw bytes) and stores AX, BX, CX and DX
+// into the four results.
+// NOTE(review): BX is not loaded before it is stored, so the ebx result is
+// presumably whatever the register held; the Go callers in this package
+// ignore it.
+TEXT ·asmRdtscpAsm(SB), 7, $0
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
+	MOVL AX, eax+0(FP)
+	MOVL BX, ebx+4(FP)
+	MOVL CX, ecx+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
--- a/vendor/github.com/klauspost/cpuid/private/cpuid_detect_intel.go
+++ b/vendor/github.com/klauspost/cpuid/private/cpuid_detect_intel.go
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo amd64,!gccgo
+
+package cpuid
+
+// The following functions have no Go body; they are implemented in the
+// accompanying assembly files (cpuid_386.s and cpuid_amd64.s).
+func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+func asmXgetbv(index uint32) (eax, edx uint32)
+func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+
+// initCPU points the package-level CPU query hooks at the assembly
+// implementations.
+func initCPU() {
+	cpuid, cpuidex = asmCpuid, asmCpuidex
+	xgetbv, rdtscpAsm = asmXgetbv, asmRdtscpAsm
+}
--- a/vendor/github.com/klauspost/cpuid/private/cpuid_detect_ref.go
+++ b/vendor/github.com/klauspost/cpuid/private/cpuid_detect_ref.go
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build !amd64,!386 gccgo
+
+package cpuid
+
+// initCPU installs stand-in CPU query hooks that report zero in every
+// result register, for builds where the assembly versions are excluded.
+func initCPU() {
+	// With named results, a bare return yields all zeros.
+	cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { return }
+	cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { return }
+	xgetbv = func(index uint32) (eax, edx uint32) { return }
+	rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { return }
+}
--- a/vendor/manifest
+++ b/vendor/manifest
@@ -117,6 +117,14 @@
 			"path": "/basic",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/klauspost/cpuid",
+			"repository": "https://github.com/klauspost/cpuid",
+			"vcs": "git",
+			"revision": "ae832f27941af41db13bd6d8efd2493e3b22415a",
+			"branch": "master",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/lucas-clemente/aes12",
 			"repository": "https://github.com/lucas-clemente/aes12",