gcc/libgo/go/regexp/syntax/regexp.go
Ian Lance Taylor f8d9fa9e80 libgo, compiler: Upgrade libgo to Go 1.4, except for runtime.
This upgrades all of libgo other than the runtime package to
the Go 1.4 release.  In Go 1.4 much of the runtime was
rewritten into Go.  Merging that code will take more time and
will not change the API, so I'm putting it off for now.

There are a few runtime changes anyhow, to accomodate other
packages that rely on minor modifications to the runtime
support.

The compiler changes slightly to add a one-bit flag to each
type descriptor kind that is stored directly in an interface,
which for gccgo is currently only pointer types.  Another
one-bit flag (gcprog) is reserved because it is used by the gc
compiler, but gccgo does not currently use it.

There is another error check in the compiler since I ran
across it during testing.

gotools/:
	* Makefile.am (go_cmd_go_files): Sort entries.  Add generate.go.
	* Makefile.in: Rebuild.

From-SVN: r219627
2015-01-15 00:27:56 +00:00

319 lines
7.6 KiB
Go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.
import (
"bytes"
"strconv"
"strings"
"unicode"
)
// A Regexp is a node in a regular expression syntax tree.
type Regexp struct {
Op Op // operator
Flags Flags
Sub []*Regexp // subexpressions, if any
Sub0 [1]*Regexp // storage for short Sub
Rune []rune // matched runes, for OpLiteral, OpCharClass
Rune0 [2]rune // storage for short Rune
Min, Max int // min, max for OpRepeat
Cap int // capturing index, for OpCapture
Name string // capturing name, for OpCapture
}
// An Op is a single regular expression operator.
type Op uint8
// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
const (
OpNoMatch Op = 1 + iota // matches no strings
OpEmptyMatch // matches empty string
OpLiteral // matches Runes sequence
OpCharClass // matches Runes interpreted as range pair list
OpAnyCharNotNL // matches any character except newline
OpAnyChar // matches any character
OpBeginLine // matches empty string at beginning of line
OpEndLine // matches empty string at end of line
OpBeginText // matches empty string at beginning of text
OpEndText // matches empty string at end of text
OpWordBoundary // matches word boundary `\b`
OpNoWordBoundary // matches word non-boundary `\B`
OpCapture // capturing subexpression with index Cap, optional name Name
OpStar // matches Sub[0] zero or more times
OpPlus // matches Sub[0] one or more times
OpQuest // matches Sub[0] zero or one times
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
OpConcat // matches concatenation of Subs
OpAlternate // matches alternation of Subs
)
const opPseudo Op = 128 // where pseudo-ops start
// Equal returns true if x and y have identical structure.
func (x *Regexp) Equal(y *Regexp) bool {
if x == nil || y == nil {
return x == y
}
if x.Op != y.Op {
return false
}
switch x.Op {
case OpEndText:
// The parse flags remember whether this is \z or \Z.
if x.Flags&WasDollar != y.Flags&WasDollar {
return false
}
case OpLiteral, OpCharClass:
if len(x.Rune) != len(y.Rune) {
return false
}
for i, r := range x.Rune {
if r != y.Rune[i] {
return false
}
}
case OpAlternate, OpConcat:
if len(x.Sub) != len(y.Sub) {
return false
}
for i, sub := range x.Sub {
if !sub.Equal(y.Sub[i]) {
return false
}
}
case OpStar, OpPlus, OpQuest:
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
case OpRepeat:
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
case OpCapture:
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
}
return true
}
// writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *bytes.Buffer, re *Regexp) {
switch re.Op {
default:
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
case OpNoMatch:
b.WriteString(`[^\x00-\x{10FFFF}]`)
case OpEmptyMatch:
b.WriteString(`(?:)`)
case OpLiteral:
if re.Flags&FoldCase != 0 {
b.WriteString(`(?i:`)
}
for _, r := range re.Rune {
escape(b, r, false)
}
if re.Flags&FoldCase != 0 {
b.WriteString(`)`)
}
case OpCharClass:
if len(re.Rune)%2 != 0 {
b.WriteString(`[invalid char class]`)
break
}
b.WriteRune('[')
if len(re.Rune) == 0 {
b.WriteString(`^\x00-\x{10FFFF}`)
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
// Contains 0 and MaxRune. Probably a negated class.
// Print the gaps.
b.WriteRune('^')
for i := 1; i < len(re.Rune)-1; i += 2 {
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
escape(b, lo, lo == '-')
if lo != hi {
b.WriteRune('-')
escape(b, hi, hi == '-')
}
}
} else {
for i := 0; i < len(re.Rune); i += 2 {
lo, hi := re.Rune[i], re.Rune[i+1]
escape(b, lo, lo == '-')
if lo != hi {
b.WriteRune('-')
escape(b, hi, hi == '-')
}
}
}
b.WriteRune(']')
case OpAnyCharNotNL:
b.WriteString(`(?-s:.)`)
case OpAnyChar:
b.WriteString(`(?s:.)`)
case OpBeginLine:
b.WriteRune('^')
case OpEndLine:
b.WriteRune('$')
case OpBeginText:
b.WriteString(`\A`)
case OpEndText:
if re.Flags&WasDollar != 0 {
b.WriteString(`(?-m:$)`)
} else {
b.WriteString(`\z`)
}
case OpWordBoundary:
b.WriteString(`\b`)
case OpNoWordBoundary:
b.WriteString(`\B`)
case OpCapture:
if re.Name != "" {
b.WriteString(`(?P<`)
b.WriteString(re.Name)
b.WriteRune('>')
} else {
b.WriteRune('(')
}
if re.Sub[0].Op != OpEmptyMatch {
writeRegexp(b, re.Sub[0])
}
b.WriteRune(')')
case OpStar, OpPlus, OpQuest, OpRepeat:
if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
b.WriteString(`(?:`)
writeRegexp(b, sub)
b.WriteString(`)`)
} else {
writeRegexp(b, sub)
}
switch re.Op {
case OpStar:
b.WriteRune('*')
case OpPlus:
b.WriteRune('+')
case OpQuest:
b.WriteRune('?')
case OpRepeat:
b.WriteRune('{')
b.WriteString(strconv.Itoa(re.Min))
if re.Max != re.Min {
b.WriteRune(',')
if re.Max >= 0 {
b.WriteString(strconv.Itoa(re.Max))
}
}
b.WriteRune('}')
}
if re.Flags&NonGreedy != 0 {
b.WriteRune('?')
}
case OpConcat:
for _, sub := range re.Sub {
if sub.Op == OpAlternate {
b.WriteString(`(?:`)
writeRegexp(b, sub)
b.WriteString(`)`)
} else {
writeRegexp(b, sub)
}
}
case OpAlternate:
for i, sub := range re.Sub {
if i > 0 {
b.WriteRune('|')
}
writeRegexp(b, sub)
}
}
}
func (re *Regexp) String() string {
var b bytes.Buffer
writeRegexp(&b, re)
return b.String()
}
const meta = `\.+*?()|[]{}^$`
func escape(b *bytes.Buffer, r rune, force bool) {
if unicode.IsPrint(r) {
if strings.IndexRune(meta, r) >= 0 || force {
b.WriteRune('\\')
}
b.WriteRune(r)
return
}
switch r {
case '\a':
b.WriteString(`\a`)
case '\f':
b.WriteString(`\f`)
case '\n':
b.WriteString(`\n`)
case '\r':
b.WriteString(`\r`)
case '\t':
b.WriteString(`\t`)
case '\v':
b.WriteString(`\v`)
default:
if r < 0x100 {
b.WriteString(`\x`)
s := strconv.FormatInt(int64(r), 16)
if len(s) == 1 {
b.WriteRune('0')
}
b.WriteString(s)
break
}
b.WriteString(`\x{`)
b.WriteString(strconv.FormatInt(int64(r), 16))
b.WriteString(`}`)
}
}
// MaxCap walks the regexp to find the maximum capture index.
func (re *Regexp) MaxCap() int {
m := 0
if re.Op == OpCapture {
m = re.Cap
}
for _, sub := range re.Sub {
if n := sub.MaxCap(); m < n {
m = n
}
}
return m
}
// CapNames walks the regexp to find the names of capturing groups.
func (re *Regexp) CapNames() []string {
names := make([]string, re.MaxCap()+1)
re.capNames(names)
return names
}
func (re *Regexp) capNames(names []string) {
if re.Op == OpCapture {
names[re.Cap] = re.Name
}
for _, sub := range re.Sub {
sub.capNames(names)
}
}