#=======================================================================
#
#   Python Lexical Analyser
#
#   Traditional Regular Expression Syntax
#
#=======================================================================

from Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
from Errors import PlexError

class RegexpSyntaxError(PlexError):
  pass
  
def re(s):
  """
  Convert traditional string representation of regular expression |s|
  into Plex representation.
  """
  return REParser(s).parse_re()

class REParser(object):

  def __init__(self, s):
    self.s = s
    self.i = -1
    self.end = 0
    self.next()
  
  def parse_re(self):
    re = self.parse_alt()
    if not self.end:
      self.error("Unexpected %s" % repr(self.c))
    return re
  
  def parse_alt(self):
    """Parse a set of alternative regexps."""
    re = self.parse_seq()
    if self.c == '|':
      re_list = [re]
      while self.c == '|':
        self.next()
        re_list.append(self.parse_seq())
      re = apply(Alt, tuple(re_list))
    return re
      
  def parse_seq(self):
    """Parse a sequence of regexps."""
    re_list = []
    while not self.end and not self.c in "|)":
      re_list.append(self.parse_mod())
    return apply(Seq, tuple(re_list))
  
  def parse_mod(self):
    """Parse a primitive regexp followed by *, +, ? modifiers."""
    re = self.parse_prim()
    while not self.end and self.c in "*+?":
      if self.c == '*':
        re = Rep(re)
      elif self.c == '+':
        re = Rep1(re)
      else: # self.c == '?'
        re = Opt(re)
      self.next()
    return re

  def parse_prim(self):
    """Parse a primitive regexp."""
    c = self.get()
    if c == '.':
      re = AnyBut("\n")
    elif c == '^':
      re = Bol
    elif c == '$':
      re = Eol
    elif c == '(':
      re = self.parse_alt()
      self.expect(')')
    elif c == '[':
      re = self.parse_charset()
      self.expect(']')
    else:
      if c == '\\':
        c = self.get()
      re = Char(c)
    return re
  
  def parse_charset(self):
    """Parse a charset. Does not include the surrounding []."""
    char_list = []
    invert = 0
    if self.c == '^':
      invert = 1
      self.next()
    if self.c == ']':
      char_list.append(']')
      self.next()
    while not self.end and self.c != ']':
      c1 = self.get()
      if self.c == '-' and self.lookahead(1) != ']':
        self.next()
        c2 = self.get()
        for a in xrange(ord(c1), ord(c2) + 1):
          char_list.append(chr(a))
      else:
        char_list.append(c1)
    chars = ''.join(char_list)
    if invert:
      return AnyBut(chars)
    else:
      return Any(chars)
  
  def next(self):
    """Advance to the next char."""
    s = self.s
    i = self.i = self.i + 1
    if i < len(s):
      self.c = s[i]
    else:
      self.c = ''
      self.end = 1
    
  def get(self):
    if self.end:
      self.error("Premature end of string")
    c = self.c
    self.next()
    return c
    
  def lookahead(self, n):
    """Look ahead n chars."""
    j = self.i + n
    if j < len(self.s):
      return self.s[j]
    else:
      return ''

  def expect(self, c):
    """
    Expect to find character |c| at current position.
    Raises an exception otherwise.
    """
    if self.c == c:
      self.next()
    else:
      self.error("Missing %s" % repr(c))
  
  def error(self, mess):
    """Raise exception to signal syntax error in regexp."""
    raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
      repr(self.s), self.i, mess))