package filters import ( "fmt" "unicode" "unicode/utf8" ) const ( tokenEOF = -(iota + 1) tokenQuoted tokenValue tokenField tokenSeparator tokenOperator tokenIllegal ) type token rune func (t token) String() string { switch t { case tokenEOF: return "EOF" case tokenQuoted: return "Quoted" case tokenValue: return "Value" case tokenField: return "Field" case tokenSeparator: return "Separator" case tokenOperator: return "Operator" case tokenIllegal: return "Illegal" } return string(t) } func (t token) GoString() string { return "token" + t.String() } type scanner struct { input string pos int ppos int // bounds the current rune in the string value bool } func (s *scanner) init(input string) { s.input = input s.pos = 0 s.ppos = 0 } func (s *scanner) next() rune { if s.pos >= len(s.input) { return tokenEOF } s.pos = s.ppos r, w := utf8.DecodeRuneInString(s.input[s.ppos:]) s.ppos += w if r == utf8.RuneError { if w > 0 { return tokenIllegal } return tokenEOF } if r == 0 { return tokenIllegal } return r } func (s *scanner) peek() rune { pos := s.pos ppos := s.ppos ch := s.next() s.pos = pos s.ppos = ppos return ch } func (s *scanner) scan() (nextp int, tk token, text string) { var ( ch = s.next() pos = s.pos ) chomp: switch { case ch == tokenEOF: case ch == tokenIllegal: case isQuoteRune(ch): s.scanQuoted(ch) return pos, tokenQuoted, s.input[pos:s.ppos] case isSeparatorRune(ch): s.value = false return pos, tokenSeparator, s.input[pos:s.ppos] case isOperatorRune(ch): s.scanOperator() s.value = true return pos, tokenOperator, s.input[pos:s.ppos] case unicode.IsSpace(ch): // chomp ch = s.next() pos = s.pos goto chomp case s.value: s.scanValue() s.value = false return pos, tokenValue, s.input[pos:s.ppos] case isFieldRune(ch): s.scanField() return pos, tokenField, s.input[pos:s.ppos] } return s.pos, token(ch), "" } func (s *scanner) scanField() { for { ch := s.peek() if !isFieldRune(ch) { break } s.next() } } func (s *scanner) scanOperator() { for { ch := s.peek() switch ch { case '=', '!', '~': s.next() default: return } } } func (s *scanner) scanValue() { for { ch := s.peek() if !isValueRune(ch) { break } s.next() } } func (s *scanner) scanQuoted(quote rune) { ch := s.next() // read character after quote for ch != quote { if ch == '\n' || ch < 0 { s.error("literal not terminated") return } if ch == '\\' { ch = s.scanEscape(quote) } else { ch = s.next() } } return } func (s *scanner) scanEscape(quote rune) rune { ch := s.next() // read character after '/' switch ch { case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: // nothing to do ch = s.next() case '0', '1', '2', '3', '4', '5', '6', '7': ch = s.scanDigits(ch, 8, 3) case 'x': ch = s.scanDigits(s.next(), 16, 2) case 'u': ch = s.scanDigits(s.next(), 16, 4) case 'U': ch = s.scanDigits(s.next(), 16, 8) default: s.error("illegal char escape") } return ch } func (s *scanner) scanDigits(ch rune, base, n int) rune { for n > 0 && digitVal(ch) < base { ch = s.next() n-- } if n > 0 { s.error("illegal char escape") } return ch } func (s *scanner) error(msg string) { fmt.Println("error fixme", msg) } func digitVal(ch rune) int { switch { case '0' <= ch && ch <= '9': return int(ch - '0') case 'a' <= ch && ch <= 'f': return int(ch - 'a' + 10) case 'A' <= ch && ch <= 'F': return int(ch - 'A' + 10) } return 16 // larger than any legal digit val } func isFieldRune(r rune) bool { return (r == '_' || isAlphaRune(r) || isDigitRune(r)) } func isAlphaRune(r rune) bool { return r >= 'A' && r <= 'Z' || r >= 'a' && r <= 'z' } func isDigitRune(r rune) bool { return r >= '0' && r <= '9' } func isOperatorRune(r rune) bool { switch r { case '=', '!', '~': return true } return false } func isQuoteRune(r rune) bool { switch r { case '/', '|', '"': // maybe add single quoting? return true } return false } func isSeparatorRune(r rune) bool { switch r { case ',', '.': return true } return false } func isValueRune(r rune) bool { return r != ',' && !unicode.IsSpace(r) && (unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsNumber(r) || unicode.IsGraphic(r) || unicode.IsPunct(r)) }