Browse code

ProcessWord: support UTF-8. Modified ProcessWord to work correctly with UTF-8 strings and added test cases.

Signed-off-by: Daehyeok Mun <daehyeok@gmail.com>

Daehyeok Mun authored on 2015/10/19 12:55:53
Showing 3 changed files
... ...
@@ -9,13 +9,15 @@ package dockerfile
9 9
 import (
10 10
 	"fmt"
11 11
 	"strings"
12
+	"text/scanner"
12 13
 	"unicode"
13 14
 )
14 15
 
15 16
 type shellWord struct {
16
-	word string
17
-	envs []string
18
-	pos  int
17
+	word    string
18
+	scanner scanner.Scanner
19
+	envs    []string
20
+	pos     int
19 21
 }
20 22
 
21 23
 // ProcessWord will use the 'env' list of environment variables,
... ...
@@ -26,11 +28,12 @@ func ProcessWord(word string, env []string) (string, error) {
26 26
 		envs: env,
27 27
 		pos:  0,
28 28
 	}
29
+	sw.scanner.Init(strings.NewReader(word))
29 30
 	return sw.process()
30 31
 }
31 32
 
32 33
 func (sw *shellWord) process() (string, error) {
33
-	return sw.processStopOn('\000')
34
+	return sw.processStopOn(scanner.EOF)
34 35
 }
35 36
 
36 37
 // Process the word, starting at 'pos', and stop when we get to the
... ...
@@ -43,10 +46,11 @@ func (sw *shellWord) processStopOn(stopChar rune) (string, error) {
43 43
 		'$':  sw.processDollar,
44 44
 	}
45 45
 
46
-	for sw.pos < len(sw.word) {
47
-		ch := sw.peek()
48
-		if stopChar != '\000' && ch == stopChar {
49
-			sw.next()
46
+	for sw.scanner.Peek() != scanner.EOF {
47
+		ch := sw.scanner.Peek()
48
+
49
+		if stopChar != scanner.EOF && ch == stopChar {
50
+			sw.scanner.Next()
50 51
 			break
51 52
 		}
52 53
 		if fn, ok := charFuncMapping[ch]; ok {
... ...
@@ -58,14 +62,19 @@ func (sw *shellWord) processStopOn(stopChar rune) (string, error) {
58 58
 			result += tmp
59 59
 		} else {
60 60
 			// Not special, just add it to the result
61
-			ch = sw.next()
61
+			ch = sw.scanner.Next()
62
+
62 63
 			if ch == '\\' {
63 64
 				// '\' escapes, except end of line
64
-				ch = sw.next()
65
-				if ch == '\000' {
66
-					continue
65
+
66
+				ch = sw.scanner.Next()
67
+
68
+				if ch == scanner.EOF {
69
+					break
67 70
 				}
71
+
68 72
 			}
73
+
69 74
 			result += string(ch)
70 75
 		}
71 76
 	}
... ...
@@ -73,36 +82,21 @@ func (sw *shellWord) processStopOn(stopChar rune) (string, error) {
73 73
 	return result, nil
74 74
 }
75 75
 
76
-func (sw *shellWord) peek() rune {
77
-	if sw.pos == len(sw.word) {
78
-		return '\000'
79
-	}
80
-	return rune(sw.word[sw.pos])
81
-}
82
-
83
-func (sw *shellWord) next() rune {
84
-	if sw.pos == len(sw.word) {
85
-		return '\000'
86
-	}
87
-	ch := rune(sw.word[sw.pos])
88
-	sw.pos++
89
-	return ch
90
-}
91
-
92 76
 func (sw *shellWord) processSingleQuote() (string, error) {
93 77
 	// All chars between single quotes are taken as-is
94 78
 	// Note, you can't escape '
95 79
 	var result string
96 80
 
97
-	sw.next()
81
+	sw.scanner.Next()
98 82
 
99 83
 	for {
100
-		ch := sw.next()
101
-		if ch == '\000' || ch == '\'' {
84
+		ch := sw.scanner.Next()
85
+		if ch == '\'' || ch == scanner.EOF {
102 86
 			break
103 87
 		}
104 88
 		result += string(ch)
105 89
 	}
90
+
106 91
 	return result, nil
107 92
 }
108 93
 
... ...
@@ -111,12 +105,12 @@ func (sw *shellWord) processDoubleQuote() (string, error) {
111 111
 	// But you can escape " with a \
112 112
 	var result string
113 113
 
114
-	sw.next()
114
+	sw.scanner.Next()
115 115
 
116
-	for sw.pos < len(sw.word) {
117
-		ch := sw.peek()
116
+	for sw.scanner.Peek() != scanner.EOF {
117
+		ch := sw.scanner.Peek()
118 118
 		if ch == '"' {
119
-			sw.next()
119
+			sw.scanner.Next()
120 120
 			break
121 121
 		}
122 122
 		if ch == '$' {
... ...
@@ -126,18 +120,18 @@ func (sw *shellWord) processDoubleQuote() (string, error) {
126 126
 			}
127 127
 			result += tmp
128 128
 		} else {
129
-			ch = sw.next()
129
+			ch = sw.scanner.Next()
130 130
 			if ch == '\\' {
131
-				chNext := sw.peek()
131
+				chNext := sw.scanner.Peek()
132 132
 
133
-				if chNext == '\000' {
133
+				if chNext == scanner.EOF {
134 134
 					// Ignore \ at end of word
135 135
 					continue
136 136
 				}
137 137
 
138 138
 				if chNext == '"' || chNext == '$' {
139 139
 					// \" and \$ can be escaped, all other \'s are left as-is
140
-					ch = sw.next()
140
+					ch = sw.scanner.Next()
141 141
 				}
142 142
 			}
143 143
 			result += string(ch)
... ...
@@ -148,23 +142,23 @@ func (sw *shellWord) processDoubleQuote() (string, error) {
148 148
 }
149 149
 
150 150
 func (sw *shellWord) processDollar() (string, error) {
151
-	sw.next()
152
-	ch := sw.peek()
151
+	sw.scanner.Next()
152
+	ch := sw.scanner.Peek()
153 153
 	if ch == '{' {
154
-		sw.next()
154
+		sw.scanner.Next()
155 155
 		name := sw.processName()
156
-		ch = sw.peek()
156
+		ch = sw.scanner.Peek()
157 157
 		if ch == '}' {
158 158
 			// Normal ${xx} case
159
-			sw.next()
159
+			sw.scanner.Next()
160 160
 			return sw.getEnv(name), nil
161 161
 		}
162 162
 		if ch == ':' {
163 163
 			// Special ${xx:...} format processing
164 164
 			// Yes it allows for recursive $'s in the ... spot
165 165
 
166
-			sw.next() // skip over :
167
-			modifier := sw.next()
166
+			sw.scanner.Next() // skip over :
167
+			modifier := sw.scanner.Next()
168 168
 
169 169
 			word, err := sw.processStopOn('}')
170 170
 			if err != nil {
... ...
@@ -207,16 +201,16 @@ func (sw *shellWord) processName() string {
207 207
 	// If it starts with a numeric then just return $#
208 208
 	var name string
209 209
 
210
-	for sw.pos < len(sw.word) {
211
-		ch := sw.peek()
210
+	for sw.scanner.Peek() != scanner.EOF {
211
+		ch := sw.scanner.Peek()
212 212
 		if len(name) == 0 && unicode.IsDigit(ch) {
213
-			ch = sw.next()
213
+			ch = sw.scanner.Next()
214 214
 			return string(ch)
215 215
 		}
216 216
 		if !unicode.IsLetter(ch) && !unicode.IsDigit(ch) && ch != '_' {
217 217
 			break
218 218
 		}
219
-		ch = sw.next()
219
+		ch = sw.scanner.Next()
220 220
 		name += string(ch)
221 221
 	}
222 222
 
... ...
@@ -15,7 +15,7 @@ func TestShellParser(t *testing.T) {
15 15
 	defer file.Close()
16 16
 
17 17
 	scanner := bufio.NewScanner(file)
18
-	envs := []string{"PWD=/home", "SHELL=bash"}
18
+	envs := []string{"PWD=/home", "SHELL=bash", "KOREAN=한국어"}
19 19
 	for scanner.Scan() {
20 20
 		line := scanner.Text()
21 21
 
... ...
@@ -56,3 +56,57 @@ he${PWD:=000}xx          |     error
56 56
 he${PWD:+${PWD}:}xx      |     he/home:xx
57 57
 he${XXX:-\$PWD:}xx       |     he$PWD:xx
58 58
 he${XXX:-\${PWD}z}xx     |     he${PWDz}xx
59
+안녕하세요                 |     안녕하세요
60
+안'녕'하세요               |     안녕하세요
61
+안'녕하세요                |     안녕하세요
62
+안녕\'하세요               |     안녕'하세요
63
+안\\'녕하세요              |     안\녕하세요
64
+안녕\t하세요               |     안녕t하세요
65
+"안녕\t하세요"             |     안녕\t하세요
66
+'안녕\t하세요              |     안녕\t하세요
67
+안녕하세요\                |     안녕하세요
68
+안녕하세요\\               |     안녕하세요\
69
+"안녕하세요                |     안녕하세요
70
+"안녕하세요\"              |     안녕하세요"
71
+"안녕'하세요"              |     안녕'하세요
72
+'안녕하세요                |     안녕하세요
73
+'안녕하세요\'              |     안녕하세요\
74
+안녕$1x                    |     안녕x
75
+안녕$.x                    |     안녕$.x
76
+안녕$pwd.                  |     안녕.
77
+안녕$PWD                   |     안녕/home
78
+안녕\$PWD                  |     안녕$PWD
79
+안녕\\$PWD                 |     안녕\/home
80
+안녕\${}                   |     안녕${}
81
+안녕\${}xx                 |     안녕${}xx
82
+안녕${}                    |     안녕
83
+안녕${}xx                  |     안녕xx
84
+안녕${hi}                  |     안녕
85
+안녕${hi}xx                |     안녕xx
86
+안녕${PWD}                 |     안녕/home
87
+안녕${.}                   |     error
88
+안녕${XXX:-000}xx          |     안녕000xx
89
+안녕${PWD:-000}xx          |     안녕/homexx
90
+안녕${XXX:-$PWD}xx         |     안녕/homexx
91
+안녕${XXX:-${PWD:-yyy}}xx  |     안녕/homexx
92
+안녕${XXX:-${YYY:-yyy}}xx  |     안녕yyyxx
93
+안녕${XXX:YYY}             |     error
94
+안녕${XXX:+${PWD}}xx       |     안녕xx
95
+안녕${PWD:+${XXX}}xx       |     안녕xx
96
+안녕${PWD:+${SHELL}}xx     |     안녕bashxx
97
+안녕${XXX:+000}xx          |     안녕xx
98
+안녕${PWD:+000}xx          |     안녕000xx
99
+'안녕${XX}'                |     안녕${XX}
100
+"안녕${PWD}"               |     안녕/home
101
+"안녕'$PWD'"               |     안녕'/home'
102
+'"안녕"'                   |     "안녕"
103
+안녕\$PWD                  |     안녕$PWD
104
+"안녕\$PWD"                |     안녕$PWD
105
+'안녕\$PWD'                |     안녕\$PWD
106
+안녕${PWD                  |     error
107
+안녕${PWD:=000}xx          |     error
108
+안녕${PWD:+${PWD}:}xx      |     안녕/home:xx
109
+안녕${XXX:-\$PWD:}xx       |     안녕$PWD:xx
110
+안녕${XXX:-\${PWD}z}xx     |     안녕${PWDz}xx
111
+$KOREAN                    |     한국어
112
+안녕$KOREAN                |     안녕한국어