-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchars.go
382 lines (345 loc) · 9.58 KB
/
chars.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
/*
chars.go
-John Taylor
Jan-16-2022
Determine the end-of-line format, tabs, bom, and nul characters
https://github.com/jftuga/chars
*/
package chars
import (
"bufio"
"bytes"
"encoding/json"
"fmt"
"io"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"unicode/utf8"
"github.com/jftuga/ellipsis"
"github.com/olekukonko/tablewriter"
)
// Program metadata and tuning constants.
const (
	// PgmName is the name of the program.
	PgmName string = "chars"
	// PgmDesc is a one-line description of what the program does.
	PgmDesc string = "Determine the end-of-line format, tabs, bom, and nul"
	// PgmUrl is the project's home page.
	PgmUrl string = "https://github.com/jftuga/chars"
	// PgmVersion is the program's version string.
	PgmVersion string = "2.4.1"
	// BlockSize is the capacity, in bytes, of the buffer used when input is
	// read in chunks.
	BlockSize int = 4096
)
// SpecialChars holds the special-character counts collected for one input
// (a file, or the STDIN stream).
type SpecialChars struct {
	// Filename is the name the input was scanned under ("STDIN" for standard input).
	Filename string `json:"filename"`
	// Crlf is the number of "\r\n" sequences.
	Crlf uint64 `json:"crlf"`
	// Lf is the number of "\n" bytes that are not preceded by "\r".
	Lf uint64 `json:"lf"`
	// Tab is the number of tab bytes.
	Tab uint64 `json:"tab"`
	// Bom8 is non-zero when the input starts with a UTF-8 byte order mark.
	Bom8 uint64 `json:"bom8"`
	// Bom16 is non-zero when the input starts with a UTF-16 (LE or BE) byte order mark.
	Bom16 uint64 `json:"bom16"`
	// Nul is the number of 0x00 bytes.
	Nul uint64 `json:"nul"`
	// BytesRead is the total number of bytes read from the input.
	BytesRead uint64 `json:"bytesRead"`
	// Failure is set by GetFailures when the input contains any of the
	// character classes that were requested as failures.
	Failure bool `json:"failure"`
}
// CharsError is the status value returned alongside a scan result.
//
// As used by searchForSpecialChars, code 0 means success, code 1 means a read
// error occurred, and code 2 means the input was skipped because it looked
// like an unwanted binary file.
type CharsError struct {
	// code is the numeric status; zero means no error.
	code int
	// err is the human-readable message; empty on success.
	err string
}
// isText reports whether the first n bytes of s look like text.
//
// The input is treated as binary (false is returned) when at least 2% of its
// bytes are non-printable: a rune counts as non-printable when it is the
// Unicode replacement character (which is also what invalid UTF-8 decodes to)
// or a control character below space other than "\n", "\t", "\f", "\r" and NUL.
//
// n is clamped to the range [0, len(s)], so a caller may pass a nominal sample
// size that is larger than the data actually available. Without the clamp,
// s[0:n] would either panic (n > cap(s)) or silently extend the slice over
// backing-array bytes that are not part of the input. Empty input is
// considered text.
func isText(s []byte, n int) bool {
	const binaryCutoff float32 = 0.02

	if n < 0 {
		n = 0
	}
	if n < len(s) {
		s = s[:n]
	}
	if len(s) == 0 {
		// Nothing to inspect; this also avoids a 0/0 division below.
		return true
	}

	bin := 0
	for i, c := range string(s) {
		if i <= 3 {
			continue // skip bom characters
		}
		if i+utf8.UTFMax > len(s) {
			break // last char may be incomplete - ignore
		}
		isControl := c < ' ' && c != '\n' && c != '\t' && c != '\f' && c != '\r' && c != 0x00
		if c == utf8.RuneError || isControl {
			bin++
		}
	}

	return float32(bin)/float32(len(s)) < binaryCutoff
}
// searchForSpecialChars scans rdr and counts CRLF and LF line endings, tabs,
// NUL bytes and a leading UTF-8 / UTF-16 byte order mark.
//
// The input is consumed in BlockSize chunks so that memory use stays constant
// regardless of input size. A *bufio.Reader is taken so that either a file or
// STDIN can be scanned, and so the leading bytes can be peeked at without
// consuming them.
//
// filename is only used to label the result and messages. When examineBinary
// is false, the leading bytes are sampled with isText and input that looks
// binary is skipped.
//
// The returned CharsError has code 0 on success, code 1 when reading failed
// and code 2 when the input was skipped as binary; on a non-zero code the
// returned SpecialChars is the zero value.
func searchForSpecialChars(filename string, rdr *bufio.Reader, examineBinary bool) (SpecialChars, CharsError) {
	// sampleSize is how many leading bytes are inspected to tell text from binary.
	const sampleSize = 1024

	var (
		bomUtf8    = []byte{0xef, 0xbb, 0xbf}
		bomUtf16le = []byte{0xff, 0xfe}
		bomUtf16be = []byte{0xfe, 0xff}
	)

	// check for a BOM
	// https://en.wikipedia.org/wiki/Byte_order_mark
	// Peek only succeeds when the requested number of bytes is available, so a
	// nil error guarantees the slice has exactly the BOM's length.
	var bom8, bom16 uint64
	if bom, err := rdr.Peek(len(bomUtf16le)); err == nil {
		if bytes.Equal(bom, bomUtf16le) || bytes.Equal(bom, bomUtf16be) {
			bom16++
		}
	}
	if bom16 == 0 {
		if bom, err := rdr.Peek(len(bomUtf8)); err == nil && bytes.Equal(bom, bomUtf8) {
			bom8++
		}
	}

	// check if file contains binary data
	// A short input makes Peek return fewer bytes together with io.EOF; that
	// is expected and not reported.
	firstBlock, err := rdr.Peek(sampleSize)
	if err != nil && err != io.EOF {
		_, _ = fmt.Fprintf(os.Stderr, "error: %s\n", err)
	}
	// Pass the number of bytes actually peeked, not the requested sample size:
	// bytes beyond len(firstBlock) belong to the reader's internal buffer, not
	// to the input.
	if !examineBinary && !isText(firstBlock, len(firstBlock)) {
		return SpecialChars{}, CharsError{code: 2, err: fmt.Sprintf("skipping unwanted binary file: %s", filename)}
	}

	var tab, lf, crlf, nul, bytesRead uint64
	// last is the previously seen byte; it is kept across chunks so that a
	// "\r\n" pair split over two reads is still counted as CRLF.
	last := byte(0)
	buff := make([]byte, BlockSize)
	for {
		n, readErr := rdr.Read(buff)
		bytesRead += uint64(n)

		for _, b := range buff[:n] {
			switch b {
			case '\x00':
				nul++
			case '\n':
				if last == '\r' {
					crlf++
				} else {
					lf++
				}
			case '\t':
				tab++
			}
			last = b
		}

		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			return SpecialChars{}, CharsError{code: 1, err: readErr.Error()}
		}
	}

	sc := SpecialChars{
		Filename:  filename,
		Crlf:      crlf,
		Lf:        lf,
		Tab:       tab,
		Bom8:      bom8,
		Bom16:     bom16,
		Nul:       nul,
		BytesRead: bytesRead,
	}
	return sc, CharsError{code: 0, err: ""}
}
// sortByName - sorts a slice of SpecialChars by filename
/*func sortByName(entry []SpecialChars) {
sort.Slice(entry, func(i, j int) bool {
if strings.ToLower(entry[i].Filename) > strings.ToLower(entry[j].Filename) {
return false
}
return true
})
}*/
// OutputTextTable writes a text table to STDOUT with one row per entry of
// allStats, showing the filename and each special-character count.
//
// When maxLength is non-zero, filenames are shortened to that length with
// ellipsis.Shorten. When wantTotals is set, a final "TOTALS" row with the
// column sums is appended. When wantCommas is set, numbers are rendered with
// thousands separators via RenderInteger. Nothing is written for an empty
// allStats. The returned error is the result of flushing the output buffer.
func OutputTextTable(allStats []SpecialChars, maxLength int, wantTotals, wantCommas bool) error {
	if len(allStats) == 0 {
		return nil
	}

	// TODO: make this a cmd-line option...
	// sortByName(allStats)

	// number renders one counter in the requested style.
	number := func(v uint64) string {
		if wantCommas {
			return RenderInteger("#,###.", int64(v))
		}
		return strconv.FormatUint(v, 10)
	}
	// buildRow lays out the counters of c in header order behind the given label.
	buildRow := func(label string, c SpecialChars) []string {
		return []string{
			label,
			number(c.Crlf),
			number(c.Lf),
			number(c.Tab),
			number(c.Nul),
			number(c.Bom8),
			number(c.Bom16),
			number(c.BytesRead),
		}
	}

	out := bufio.NewWriter(os.Stdout)
	table := tablewriter.NewWriter(out)
	table.SetHeader([]string{"filename", "crlf", "lf", "tab", "nul", "bom8", "bom16", "bytesRead"})

	// sum accumulates the column totals; only its counter fields are used.
	var sum SpecialChars
	for _, entry := range allStats {
		label := entry.Filename
		if maxLength != 0 {
			label = ellipsis.Shorten(entry.Filename, maxLength)
		}
		if wantTotals {
			sum.Crlf += entry.Crlf
			sum.Lf += entry.Lf
			sum.Tab += entry.Tab
			sum.Nul += entry.Nul
			sum.Bom8 += entry.Bom8
			sum.Bom16 += entry.Bom16
			sum.BytesRead += entry.BytesRead
		}
		table.Append(buildRow(label, entry))
	}

	if wantTotals {
		table.Append(buildRow(fmt.Sprintf("TOTALS: %d files", len(allStats)), sum))
	}

	table.Render()
	return out.Flush()
}
// OutputFailedFileList prints, one per line on STDOUT, the filename of every
// entry in allStats whose Failure flag is set (used with the -F cmd line
// option). Entries without the flag are skipped; an empty list prints nothing.
func OutputFailedFileList(allStats []SpecialChars) {
	for i := range allStats {
		if !allStats[i].Failure {
			continue
		}
		_, _ = fmt.Println(allStats[i].Filename)
	}
}
// GetJSON returns allStats encoded as an indented JSON array.
//
// A nil slice yields the empty string rather than the JSON literal "null".
// If encoding fails the error is reported through log.Fatal.
func GetJSON(allStats []SpecialChars) string {
	// A nil slice is the only input that marshals to "null"; report it as no
	// output at all.
	if allStats == nil {
		return ""
	}

	encoded, err := json.MarshalIndent(allStats, "", " ")
	if err != nil {
		log.Fatal(err)
	}
	return string(encoded)
}
// ProcessGlob expands globArg into a list of files and hands that list to
// ProcessFileList, returning its result.
//
// The pattern is first passed through CaseInsensitive; if that yields an empty
// string for a non-empty globArg, globArg itself is used. A glob error is
// reported on STDERR. When the pattern matches nothing, the pattern itself is
// processed as if it were a filename.
func ProcessGlob(globArg string, allStats *[]SpecialChars, examineBinary bool, excludeMatched *regexp.Regexp, fail string) uint64 {
	pattern := CaseInsensitive(globArg)
	if pattern == "" && globArg != "" {
		pattern = globArg
	}

	matches, err := filepath.Glob(pattern)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "error: %s\n", err)
	}
	if len(matches) == 0 {
		matches = []string{pattern}
	}

	return ProcessFileList(matches, allStats, examineBinary, excludeMatched, fail)
}
// ProcessFileList scans every regular file named in globFiles and appends the
// result of each successful scan to *allStats.
//
// Names that cannot be stat'ed, directories, and names matching the optional
// excludeMatched expression are skipped without a message. A file that cannot
// be opened, or whose scan ends with a read error, is reported on STDERR and
// skipped; files skipped as unwanted binary are not reported.
//
// When fail is non-empty it is handed to GetFailures together with allStats
// and that result is returned; otherwise 0 is returned.
func ProcessFileList(globFiles []string, allStats *[]SpecialChars, examineBinary bool, excludeMatched *regexp.Regexp, fail string) uint64 {
	for _, path := range globFiles {
		// invalid files and directories are skipped
		info, statErr := os.Stat(path)
		if statErr != nil || info.IsDir() {
			continue
		}
		if excludeMatched != nil && excludeMatched.MatchString(path) {
			continue
		}

		fh, openErr := os.Open(path)
		if openErr != nil {
			_, _ = fmt.Fprintf(os.Stderr, "%s\n", openErr)
			continue
		}
		stats, charsErr := searchForSpecialChars(path, bufio.NewReader(fh), examineBinary)
		_ = fh.Close()

		switch charsErr.code {
		case 0:
			*allStats = append(*allStats, stats)
		case 1:
			// output error message except for an unwanted binary file
			_, _ = fmt.Fprintf(os.Stderr, "error #%d: %s\n", charsErr.code, charsErr.err)
		}
	}

	if fail == "" {
		return 0
	}
	return GetFailures(fail, allStats)
}
// ProcessStdin scans the stream on STDIN, labelled "STDIN", and appends the
// result to *allStats.
//
// If the scan reports a non-zero status, nothing is appended and that status
// is returned with a count of 0. Otherwise, when fail is non-empty, the count
// returned is the result of GetFailures for that list; it is 0 when fail is
// empty.
func ProcessStdin(allStats *[]SpecialChars, examineBinary bool, fail string) (uint64, CharsError) {
	stats, charsErr := searchForSpecialChars("STDIN", bufio.NewReader(os.Stdin), examineBinary)
	if charsErr.code != 0 {
		return 0, charsErr
	}
	*allStats = append(*allStats, stats)

	var failures uint64
	if fail != "" {
		failures = GetFailures(fail, allStats)
	}
	return failures, charsErr
}
// GetFailures counts, across all entries of *allStats, the occurrences of the
// character classes named in commaList and flags the offending entries.
//
// commaList is a comma-delimited, case-insensitive list of class names; the
// recognised names are "crlf", "lf", "tab", "bom8", "bom16" and "nul".
// Whitespace around a name is ignored. An unrecognised name is reported once
// on STDERR and otherwise ignored.
//
// Every entry that contains at least one character of a requested class gets
// its Failure field set to true. The return value is the sum of the matching
// counts over all entries. A nil allStats yields 0.
func GetFailures(commaList string, allStats *[]SpecialChars) uint64 {
	if allStats == nil {
		return 0
	}

	// Resolve the requested classes once, up front, so that an unknown name is
	// reported a single time rather than once per scanned file.
	var counters []func(*SpecialChars) uint64
	for _, class := range strings.Split(strings.ToLower(commaList), ",") {
		class = strings.TrimSpace(class)
		switch class {
		case "crlf":
			counters = append(counters, func(s *SpecialChars) uint64 { return s.Crlf })
		case "lf":
			counters = append(counters, func(s *SpecialChars) uint64 { return s.Lf })
		case "tab":
			counters = append(counters, func(s *SpecialChars) uint64 { return s.Tab })
		case "bom8":
			counters = append(counters, func(s *SpecialChars) uint64 { return s.Bom8 })
		case "bom16":
			counters = append(counters, func(s *SpecialChars) uint64 { return s.Bom16 })
		case "nul":
			counters = append(counters, func(s *SpecialChars) uint64 { return s.Nul })
		default:
			_, _ = fmt.Fprintf(os.Stderr, "Unknown character passed to -f: %s\n", class)
		}
	}

	var totalFailures uint64
	for i := range *allStats {
		// Work on the element in place so the Failure flag is visible to the caller.
		entry := &(*allStats)[i]

		var failed uint64
		for _, count := range counters {
			failed += count(entry)
		}
		if failed > 0 {
			totalFailures += failed
			entry.Failure = true
		}
	}
	return totalFailures
}