Files
snapd/jsonutil/safejson/safejson.go

203 lines
5.1 KiB
Go
Raw Permalink Blame History

// -*- Mode: Go; indent-tabs-mode: t -*-
/*
* Copyright (C) 2018 Canonical Ltd
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 3 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package safejson
import (
"fmt"
"strconv"
"unicode"
"unicode/utf16"
"unicode/utf8"
"github.com/snapcore/snapd/strutil"
)
// String accepts any valid JSON string and keeps a sanitized copy of it,
// suitable for a short descriptive text. The sanitizing is done while
// unmarshalling: Cc, Co, Cf, Cs, noncharacters, and U+FFFD (the
// replacement character) are removed. Use Clean to get the result.
type String struct {
// s is the already-sanitized text as stored by UnmarshalJSON.
s string
}
// UnmarshalJSON decodes the JSON string (or null) in in, stripping
// unwanted characters on the way, and stores the outcome in the receiver.
// Newlines are not kept. The stored value is overwritten with whatever
// unmarshal returned, even when an error is reported.
func (str *String) UnmarshalJSON(in []byte) (err error) {
	var cleaned string
	cleaned, err = unmarshal(in, uOpt{})
	str.s = cleaned
	return err
}
// Clean returns the string, with Cc, Co, Cf, Cs, noncharacters,
// and U+FFFD (the replacement character) removed. The removal itself
// happens in UnmarshalJSON; Clean only hands back the stored result.
func (str String) Clean() string {
return str.s
}
// Paragraph accepts any valid JSON string and keeps a sanitized copy of
// it, suitable for a long descriptive text. The sanitizing is done while
// unmarshalling: Cc (except for \n), Co, Cf, Cs, noncharacters, and
// U+FFFD (the replacement character) are removed. Use Clean to get the
// result.
type Paragraph struct {
// s is the already-sanitized text as stored by UnmarshalJSON.
s string
}
// UnmarshalJSON decodes the JSON string (or null) in in, stripping
// unwanted characters on the way but keeping newlines, and stores the
// outcome in the receiver. The stored value is overwritten with whatever
// unmarshal returned, even when an error is reported.
func (par *Paragraph) UnmarshalJSON(in []byte) (err error) {
	var cleaned string
	cleaned, err = unmarshal(in, uOpt{nlOK: true})
	par.s = cleaned
	return err
}
// Clean returns the string, with Cc minus \n, Co, Cf, Cs, noncharacters,
// and U+FFFD (the replacement character) removed. The removal itself
// happens in UnmarshalJSON; Clean only hands back the stored result.
func (par Paragraph) Clean() string {
return par.s
}
// unescapeUCS2 decodes the six-byte \uXXXX escape found at the very start
// of in and returns the 16-bit code unit it spells out. Anything after
// those six bytes is ignored. It returns (-1, false) when in is too
// short, does not begin with \u, or the four characters after \u are not
// all hexadecimal digits.
func unescapeUCS2(in []byte) (rune, bool) {
	const escLen = len(`\uXXXX`)
	hasPrefix := len(in) >= escLen && in[0] == '\\' && in[1] == 'u'
	if !hasPrefix {
		return -1, false
	}
	hex := string(in[2:escLen])
	if code, err := strconv.ParseUint(hex, 16, 32); err == nil {
		return rune(code), true
	}
	return -1, false
}
// uOpt tunes how unmarshal treats its input.
type uOpt struct {
// nlOK makes unmarshal keep newlines (written as \n or as the
// equivalent \u escape) instead of dropping them.
nlOK bool
// simple makes unmarshal reject, with an error, any string that is not
// made up solely of printable ASCII (0x20..0x7e) free of backslashes
// and double quotes.
simple bool
}
// unmarshal decodes the JSON string literal in, surrounding double quotes
// included, and returns its contents with unwanted characters removed.
//
// A JSON null is accepted and yields the empty string; any other input
// that is not wrapped in double quotes is an error. So are a raw control
// byte (< 0x20), an unescaped double quote, a trailing backslash, an
// unknown escape, a malformed \u escape, and a \u surrogate that is not
// directly followed by another well-formed \u escape.
//
// Silently dropped from the output, whether written literally or as a \u
// escape: runes up to U+009F that are not printable ASCII, U+FFFD, and
// any rune in the strutil.Ctrl range table. The \b, \f, \r and \t escapes
// are dropped as well. Newlines (\n, or the equivalent \u escape) are kept
// only when o.nlOK is set. Surrogates that do not form a valid pair are
// dropped too, without swallowing a following ordinary \u escape.
//
// When o.simple is set, only strings made up entirely of printable ASCII
// without backslashes or double quotes are accepted; anything else is an
// error.
func unmarshal(in []byte, o uOpt) (string, error) {
	// heavily based on (inspired by?) unquoteBytes from encoding/json
	if len(in) < 2 || in[0] != '"' || in[len(in)-1] != '"' {
		// maybe it's a null and that's alright
		if string(in) == "null" {
			return "", nil
		}
		return "", fmt.Errorf("missing string delimiters: %q", in)
	}
	// prune the quotes
	in = in[1 : len(in)-1]
	i := 0
	// try the fast track: a string of plain printable ASCII needs no work
	for i < len(in) {
		// 0x00..0x1f is the first of Cc
		// 0x20..0x7e is all of printable ASCII (minus control chars)
		if in[i] < 0x20 || in[i] > 0x7e || in[i] == '\\' || in[i] == '"' {
			break
		}
		i++
	}
	if i == len(in) {
		// wee
		return string(in), nil
	}
	if o.simple {
		return "", fmt.Errorf("character %q in string %q unsupported for this value", in[i], in)
	}
	// in[i] is the first problematic one; everything before it is kept as is
	out := make([]byte, i, len(in)+2*utf8.UTFMax)
	copy(out, in)
	var r, r2 rune
	var n int
	var c byte
	var ubuf [utf8.UTFMax]byte
	var ok bool
	for i < len(in) {
		c = in[i]
		switch {
		case c == '"':
			return "", fmt.Errorf("unexpected unescaped quote at %d in \"%s\"", i, in)
		case c < 0x20:
			return "", fmt.Errorf("unexpected control character at %d in %q", i, in)
		case c == '\\':
			// handle escapes
			i++
			if i == len(in) {
				return "", fmt.Errorf("unexpected end of string (trailing backslash) in \"%s\"", in)
			}
			switch in[i] {
			case 'u':
				// oh dear, a unicode wotsit
				r, ok = unescapeUCS2(in[i-1:])
				if !ok {
					x := in[i-1:]
					if len(x) > 6 {
						x = x[:6]
					}
					return "", fmt.Errorf(`badly formed \u escape %q at %d of "%s"`, x, i, in)
				}
				// i was on the 'u'; step over it and the four hex digits
				i += 5
				if utf16.IsSurrogate(r) {
					// sigh: a surrogate needs a second \u escape right after it
					r2, ok = unescapeUCS2(in[i:])
					if !ok {
						x := in[i:]
						if len(x) > 6 {
							x = x[:6]
						}
						return "", fmt.Errorf(`badly formed \u escape %q at %d of "%s"`, x, i, in)
					}
					dec := utf16.DecodeRune(r, r2)
					switch {
					case dec != unicode.ReplacementChar:
						// a proper high+low pair: both escapes make one rune
						i += 6
						r = dec
					case utf16.IsSurrogate(r2):
						// two surrogates that don't pair up: drop them both
						i += 6
						r = unicode.ReplacementChar
					default:
						// r is unpaired but r2 is an ordinary character:
						// drop only r, and leave the second escape in place
						// so the next iteration decodes it on its own
						r = unicode.ReplacementChar
					}
				}
				if r <= 0x9f {
					// up here it's either ASCII or Cc; of those only
					// printable ASCII (and \n, if allowed) is kept
					if (o.nlOK && r == '\n') || (r >= 0x20 && r <= 0x7e) {
						out = append(out, byte(r))
					}
				} else if r != unicode.ReplacementChar && !unicode.Is(strutil.Ctrl, r) {
					n = utf8.EncodeRune(ubuf[:], r)
					out = append(out, ubuf[:n]...)
				}
			case 'b', 'f', 'r', 't':
				// control characters: valid escapes, but not wanted in the output
				i++
			case 'n':
				if o.nlOK {
					out = append(out, '\n')
				}
				i++
			case '"', '/', '\\':
				// the spec says just ", / and \ can be backslash-escaped
				// but go adds ' to the list (in unquoteBytes)
				out = append(out, in[i])
				i++
			default:
				return "", fmt.Errorf(`unknown escape '%c' at %d of "%s"`, in[i], i, in)
			}
		case c <= 0x7e:
			// printable ASCII, except " or \
			out = append(out, c)
			i++
		default:
			// 0x7f or the start of a multi-byte sequence; invalid UTF-8
			// decodes to U+FFFD with n == 1 and is therefore dropped
			r, n = utf8.DecodeRune(in[i:])
			j := i + n
			if r > 0x9f && r != unicode.ReplacementChar && !unicode.Is(strutil.Ctrl, r) {
				out = append(out, in[i:j]...)
			}
			i = j
		}
	}
	return string(out), nil
}