scanner.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629
  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package json
  5. // JSON value parser state machine.
  6. // Just about at the limit of what is reasonable to write by hand.
  7. // Some parts are a bit tedious, but overall it nicely factors out the
  8. // otherwise common code from the multiple scanning functions
  9. // in this package (Compact, Indent, checkValid, NextValue, etc).
  10. //
  11. // This file starts with two simple examples using the scanner
  12. // before diving into the scanner itself.
  13. import "strconv"
  14. // checkValid verifies that data is valid JSON-encoded data.
  15. // scan is passed in for use by checkValid to avoid an allocation.
  16. func checkValid(data []byte, scan *Scanner) error {
  17. scan.Reset()
  18. for _, c := range data {
  19. scan.bytes++
  20. if scan.Step(scan, int(c)) == ScanError {
  21. return scan.Err
  22. }
  23. }
  24. if scan.EOF() == ScanError {
  25. return scan.Err
  26. }
  27. return nil
  28. }
  29. // Validate some alleged JSON. Return nil iff the JSON is valid.
  30. func Validate(data []byte) error {
  31. s := &Scanner{}
  32. return checkValid(data, s)
  33. }
  34. // NextValue splits data after the next whole JSON value,
  35. // returning that value and the bytes that follow it as separate slices.
  36. // scan is passed in for use by NextValue to avoid an allocation.
  37. func NextValue(data []byte, scan *Scanner) (value, rest []byte, err error) {
  38. scan.Reset()
  39. for i, c := range data {
  40. v := scan.Step(scan, int(c))
  41. if v >= ScanEnd {
  42. switch v {
  43. case ScanError:
  44. return nil, nil, scan.Err
  45. case ScanEnd:
  46. return data[0:i], data[i:], nil
  47. }
  48. }
  49. }
  50. if scan.EOF() == ScanError {
  51. return nil, nil, scan.Err
  52. }
  53. return data, nil, nil
  54. }
  55. // A SyntaxError is a description of a JSON syntax error.
  56. type SyntaxError struct {
  57. msg string // description of error
  58. Offset int64 // error occurred after reading Offset bytes
  59. }
  60. func (e *SyntaxError) Error() string { return e.msg }
  61. // A Scanner is a JSON scanning state machine.
  62. // Callers call scan.Reset() and then pass bytes in one at a time
  63. // by calling scan.Step(&scan, c) for each byte.
  64. // The return value, referred to as an opcode, tells the
  65. // caller about significant parsing events like beginning
  66. // and ending literals, objects, and arrays, so that the
  67. // caller can follow along if it wishes.
  68. // The return value ScanEnd indicates that a single top-level
  69. // JSON value has been completed, *before* the byte that
  70. // just got passed in. (The indication must be delayed in order
  71. // to recognize the end of numbers: is 123 a whole value or
  72. // the beginning of 12345e+6?).
  73. type Scanner struct {
  74. // The step is a func to be called to execute the next transition.
  75. // Also tried using an integer constant and a single func
  76. // with a switch, but using the func directly was 10% faster
  77. // on a 64-bit Mac Mini, and it's nicer to read.
  78. Step func(*Scanner, int) int
  79. // Reached end of top-level value.
  80. endTop bool
  81. // Stack of what we're in the middle of - array values, object keys, object values.
  82. parseState []int
  83. // Error that happened, if any.
  84. Err error
  85. // 1-byte redo (see undo method)
  86. redo bool
  87. redoCode int
  88. redoState func(*Scanner, int) int
  89. // total bytes consumed, updated by decoder.Decode
  90. bytes int64
  91. }
  92. // These values are returned by the state transition functions
  93. // assigned to Scanner.state and the method Scanner.EOF.
  94. // They give details about the current state of the scan that
  95. // callers might be interested to know about.
  96. // It is okay to ignore the return value of any particular
  97. // call to Scanner.state: if one call returns ScanError,
  98. // every subsequent call will return ScanError too.
  99. const (
  100. // Continue.
  101. ScanContinue = iota // uninteresting byte
  102. ScanBeginLiteral // end implied by next result != scanContinue
  103. ScanBeginObject // begin object
  104. ScanObjectKey // just finished object key (string)
  105. ScanObjectValue // just finished non-last object value
  106. ScanEndObject // end object (implies scanObjectValue if possible)
  107. ScanBeginArray // begin array
  108. ScanArrayValue // just finished array value
  109. ScanEndArray // end array (implies scanArrayValue if possible)
  110. ScanSkipSpace // space byte; can skip; known to be last "continue" result
  111. // Stop.
  112. ScanEnd // top-level value ended *before* this byte; known to be first "stop" result
  113. ScanError // hit an error, Scanner.err.
  114. )
  115. // These values are stored in the parseState stack.
  116. // They give the current state of a composite value
  117. // being scanned. If the parser is inside a nested value
  118. // the parseState describes the nested state, outermost at entry 0.
  119. const (
  120. parseObjectKey = iota // parsing object key (before colon)
  121. parseObjectValue // parsing object value (after colon)
  122. parseArrayValue // parsing array value
  123. )
  124. // reset prepares the scanner for use.
  125. // It must be called before calling s.step.
  126. func (s *Scanner) Reset() {
  127. s.Step = stateBeginValue
  128. s.parseState = s.parseState[0:0]
  129. s.Err = nil
  130. s.redo = false
  131. s.endTop = false
  132. }
  133. // EOF tells the scanner that the end of input has been reached.
  134. // It returns a scan status just as s.step does.
  135. func (s *Scanner) EOF() int {
  136. if s.Err != nil {
  137. return ScanError
  138. }
  139. if s.endTop {
  140. return ScanEnd
  141. }
  142. s.Step(s, ' ')
  143. if s.endTop {
  144. return ScanEnd
  145. }
  146. if s.Err == nil {
  147. s.Err = &SyntaxError{"unexpected end of JSON input", s.bytes}
  148. }
  149. return ScanError
  150. }
  151. // pushParseState pushes a new parse state p onto the parse stack.
  152. func (s *Scanner) pushParseState(p int) {
  153. s.parseState = append(s.parseState, p)
  154. }
  155. // popParseState pops a parse state (already obtained) off the stack
  156. // and updates s.step accordingly.
  157. func (s *Scanner) popParseState() {
  158. n := len(s.parseState) - 1
  159. s.parseState = s.parseState[0:n]
  160. s.redo = false
  161. if n == 0 {
  162. s.Step = stateEndTop
  163. s.endTop = true
  164. } else {
  165. s.Step = stateEndValue
  166. }
  167. }
  168. func isSpace(c rune) bool {
  169. return c == ' ' || c == '\t' || c == '\r' || c == '\n'
  170. }
  171. // stateBeginValueOrEmpty is the state after reading `[`.
  172. func stateBeginValueOrEmpty(s *Scanner, c int) int {
  173. if c <= ' ' && isSpace(rune(c)) {
  174. return ScanSkipSpace
  175. }
  176. if c == ']' {
  177. return stateEndValue(s, c)
  178. }
  179. return stateBeginValue(s, c)
  180. }
  181. // stateBeginValue is the state at the beginning of the input.
  182. func stateBeginValue(s *Scanner, c int) int {
  183. if c <= ' ' && isSpace(rune(c)) {
  184. return ScanSkipSpace
  185. }
  186. switch c {
  187. case '{':
  188. s.Step = stateBeginStringOrEmpty
  189. s.pushParseState(parseObjectKey)
  190. return ScanBeginObject
  191. case '[':
  192. s.Step = stateBeginValueOrEmpty
  193. s.pushParseState(parseArrayValue)
  194. return ScanBeginArray
  195. case '"':
  196. s.Step = stateInString
  197. return ScanBeginLiteral
  198. case '-':
  199. s.Step = stateNeg
  200. return ScanBeginLiteral
  201. case '0': // beginning of 0.123
  202. s.Step = state0
  203. return ScanBeginLiteral
  204. case 't': // beginning of true
  205. s.Step = stateT
  206. return ScanBeginLiteral
  207. case 'f': // beginning of false
  208. s.Step = stateF
  209. return ScanBeginLiteral
  210. case 'n': // beginning of null
  211. s.Step = stateN
  212. return ScanBeginLiteral
  213. }
  214. if '1' <= c && c <= '9' { // beginning of 1234.5
  215. s.Step = state1
  216. return ScanBeginLiteral
  217. }
  218. return s.error(c, "looking for beginning of value")
  219. }
  220. // stateBeginStringOrEmpty is the state after reading `{`.
  221. func stateBeginStringOrEmpty(s *Scanner, c int) int {
  222. if c <= ' ' && isSpace(rune(c)) {
  223. return ScanSkipSpace
  224. }
  225. if c == '}' {
  226. n := len(s.parseState)
  227. s.parseState[n-1] = parseObjectValue
  228. return stateEndValue(s, c)
  229. }
  230. return stateBeginString(s, c)
  231. }
  232. // stateBeginString is the state after reading `{"key": value,`.
  233. func stateBeginString(s *Scanner, c int) int {
  234. if c <= ' ' && isSpace(rune(c)) {
  235. return ScanSkipSpace
  236. }
  237. if c == '"' {
  238. s.Step = stateInString
  239. return ScanBeginLiteral
  240. }
  241. return s.error(c, "looking for beginning of object key string")
  242. }
  243. // stateEndValue is the state after completing a value,
  244. // such as after reading `{}` or `true` or `["x"`.
  245. func stateEndValue(s *Scanner, c int) int {
  246. n := len(s.parseState)
  247. if n == 0 {
  248. // Completed top-level before the current byte.
  249. s.Step = stateEndTop
  250. s.endTop = true
  251. return stateEndTop(s, c)
  252. }
  253. if c <= ' ' && isSpace(rune(c)) {
  254. s.Step = stateEndValue
  255. return ScanSkipSpace
  256. }
  257. ps := s.parseState[n-1]
  258. switch ps {
  259. case parseObjectKey:
  260. if c == ':' {
  261. s.parseState[n-1] = parseObjectValue
  262. s.Step = stateBeginValue
  263. return ScanObjectKey
  264. }
  265. return s.error(c, "after object key")
  266. case parseObjectValue:
  267. if c == ',' {
  268. s.parseState[n-1] = parseObjectKey
  269. s.Step = stateBeginString
  270. return ScanObjectValue
  271. }
  272. if c == '}' {
  273. s.popParseState()
  274. return ScanEndObject
  275. }
  276. return s.error(c, "after object key:value pair")
  277. case parseArrayValue:
  278. if c == ',' {
  279. s.Step = stateBeginValue
  280. return ScanArrayValue
  281. }
  282. if c == ']' {
  283. s.popParseState()
  284. return ScanEndArray
  285. }
  286. return s.error(c, "after array element")
  287. }
  288. return s.error(c, "")
  289. }
  290. // stateEndTop is the state after finishing the top-level value,
  291. // such as after reading `{}` or `[1,2,3]`.
  292. // Only space characters should be seen now.
  293. func stateEndTop(s *Scanner, c int) int {
  294. if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
  295. // Complain about non-space byte on next call.
  296. s.error(c, "after top-level value")
  297. }
  298. return ScanEnd
  299. }
  300. // stateInString is the state after reading `"`.
  301. func stateInString(s *Scanner, c int) int {
  302. if c == '"' {
  303. s.Step = stateEndValue
  304. return ScanContinue
  305. }
  306. if c == '\\' {
  307. s.Step = stateInStringEsc
  308. return ScanContinue
  309. }
  310. if c < 0x20 {
  311. return s.error(c, "in string literal")
  312. }
  313. return ScanContinue
  314. }
  315. // stateInStringEsc is the state after reading `"\` during a quoted string.
  316. func stateInStringEsc(s *Scanner, c int) int {
  317. switch c {
  318. case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
  319. s.Step = stateInString
  320. return ScanContinue
  321. }
  322. if c == 'u' {
  323. s.Step = stateInStringEscU
  324. return ScanContinue
  325. }
  326. return s.error(c, "in string escape code")
  327. }
  328. // stateInStringEscU is the state after reading `"\u` during a quoted string.
  329. func stateInStringEscU(s *Scanner, c int) int {
  330. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  331. s.Step = stateInStringEscU1
  332. return ScanContinue
  333. }
  334. // numbers
  335. return s.error(c, "in \\u hexadecimal character escape")
  336. }
  337. // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
  338. func stateInStringEscU1(s *Scanner, c int) int {
  339. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  340. s.Step = stateInStringEscU12
  341. return ScanContinue
  342. }
  343. // numbers
  344. return s.error(c, "in \\u hexadecimal character escape")
  345. }
  346. // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
  347. func stateInStringEscU12(s *Scanner, c int) int {
  348. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  349. s.Step = stateInStringEscU123
  350. return ScanContinue
  351. }
  352. // numbers
  353. return s.error(c, "in \\u hexadecimal character escape")
  354. }
  355. // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
  356. func stateInStringEscU123(s *Scanner, c int) int {
  357. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  358. s.Step = stateInString
  359. return ScanContinue
  360. }
  361. // numbers
  362. return s.error(c, "in \\u hexadecimal character escape")
  363. }
  364. // stateNeg is the state after reading `-` during a number.
  365. func stateNeg(s *Scanner, c int) int {
  366. if c == '0' {
  367. s.Step = state0
  368. return ScanContinue
  369. }
  370. if '1' <= c && c <= '9' {
  371. s.Step = state1
  372. return ScanContinue
  373. }
  374. return s.error(c, "in numeric literal")
  375. }
  376. // state1 is the state after reading a non-zero integer during a number,
  377. // such as after reading `1` or `100` but not `0`.
  378. func state1(s *Scanner, c int) int {
  379. if '0' <= c && c <= '9' {
  380. s.Step = state1
  381. return ScanContinue
  382. }
  383. return state0(s, c)
  384. }
  385. // state0 is the state after reading `0` during a number.
  386. func state0(s *Scanner, c int) int {
  387. if c == '.' {
  388. s.Step = stateDot
  389. return ScanContinue
  390. }
  391. if c == 'e' || c == 'E' {
  392. s.Step = stateE
  393. return ScanContinue
  394. }
  395. return stateEndValue(s, c)
  396. }
  397. // stateDot is the state after reading the integer and decimal point in a number,
  398. // such as after reading `1.`.
  399. func stateDot(s *Scanner, c int) int {
  400. if '0' <= c && c <= '9' {
  401. s.Step = stateDot0
  402. return ScanContinue
  403. }
  404. return s.error(c, "after decimal point in numeric literal")
  405. }
  406. // stateDot0 is the state after reading the integer, decimal point, and subsequent
  407. // digits of a number, such as after reading `3.14`.
  408. func stateDot0(s *Scanner, c int) int {
  409. if '0' <= c && c <= '9' {
  410. s.Step = stateDot0
  411. return ScanContinue
  412. }
  413. if c == 'e' || c == 'E' {
  414. s.Step = stateE
  415. return ScanContinue
  416. }
  417. return stateEndValue(s, c)
  418. }
  419. // stateE is the state after reading the mantissa and e in a number,
  420. // such as after reading `314e` or `0.314e`.
  421. func stateE(s *Scanner, c int) int {
  422. if c == '+' {
  423. s.Step = stateESign
  424. return ScanContinue
  425. }
  426. if c == '-' {
  427. s.Step = stateESign
  428. return ScanContinue
  429. }
  430. return stateESign(s, c)
  431. }
  432. // stateESign is the state after reading the mantissa, e, and sign in a number,
  433. // such as after reading `314e-` or `0.314e+`.
  434. func stateESign(s *Scanner, c int) int {
  435. if '0' <= c && c <= '9' {
  436. s.Step = stateE0
  437. return ScanContinue
  438. }
  439. return s.error(c, "in exponent of numeric literal")
  440. }
  441. // stateE0 is the state after reading the mantissa, e, optional sign,
  442. // and at least one digit of the exponent in a number,
  443. // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
  444. func stateE0(s *Scanner, c int) int {
  445. if '0' <= c && c <= '9' {
  446. s.Step = stateE0
  447. return ScanContinue
  448. }
  449. return stateEndValue(s, c)
  450. }
  451. // stateT is the state after reading `t`.
  452. func stateT(s *Scanner, c int) int {
  453. if c == 'r' {
  454. s.Step = stateTr
  455. return ScanContinue
  456. }
  457. return s.error(c, "in literal true (expecting 'r')")
  458. }
  459. // stateTr is the state after reading `tr`.
  460. func stateTr(s *Scanner, c int) int {
  461. if c == 'u' {
  462. s.Step = stateTru
  463. return ScanContinue
  464. }
  465. return s.error(c, "in literal true (expecting 'u')")
  466. }
  467. // stateTru is the state after reading `tru`.
  468. func stateTru(s *Scanner, c int) int {
  469. if c == 'e' {
  470. s.Step = stateEndValue
  471. return ScanContinue
  472. }
  473. return s.error(c, "in literal true (expecting 'e')")
  474. }
  475. // stateF is the state after reading `f`.
  476. func stateF(s *Scanner, c int) int {
  477. if c == 'a' {
  478. s.Step = stateFa
  479. return ScanContinue
  480. }
  481. return s.error(c, "in literal false (expecting 'a')")
  482. }
  483. // stateFa is the state after reading `fa`.
  484. func stateFa(s *Scanner, c int) int {
  485. if c == 'l' {
  486. s.Step = stateFal
  487. return ScanContinue
  488. }
  489. return s.error(c, "in literal false (expecting 'l')")
  490. }
  491. // stateFal is the state after reading `fal`.
  492. func stateFal(s *Scanner, c int) int {
  493. if c == 's' {
  494. s.Step = stateFals
  495. return ScanContinue
  496. }
  497. return s.error(c, "in literal false (expecting 's')")
  498. }
  499. // stateFals is the state after reading `fals`.
  500. func stateFals(s *Scanner, c int) int {
  501. if c == 'e' {
  502. s.Step = stateEndValue
  503. return ScanContinue
  504. }
  505. return s.error(c, "in literal false (expecting 'e')")
  506. }
  507. // stateN is the state after reading `n`.
  508. func stateN(s *Scanner, c int) int {
  509. if c == 'u' {
  510. s.Step = stateNu
  511. return ScanContinue
  512. }
  513. return s.error(c, "in literal null (expecting 'u')")
  514. }
  515. // stateNu is the state after reading `nu`.
  516. func stateNu(s *Scanner, c int) int {
  517. if c == 'l' {
  518. s.Step = stateNul
  519. return ScanContinue
  520. }
  521. return s.error(c, "in literal null (expecting 'l')")
  522. }
  523. // stateNul is the state after reading `nul`.
  524. func stateNul(s *Scanner, c int) int {
  525. if c == 'l' {
  526. s.Step = stateEndValue
  527. return ScanContinue
  528. }
  529. return s.error(c, "in literal null (expecting 'l')")
  530. }
  531. // stateError is the state after reaching a syntax error,
  532. // such as after reading `[1}` or `5.1.2`.
  533. func stateError(s *Scanner, c int) int {
  534. return ScanError
  535. }
  536. // error records an error and switches to the error state.
  537. func (s *Scanner) error(c int, context string) int {
  538. s.Step = stateError
  539. s.Err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
  540. return ScanError
  541. }
  542. // quoteChar formats c as a quoted character literal
  543. func quoteChar(c int) string {
  544. // special cases - different from quoted strings
  545. if c == '\'' {
  546. return `'\''`
  547. }
  548. if c == '"' {
  549. return `'"'`
  550. }
  551. // use quoted string with different quotation marks
  552. s := strconv.Quote(string(c))
  553. return "'" + s[1:len(s)-1] + "'"
  554. }
  555. // undo causes the scanner to return scanCode from the next state transition.
  556. // This gives callers a simple 1-byte undo mechanism.
  557. func (s *Scanner) undo(scanCode int) {
  558. if s.redo {
  559. panic("json: invalid use of scanner")
  560. }
  561. s.redoCode = scanCode
  562. s.redoState = s.Step
  563. s.Step = stateRedo
  564. s.redo = true
  565. }
  566. // stateRedo helps implement the scanner's 1-byte undo.
  567. func stateRedo(s *Scanner, c int) int {
  568. s.redo = false
  569. s.Step = s.redoState
  570. return s.redoCode
  571. }