Coverage report: /development/source/library/org/datagraph/spocq-shard/src/core/encoding/ssf-sparql-lexer.lisp
| Kind | Covered | All | % |
| expression | 934 | 1297 | 72.0 |
| branch | 110 | 162 | 67.9 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; -*- Mode: lisp; Syntax: ansi-common-lisp; Base: 10; Package: org.datagraph.spocq.implementation; -*-
3
(in-package :org.datagraph.spocq.implementation)
5
;;; !!! todo : import the lower case spocq.a terms into spocq.s and use them directly
9
;; make sure these wilbur artefacts are gone
10
(eval-when (:load-toplevel :compile-toplevel :execute)
11
;; not legal (set-macro-character #\! nil t)
12
(set-syntax-from-char #\! #\a))
16
(eval-when (:load-toplevel :compile-toplevel :execute)
17
(defparameter *sparql-terminal-keywords*
140
(defparameter *case-insensitive-keywords* nil)
141
(defparameter *case-sensitive-keywords* nil)
142
(defparameter *terminal-punctuation* nil)
143
(flet ((exported (name)
144
(let ((symbol (intern name :spocq.s)))
145
(export symbol :spocq.s)
147
(setq *terminal-punctuation*
149
'( "_:" "||" "&&" "^^" ">=" "<=" "<<" ">>"
150
"(" ")" "{" "}" "." "," "+" "-" "[" "]"
151
;; ":" left integral to names to reduce syntax ambiguity
152
"<" ">" "=" "!=" ";" "?" "$" "/" "*" "!" "|" "^")))
153
(setq *case-insensitive-keywords*
154
(sort (mapcar #'exported *sparql-terminal-keywords*)
155
#'> :key #'(lambda (s) (length (symbol-name s)))))
156
(setq *case-sensitive-keywords*
157
(mapcar #'exported '("a")))))
160
(defun find-builtin-operator (symbol)
161
(or (find-symbol (string symbol) :spocq.a)
162
(find-symbol (string-downcase symbol) :spocq.a)))
164
(defun prefix-namespace (prefix)
165
(or (rest (assoc prefix (namespace-bindings) :test #'string-equal))
166
;; this was to allow an implicit null prefix to permit second parse of, eg. 't : byte'
167
;; in the case where qualified names were tokenized as components. no longer needed
168
;; (when (equal prefix "") "")
169
(restart-case (spocq.e:unbound-prefix-error :expression prefix)
170
(use-value (namespace-namestring)
171
(setf (prefix-namespace prefix) namespace-namestring)))))
174
(defun (setf prefix-namespace) (namestring prefix)
175
(setf (namespace-bindings)
176
(acons prefix namestring (namespace-bindings)))
179
(defun default-namespace ()
180
(or (rest (assoc "" (namespace-bindings) :test #'string-equal))
181
(error "No default namespace defined.")))
183
(defun (setf default-namespace) (namestring)
184
(setf (prefix-namespace "") namestring))
186
;;; regular-expressions
188
(defparameter *langtag-scanner*
189
(cl-ppcre:create-scanner (cl-ppcre::maybe-coerce-to-simple-string "^@[a-zA-Z]+(-[a-zA-Z0-9]+)*$"))
190
"A regular expression scanner for the langtag bnf production")
192
(defparameter *language-tag-scanner*
193
(cl-ppcre:create-scanner (cl-ppcre::maybe-coerce-to-simple-string "^[a-zA-Z]+(-[a-zA-Z0-9]+)*$"))
194
"A regular expression scanner for the language tag proper")
196
(defparameter *langtag-initial-scanner*
197
(cl-ppcre:create-scanner "^@[a-zA-Z]+(-[a-zA-Z0-9]+)*"))
199
(defparameter *langdir-scanner*
200
(cl-ppcre:create-scanner (cl-ppcre::maybe-coerce-to-simple-string "^[a-zA-Z]+$"))
201
"A regular expression scanner for the language direction proper")
203
(defparameter *iri-scanner*
204
; very incomplete, but simple and sufficient to isolate them
205
;;(cl-ppcre:create-scanner "^[^<>\"{}\\s]*")
206
;; reproducting bnf production 139 (IRIREF)
207
(cl-ppcre:create-scanner
208
`(:sequence :start-anchor
209
(:greedy-repetition 0 nil (:inverted-char-class #\< #\> #\" #\{ #\} #\| #\^ #\` #\\ :whitespace-char-class (:range ,(code-char #x00) ,(code-char #x20))))))
213
(cl-ppcre:create-scanner
214
"(([^:/?#<>\"{}\\s]+):)?(//([^/?#<>\"{\\s}]*))?([^?#<>\"{}\\s]*)(\\?([^#<>\"{}\\s]*))?(#([^<>\\s]*))?"))
216
(defparameter *terminated-iri-scanner*
217
;; reproducting bnf production 139 (IRIREF)
218
;; but note, which rfc is clear, the rdf abstract syntax (6.4) claims compatibility with the xml schema spec which allows whatever one wants
219
;; http://tools.ietf.org/html/rfc3986
220
;; http://www.w3.org/TR/rdf-concepts/
221
;; http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#anyURI
222
;; see, below, the alternative
223
(cl-ppcre:create-scanner `(:sequence :start-anchor
224
(:greedy-repetition 0 nil (:inverted-char-class #\< #\> #\" #\{ #\} #\| #\^ #\` #\\ :whitespace-char-class (:range ,(code-char #x00) ,(code-char #x20))))
227
(defparameter *terminated-iri-scanner*
228
; very incomplete, but simple and sufficient to isolate them
229
(cl-ppcre:create-scanner "^([^<>\"{}\\s])*>"))
231
(defun terminated-iri-scanner (string start end)
232
(cl-ppcre:scan *terminated-iri-scanner* string :start start :end end))
234
(defun iri-scanner (string start end)
235
(cl-ppcre:scan *iri-scanner* string :start start :end end))
237
(defparameter *any-iri-scanner*
238
; very incomplete, but simple and sufficient to isolate them
239
(cl-ppcre:create-scanner "^[^>]*"))
241
(defparameter *any-terminated-iri-scanner*
242
;; http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#anyURI
244
(cl-ppcre:create-scanner `(:sequence :start-anchor
245
(:greedy-repetition 0 nil (:inverted-char-class #\>))
248
(defparameter *http-url-namestring-scanner*
249
(cl-ppcre:create-scanner
250
`(:sequence :start-anchor
252
(:greedy-repetition 0 nil (:inverted-char-class #\< #\> #\" #\{ #\} #\| #\^ #\` #\\ :whitespace-char-class (:range ,(code-char #x00) ,(code-char #x20)))))))
254
(defun http-url-namestring-scanner (string start end)
255
(cl-ppcre:scan *http-url-namestring-scanner* string :start start :end end))
257
(setf (cl-ppcre:parse-tree-synonym 'uuid)
258
(cl-ppcre:parse-string "[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}"))
259
(defparameter *uuid-scanner*
260
(cl-ppcre:create-scanner '(:sequence :start-anchor uuid :end-anchor)))
262
(defparameter *uuid-scanner*
263
(cl-ppcre:create-scanner "[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}"))
266
(defparameter *decimal-scanner*
267
(cl-ppcre:create-scanner "^[\\+\\-]?(([0-9]+\\.[0-9]+)|(\\.[0-9]+))$")
268
"The pattern now precludes a terminating '.' and requires a succeeding digit,
269
as per [http://lists.w3.org/Archives/Public/public-rdf-dawg/2012JanMar/0161.html].")
271
(defparameter *decimal-initial-scanner*
272
(cl-ppcre:create-scanner "^[\\+\\-]?(([0-9]+\\.[0-9]+)|(\\.[0-9]+))"))
274
(defparameter *float-scanner*
275
(cl-ppcre:create-scanner "^[\\+\\-]?(([0-9]+\\.[0-9]+)|(\\.[0-9]+)|([0-9]+))[eE][+-]?[0-9]+$"))
277
(defparameter *float-initial-scanner*
278
(cl-ppcre:create-scanner "^[\\+\\-]?(([0-9]+\\.[0-9]+)|(\\.[0-9]+)|([0-9]+))[eE][+-]?[0-9]+"))
281
(setf (cl-ppcre:parse-tree-synonym 'integer)
282
(cl-ppcre:parse-string "[\\+\\-]?[0-9]+"))
284
(defparameter *integer-scanner*
285
(cl-ppcre:create-scanner (coerce "^[\\+\\-]?[0-9]+$" 'simple-string)))
287
(defparameter *integer-initial-scanner*
288
(cl-ppcre:create-scanner "^[\\+\\-]?[0-9]+"))
294
(defun is-float-string (string)
295
(when (stringp string)
296
(multiple-value-bind (start end)
297
(cl-ppcre:scan *float-scanner* string)
298
(and start (= end (length string))))))
300
(defun is-integer-string (string)
301
(when (stringp string)
302
(multiple-value-bind (start end)
303
(cl-ppcre:scan *integer-scanner* string)
304
(and start (zerop start) (= end (length string))))))
306
(defun is-decimal-string (string)
307
(when (stringp string)
308
(multiple-value-bind (start end)
309
(cl-ppcre:scan *decimal-scanner* string)
310
(and start (zerop start) (= end (length string))))))
313
(defun is-integer_positive (token) (integerp token))
314
(defun is-integer_negative (token) (integerp token))
315
(defun is-integer (token) (integerp token))
317
(defun is-decimal (token) (rationalp token))
318
(defun is-decimal_negative (token) (rationalp token))
319
(defun is-decimal_positive (token) (rationalp token))
321
(defun is-double (token) (floatp token))
322
(defun is-double_negative (token) (floatp token))
323
(defun is-double_positive (token) (floatp token))
325
(defun is-iri_namestring (string)
326
(when (and (stringp string) (not (gethash string *literal-string-tokens*)))
327
(multiple-value-bind (start end)
328
(cl-ppcre:scan *iri-scanner* string)
329
(and start (= end (length string))))))
330
;;; (is-iri_namestring "http://test.com")
332
(defparameter *iri-prefix-scanner*
333
(cl-ppcre:create-scanner "^[a-zA-Z][a-zA-Z0-9\\.\\-\\+]*://"))
334
;;; (cl-ppcre:scan *iri-prefix-scanner* "http://")
336
(defun is-absolute-iri-string (lexical-form)
337
(and (stringp lexical-form)
338
(or (when (cl-ppcre:scan *iri-prefix-scanner* lexical-form) t)
339
(loop with length = (length lexical-form)
340
for prefix in '("urn:")
341
when (string-equal prefix lexical-form :end2 (min length (length prefix)))
344
(defun is-http-url-namestring (lexical-form)
345
(and (stringp lexical-form)
346
(multiple-value-bind (start end)
347
(http-url-namestring-scanner lexical-form 0 (length lexical-form))
348
(and start (= end (length lexical-form))))))
351
(defun is-uuid-string (lexical-form &key (start 0))
352
(and (stringp lexical-form)
353
(when (cl-ppcre:scan *uuid-scanner* lexical-form :start start) t)))
355
(defun is-langtag (string)
356
(when (stringp string)
357
(multiple-value-bind (start end)
358
(cl-ppcre:scan *langtag-scanner* string)
359
(and start (= end (length string))))))
360
;;; (and (is-langtag "@en") (is-langtag "@en-en") (not (is-langtag "@-en")) (not (is-langtag "@en9")))
362
(defun is-language-tag (string)
363
(when (stringp string)
364
(multiple-value-bind (start end)
365
(cl-ppcre:scan *language-tag-scanner* string)
366
(and start (= end (length string))))))
368
;;; this needs to combine with logic in the reduction operators to construct a
369
;;; directional language tag and constraint the iri referenc eto be rdf:langString
370
(defun is-langdir (string)
371
(when (stringp string)
372
(multiple-value-bind (start end)
373
(cl-ppcre:scan *langdir-scanner* string)
374
(and start (= end (length string))))))
378
;;; restructured regular expressions
379
;;; the name syntax is reduced to PN_LOCAL, PNAME_LN, and PNAME_NS. this eliminates the distinct
380
;;; PN_PREFIX and ':' in order to eliminate the syntax ambiguity which permits them both in
381
;;; combination and as isolated term and ':' tokens.
382
;;; see http://weitz.de/cl-ppcre/
384
(let* ((pn_chars_base `(:char-class (:range #\A #\Z) (:range #\a #\z)
385
(:range ,(code-char #x00C0) ,(code-char #x00D6)) (:range ,(code-char #x00D8) ,(code-char #x00F6)) (:range ,(code-char #x00F8) ,(code-char #x02FF))
386
(:range ,(code-char #x0370) ,(code-char #x037D)) (:range ,(code-char #x037F) ,(code-char #x1FFF)) (:range ,(code-char #x200C) ,(code-char #x200D))
387
(:range ,(code-char #x2070) ,(code-char #x218F)) (:range ,(code-char #x2C00) ,(code-char #x2FEF)) (:range ,(code-char #x3001) ,(code-char #xDF77))
388
(:range ,(code-char #xF900) ,(code-char #xFDCF)) (:range ,(code-char #xFDF0) ,(code-char #xFFFD))
389
;,@(when (< #xEFFFF char-code-limit) (list (list :range (code-char #x10000) (code-char #xEFFFF))))
391
(pn_chars_u `(:alternation #\_ pn_chars_base))
392
(pn_chars `(:alternation pn_chars_u #\- (:char-class (:range #\0 #\9) ,(code-char #x00B7) (:range ,(code-char #x0300) ,(code-char #x036F)) (:range ,(code-char #x203F) ,(code-char #x2040)))))
393
(pn_prefix `(:sequence pn_chars_base (:greedy-repetition 0 1 (:sequence (:greedy-repetition 0 nil (:alternation pn_chars #\.)) pn_chars))))
394
(hex '(:char-class (:range #\0 #\9) (:range #\a #\f) (:range #\A #\F)))
395
(plx `(:alternation (:sequence #\% hex hex)
396
(:sequence #\\ (:alternation #\_ #\~ #\. #\- #\! #\$ #\& #\' #\( #\) #\* #\+ #\, #\; #\= #\/ #\? #\# #\@ #\%))))
397
(pn_local `(:sequence (:alternation pn_chars_u (:char-class (:range #\0 #\9)) #\: plx)
398
(:greedy-repetition 0 1 (:sequence (:greedy-repetition 0 nil (:alternation pn_chars #\. #\: plx))
399
(:alternation pn_chars #\: plx)))))
400
(pname_ns `(:sequence (:greedy-repetition 0 1 pn_prefix) #\:)))
401
(setf (cl-ppcre:parse-tree-synonym 'pn_chars_base) pn_chars_base)
402
(setf (cl-ppcre:parse-tree-synonym 'pn_chars_u) pn_chars_u)
403
(setf (cl-ppcre:parse-tree-synonym 'pn_chars) pn_chars)
404
(setf (cl-ppcre:parse-tree-synonym 'pn_prefix) pn_prefix)
405
(setf (cl-ppcre:parse-tree-synonym 'hex) hex)
406
(setf (cl-ppcre:parse-tree-synonym 'plx) plx)
407
(setf (cl-ppcre:parse-tree-synonym 'pn_local) pn_local)
408
(setf (cl-ppcre:parse-tree-synonym 'pname_ns) pname_ns))
411
(setf (cl-ppcre:parse-tree-synonym 'varname)
412
`(:sequence (:alternation pn_chars_u (:char-class (:range #\0 #\9)))
413
(:greedy-repetition 0 nil
414
(:alternation pn_chars_u
415
(:char-class (:range #\0 #\9) ,(code-char #x00B7)
416
(:range ,(code-char #x0300) ,(code-char #x036F))
417
(:range ,(code-char #x203F) ,(code-char #x2040)))))))
419
(defparameter *varname-scanner*
420
(cl-ppcre:create-scanner `(:sequence :start-anchor
421
(:alternation pn_chars_u (:char-class (:range #\0 #\9)))
422
(:greedy-repetition 0 nil
423
(:alternation pn_chars_u
424
(:char-class (:range #\0 #\9) ,(code-char #x00B7)
425
(:range ,(code-char #x0300) ,(code-char #x036F)) (:range ,(code-char #x203F) ,(code-char #x2040))))))))
427
(defun is-varname (string)
428
"Permit simple symbols, parsed as strings"
429
(when (stringp string)
430
(multiple-value-bind (start end)
431
(cl-ppcre:scan *varname-scanner* string)
432
(and start (= end (length string))))))
434
(defparameter *var-scanner*
435
(cl-ppcre:create-scanner `(:sequence :start-anchor
436
(:alternation #\? #\$)
437
(:alternation pn_chars_u (:char-class (:range #\0 #\9)))
438
(:greedy-repetition 0 nil
439
(:alternation pn_chars_u
440
(:char-class (:range #\0 #\9) ,(code-char #x00B7)
441
(:range ,(code-char #x0300) ,(code-char #x036F)) (:range ,(code-char #x203F) ,(code-char #x2040))))))))
442
(defun |IS-Var| (var)
443
"Permits variable names - symbols with '?' or '$' prefix, returned as symbols"
444
(and (symbolp var) (eq (symbol-package var) *variable-package*)))
447
(defparameter *blank_node_label-scanner*
448
(cl-ppcre:create-scanner `(:sequence :start-anchor (:alternation pn_chars_u (:char-class (:range #\0 #\9)))
449
(:greedy-repetition 0 1 (:sequence (:greedy-repetition 0 nil (:alternation pn_chars #\.)) pn_chars)))))
451
(defparameter *blank_node-scanner*
452
(cl-ppcre:create-scanner `(:sequence :start-anchor #\_ #\: (:alternation pn_chars_u (:char-class (:range #\0 #\9)))
453
(:greedy-repetition 0 1 (:sequence (:greedy-repetition 0 nil (:alternation pn_chars #\.)) pn_chars)))))
455
(defun is-blank_node_label (token)
456
"20160424: modified so that the terms make sense, but inconsistent with the bnf.
457
this code considers the label to be just the string following the '_:'.
460
(multiple-value-bind (start end)
461
(cl-ppcre:scan *blank_node_label-scanner* token)
462
(and start (= end (length token))))))
464
(defun is-blank_node (token)
465
"check for the full blank node syntax"
467
(multiple-value-bind (start end)
468
(cl-ppcre:scan *blank_node-scanner* token)
469
(and start (= end (length token))))))
471
(defparameter *pn_local-scanner*
472
(cl-ppcre:create-scanner `(:sequence :start-anchor pn_local)))
474
(defun is-pn_local (token)
476
(multiple-value-bind (start end)
477
(cl-ppcre:scan *pn_local-scanner* token)
478
(and start (= end (length token))))))
480
(defparameter *pname_ln-scanner*
481
(cl-ppcre:create-scanner `(:sequence :start-anchor pname_ns pn_local)))
483
(defun is-pname_ln (token)
484
(and (stringp token) (not (gethash token *literal-string-tokens*))
485
(is-pname_ln-name token)))
487
(defun is-pname_ln-name (token)
488
(multiple-value-bind (start end)
489
(cl-ppcre:scan *pname_ln-scanner* token)
490
(and start (= end (length token)))))
492
(defparameter *pname_ns-scanner*
493
(cl-ppcre:create-scanner '(:sequence :start-anchor pname_ns)))
495
(defun is-pname_ns (token)
496
(and (and (stringp token) (not (gethash token *literal-string-tokens*)))
497
(multiple-value-bind (start end)
498
(cl-ppcre:scan *pname_ns-scanner* token)
499
(and start (= end (length token))))))
501
(defun is-STRING_LITERAL_LONG1 (token)
502
(and (stringp token) (gethash token *literal-string-tokens*) t))
503
(defun is-STRING_LITERAL_LONG2 (token)
504
(and (stringp token) (gethash token *literal-string-tokens*) t))
505
(defun is-STRING_LITERAL1 (token)
506
(and (stringp token) (gethash token *literal-string-tokens*) t))
507
(defun is-STRING_LITERAL2 (token)
508
(and (stringp token) (gethash token *literal-string-tokens*) t))
509
;;; (untrace is-STRING_LITERAL_LONG1 is-STRING_LITERAL_LONG2 is-STRING_LITERAL1 is-STRING_LITERAL2)
511
#+(or)(defun is-ARITHMETIC_AGGREGATE_OPERATOR (token)
512
(and (or (stringp token) (symbolp token))
513
(member token '(AVG CORR MAX MIN SAMPLE SUM STD) :test #'string-equal)))
514
(defun is-ARITHMETIC_AGGREGATE_OPERATOR (token)
516
(member token '(AVG CORR MAX MIN SAMPLE SUM STD) :test #'string-equal)))
518
(defparameter *general-scanners*
519
(list *pname_ln-scanner* *pname_ns-scanner* *pn_local-scanner* ;; *blank_node_label-scanner*
522
*float-initial-scanner* *decimal-initial-scanner* *integer-initial-scanner*
523
*langtag-initial-scanner*))
525
(defparameter *max-input-index* 0)
527
(defun general-scanner (string start end)
528
(let ((longest-token-end 0)
530
(dolist (scanner *general-scanners*)
531
(when (and (setf token-end (nth-value 1 (cl-ppcre:scan scanner string :start start :end end)))
532
(> token-end longest-token-end))
533
(setf longest-token-end token-end)))
534
(when (plusp longest-token-end)
535
(values start longest-token-end))))
538
(defparameter *format-lock* (bt:make-lock "format"))
540
(defun input-reference (index)
541
(when (< index (length atnp::*ATN-INPUT))
542
(setf *max-input-index* (max *max-input-index* index))
544
(bt:with-lock-held (*format-lock*)
545
(format *trace-output* "~a~%"
546
(list (sb-thread:thread-name sb-thread:*current-thread*) index (aref atnp::*ATN-INPUT index)
547
(cons sparql-1-0-2::*ATN-TERM* sparql-1-0-2::*ATN-STACK))))
548
(aref atnp::*ATN-INPUT index)))
550
(defun input-tokens (&optional (start 0))
551
(do* ((i start (1+ i))
552
(input (spocq.i::input-reference i) (spocq.i::input-reference i))
554
((or (null input) (> i spocq.i::*max-input-index*))
556
(push input tokens)))
558
(defun input-eof? (index)
560
(format *trace-output* "[eof ~a ~a]~%"
561
(sb-thread:thread-name sb-thread:*current-thread*) (>= index (length atnp::*ATN-INPUT)))
562
(>= index (length atnp::*ATN-INPUT)))
564
(defun tokenize-sparql (string &key (start 0) (end (length string)))
565
;; (print (map 'vector #'char-code string))
566
(let ((input (make-array 32 :fill-pointer 0 :adjustable t))
567
(byte-offsets (make-array 32 :fill-pointer 0 :adjustable t))
568
(line-offsets (make-array 32 :fill-pointer 0 :adjustable t))
574
(labels (;; (break-p (c) (find c #(#\space #\tab #\return #\linefeed #\( #\) #\{ #\} #\[ #\] #\> #\: #\; #\,)))
575
(not-ws-p (c) (not (ws-p c)))
577
((#\space #\tab #\page #\nul) t)
578
((#\return #\linefeed) (note-eol) t)
581
((#\return #\linefeed) (note-eol) t)
586
(incf pragma-bytes (+ 1 (- start line-start))))
589
(setf line-start start))
590
(add-to-input (token)
591
;; (print (list start end token))
592
(when (eq token 'spocq.s:pragma)
594
(vector-push-extend eol-count line-offsets)
595
(vector-push-extend (- start pragma-bytes) byte-offsets)
596
(vector-push-extend token input))
598
(when (> (length input) 0)
599
(aref input (1- (length input)))))
600
(extract-token (start)
601
;; (print (list :et start (previous-token)))
602
(let ((terminal-token nil)
605
;; first, iff the previous was '<' check for iri
606
;; then look for general and for keyword token match
607
;; if the general break is longer, then use it othewise the keyword match
608
;; otherwise, the string remainder
609
(cond ((and (eq (previous-token) 'spocq.s:<)
610
(setf token-end (nth-value 1 (terminated-iri-scanner string start end)))
611
(> token-end (1+ start)))
612
(values (subseq string start (1- token-end)) (1- token-end)))
613
((and (find (char string start) "?$")
614
(setf token-end (nth-value 1 (cl-ppcre:scan *var-scanner* string :start start :end end))))
615
(values (intern (subseq string (1+ start) token-end) *variable-package*) token-end))
616
((and ;; (print (list (subseq string start end) start end))
618
(find-if #'(lambda (sym &aux (name (symbol-name sym)))
619
(string-equal name string :start2 start :end2 (min (+ start (length name)) end)))
620
*terminal-punctuation*))
621
(not (and (member terminal-token '(spocq.s:- spocq.s:+))
622
(find (char string (1+ start)) ".0123456789"))))
623
(values terminal-token (+ start (length (symbol-name terminal-token)))))
626
(or (find-if #'(lambda (sym &aux (name (symbol-name sym)))
627
(string-equal name string :start2 start :end2 (min (+ start (length name)) end)))
628
*case-insensitive-keywords*)
629
(find-if #'(lambda (sym &aux (name (symbol-name sym)))
630
(string= name string :start2 start :end2 (min (+ start (length name)) end)))
631
*case-sensitive-keywords*)))
632
;; nb, with the %-escapes in qnames are not escaped, as is correct for iri lexical forms
633
;; the \-escapes are also not handled, which is not correct, but no less so than the bnf's
634
;; divergence from the iri grammar.
635
(setf general-token (let ((token-end (nth-value 1 (general-scanner string start end))))
636
(when token-end (subseq string start token-end))))
637
;; (print general-token)
640
(if (or (> (length general-token) (length (symbol-name terminal-token)))
641
(and (= (length general-token) (length (symbol-name terminal-token)))
642
;; if it's a variable, return the string rather than the interned token
643
(member (previous-token) '(spocq.s:? SPOCQ.S:|_:| SPOCQ.S:PREFIX))))
644
(values general-token (+ start (length general-token)))
645
(values terminal-token (+ start (length (symbol-name terminal-token)))))
646
(values terminal-token (+ start (length (symbol-name terminal-token)))))
648
(values general-token (+ start (length general-token)))
649
(let ((ws-p (position-if #'ws-p string :start start)))
651
(values (subseq string start ws-p) ws-p)
652
(values (subseq string start) end)))))))))
653
(decode-unicode (string)
654
(if (search "\\u" string :test #'char-equal :start2 start :end2 end)
655
(let ((decoded (make-array (- end start) :element-type 'character :fill-pointer 0))
658
(loop (when (>= to-decode end)
659
;; copy the buffer - otherwise cl-ppcre has been observed to fail a match
660
(return (values (subseq decoded 0 (length decoded)) 0 (length decoded))))
661
(cond ((eq (setf char (char string to-decode)) #\\)
662
(case (char string (1+ to-decode))
663
(#\u (vector-push-extend (unicode-char (+ 2 to-decode) 4) decoded) (incf to-decode 6))
664
(#\U (vector-push-extend (unicode-char (+ 2 to-decode) 8) decoded) (incf to-decode 10))
665
;; if it's not a unicode escape, it's some other escape. retain it.
666
(t (vector-push-extend char decoded)
667
(vector-push-extend (char string (1+ to-decode)) decoded)
668
(incf to-decode 2))))
670
(vector-push-extend char decoded)
672
(values string start end)))
673
(extract-delimiter (start)
674
(let ((char (char string start)))
675
(if (and (< (+ 1 start) end) (eql char (char string (+ 1 start))))
676
(if (and (< (+ 2 start) end) (eql char (char string (+ 2 start))))
677
(values (ecase char (#\' :|'''|) (#\" :|"""|)) (+ 3 start))
678
(values char (+ 1 start)))
679
(values char (+ 1 start)))))
680
(unicode-char (start length)
681
(code-char (parse-integer string :start start :end (+ start length) :radix 16)))
682
(extract-string (start &aux delim)
683
(multiple-value-setq (delim start)
684
(extract-delimiter start))
685
(let ((buffer (make-array 10 :adjustable t :fill-pointer 0 :element-type 'character))
688
(loop (when (>= string-end end)
689
(return (values (subseq buffer 0 (length buffer)) string-end)))
690
(setf char (aref string string-end))
695
(return (values (subseq buffer 0 (length buffer)) (1+ string-end))))
697
(cond ((and (< (+ 2 string-end) end)
698
(eql (char string (+ 1 string-end)) #\")
699
(eql (char string (+ 2 string-end)) #\"))
700
(return (values (subseq buffer 0 (length buffer)) (+ string-end 3))))
702
(vector-push-extend #\" buffer)
705
(vector-push-extend #\" buffer)
709
(#\' (return (values (subseq buffer 0 (length buffer)) (1+ string-end))))
711
(cond ((and (< (+ 2 string-end) end)
712
(eql (char string (+ 1 string-end)) #\')
713
(eql (char string (+ 2 string-end)) #\'))
714
(return (values (subseq buffer 0 (length buffer)) (+ string-end 3))))
716
(vector-push-extend #\' buffer)
719
(vector-push-extend #\' buffer)
722
(if (< (+ 1 string-end) end)
723
(ecase (char string (+ 1 string-end))
724
((#\linefeed #\return))
725
(#\\ (vector-push-extend #\\ buffer) (incf string-end 2))
726
(#\' (vector-push-extend #\' buffer) (incf string-end 2))
727
(#\" (vector-push-extend #\" buffer) (incf string-end 2))
728
(#\b (vector-push-extend #\backspace buffer) (incf string-end 2))
729
(#\f (vector-push-extend #\page buffer) (incf string-end 2))
730
(#\n (vector-push-extend #\linefeed buffer) (incf string-end 2))
731
(#\r (vector-push-extend #\return buffer) (incf string-end 2))
732
(#\t (vector-push-extend #\tab buffer) (incf string-end 2)))
735
(vector-push-extend char buffer)
736
(incf string-end)))))))
737
(multiple-value-setq (string start end) (decode-unicode string))
738
(loop (unless (setf start (position-if #'not-ws-p string :start start))
740
(when (>= start end) (return))
741
(setf initial (char string start))
744
;; skip over a comment
745
(unless (and (setf start (position-if #'eol-p string :start start))
746
(setf start (position-if #'not-ws-p string :start start)))
749
(multiple-value-bind (token new-start) (extract-string start)
750
(add-to-input token) token
751
(setf (gethash token *literal-string-tokens*) t)
752
(setf start new-start)))
754
(let ((token-end nil))
755
;; (print (list :< token-end
756
(cond ((and (> end (1+ start)) (eql (char string (1+ start)) #\=))
757
(add-to-input 'spocq.s:<=)
759
((and (> end (1+ start)) (eql (char string (1+ start)) #\<))
760
(add-to-input 'spocq.s:<<)
762
((and (setf token-end (nth-value 1 (terminated-iri-scanner string (1+ start) end)))
763
(> token-end (1+ start)))
764
(add-to-input 'spocq.s:<)
766
(add-to-input (subseq string start (1- token-end)))
767
(setf start (1- token-end))
768
(add-to-input 'spocq.s:>)
769
(setf start token-end))
771
(add-to-input 'spocq.s:<)
774
(cond ((and (> end (1+ start)) (eql (char string (1+ start)) #\>))
775
(add-to-input 'spocq.s:>>)
777
((and (> end (1+ start)) (eql (char string (1+ start)) #\=))
778
(add-to-input 'spocq.s:>=)
781
(add-to-input 'spocq.s:>)
784
(or (multiple-value-bind (token-start token-end)
785
(cl-ppcre:scan *float-initial-scanner* string :start start)
787
(setf start token-end)
788
(add-to-input (spocq.e:double (subseq string token-start token-end)))))
789
(multiple-value-bind (token-start token-end)
790
(cl-ppcre:scan *decimal-initial-scanner* string :start start)
792
(setf start token-end)
793
(add-to-input (spocq.e:decimal (subseq string token-start token-end)))))
794
(multiple-value-bind (token-start token-end)
795
(cl-ppcre:scan *integer-initial-scanner* string :start start)
797
(setf start token-end)
798
(add-to-input (spocq.e:integer (subseq string token-start token-end)))))
799
(multiple-value-bind (token new-start) (extract-token start)
800
;; (print (list :extracted token new-start))
803
(add-to-input token))
805
;; ((is-integer-string token) (add-to-input (spocq.e:integer token)))
806
;; ((is-decimal-string token) (add-to-input (spocq.e:decimal token)))
807
;; ((is-float-string token) (add-to-input (spocq.e:double token)))
809
(add-to-input token)))
810
(setf start new-start))))))
811
(values input byte-offsets line-offsets string))))
814
(defun untokenize-sparql (tokens)
815
(with-output-to-string (stream)
816
(loop (let ((token (pop tokens)))
820
(cond ((and (stringp (first tokens))
821
(eql 'spocq.s:> (second tokens)))
822
(format stream "<~a>" (first tokens))
823
(pop tokens) (pop tokens))
825
(write-string "<" stream))))
828
(symbol (if (eq (symbol-package token) *variable-package*)
829
(format stream "?~a" token)
830
(write-string (symbol-name token) stream)))
831
(string (let* ((colon-pos (position #\: token))
832
(namespace (and colon-pos
833
(rest (assoc (subseq token 0 colon-pos)
834
(namespace-bindings) :test #'string-equal)))))
836
(format stream "<~a~a>" namespace (subseq token (1+ colon-pos)))
837
(format stream "'~a'" token))))
839
(format stream "~a" token))))))
840
(write-char #\space stream))))
842
(defparameter *sparql-query-description-scanner*
843
(cl-ppcre:create-scanner `(:sequence :start-anchor
844
(:register (:greedy-repetition
846
(:sequence (:greedy-repetition 1 nil #\#)
847
(:greedy-repetition 0 nil :whitespace-char-class)
848
(:greedy-repetition 0 nil (:inverted-char-class #\linefeed #\return))
849
(:greedy-repetition 0 nil (:alternation #\linefeed #\return))
851
"Match an initial comment in a sparql query string")
853
(defgeneric sparql-query-description (sparql)
854
(:documentation "Return an initial comment from a sparql query string as its description")
855
(:method ((text string))
856
(multiple-value-bind (success matches)
857
(cl-ppcre:scan-to-strings *sparql-query-description-scanner* text)
858
(when success (string-right-trim #(#\linefeed #\return) (remove #\# (aref matches 0))))))
859
(:method ((view view))
860
(sparql-query-description (or (view-query view) ""))))
862
(defparameter *sparql-query-parameter-scanner*
863
(cl-ppcre:create-scanner `(:sequence #\$
864
(:alternation pn_chars_u (:char-class (:range #\0 #\9)))
865
(:greedy-repetition 0 nil
866
(:alternation pn_chars_u
867
(:char-class (:range #\0 #\9) ,(code-char #x00B7)
868
(:range ,(code-char #x0300) ,(code-char #x036F)) (:range ,(code-char #x203F) ,(code-char #x2040)))))))
869
"Match a variable with an initial '$' in a sparql query string")
871
(defgeneric sparql-query-parameters (sparql)
872
(:documentation "return a list of the variables indicated as $<name>")
873
(:method ((text string))
874
(loop for var-token in (remove-duplicates (cl-ppcre:all-matches-as-strings *sparql-query-parameter-scanner* text) :test #'string=)
875
collect (intern (subseq var-token 1) :?))))
877
;;; (sparql-query-parameters "select $asdf ?quer $ty where { ?quer $asdf 1 : ?quer $ty 2}")