Coverage report: /development/source/library/org/datagraph/spocq-shard/src/core/encoding/terms/turtle-shard-term.lisp
| Kind | Covered | All | % |
| expression | 339 | 429 | 79.0 |
| branch | 23 | 36 | 63.9 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; -*- Mode: lisp; Syntax: ansi-common-lisp; Base: 10; Package: org.datagraph.spocq.implementation; -*-
2
;;; (load "patches/turtle-shard-term.lisp")
4
;;; replace the turtle field encoding operator with one which retrieves terms directly from lmdb.
6
(in-package :org.datagraph.spocq.implementation)
8
(defun write-turtle-char (char stream)
10
(#\" (write-string "\\\"" stream))
11
(#\\ (write-string "\\\\" stream))
12
(#\backspace (write-string "\\b" stream))
13
(#\page (write-string "\\f" stream))
14
(#\linefeed (write-string "\\n" stream))
15
(#\return (write-string "\\r" stream))
16
(#\tab (write-string "\\t" stream))
17
(t (let ((code (char-code char)))
18
(if (<= code #x1f) ; C0 control : http://en.wikipedia.org/wiki/Control_character
19
(format stream "\\u~4,'0x" code)
20
(write-char char stream)))))
23
(defun %write-turtle-string (%string stream byte-count)
24
"Emit an external character data string to an utf-8 encoded stream with turtle escaping.
25
This escapes the string terminator, whitespace-format, and control characters.
26
All else is passed to the stream for utf-8 encoding."
27
(unless byte-count (setf byte-count most-positive-fixnum)) ;; presume null-terminated
28
(let ((dsu:*utf8-iso8859-allowed* t)
29
(dsu:*utf8-surrogates-allowed* t)
30
(decoder (load-time-value (dsu:content-encoding-byte-decoder (dsu:content-encoding :utf-8))))
32
(declare (type (function (function cffi:foreign-pointer) (or character null)) decoder)
33
(type cffi:foreign-pointer %string)
34
(type fixnum byte-count)
35
(optimize (speed 3) (safety 0)))
36
(flet ((get-byte (%string)
37
(declare (type cffi:foreign-pointer %string))
38
(when (< i byte-count)
39
(let ((byte (cffi:mem-aref %string :uint8 i)))
41
(when (plusp byte) byte)))))
42
(loop for char = (funcall decoder #'get-byte %string)
44
do (write-turtle-char char stream)))))
46
(defun write-turtle-string-id (string-id stream)
47
"Given an rdf db string id, retrieve the external data and write it
48
to the stream as turtle in utf8."
49
(flet ((%write-string-data (k raw-value)
51
(let* ((size (rlmdb.i::%mdb-val-size raw-value))
52
(%data (rlmdb.i::%mdb-val-data raw-value)))
53
(%write-turtle-string %data stream size)
55
(declare (dynamic-extent #'%write-string-data))
56
(rlmdb::call-with-shard-string string-id #'%write-string-data)))
57
;;; (format *trace-output* "[~/format-turtle-string-id/]" 1)
59
(defun cl-user::format-turtle-string-id (stream string-id &optional option arg)
60
(declare (ignore option arg))
61
(write-turtle-string-id string-id stream))
63
(defun cl-user::%format-turtle-string (stream %string &optional option arg byte-count)
64
(declare (ignore option arg))
65
(assert (integerp byte-count))
66
(%write-turtle-string %string stream byte-count))
68
;;; (loop for char across #(#\a #\1 #\\ #\" #\' #\backspace #\page #\linefeed #\return #\tab #\. #\null #\stx) do (encode-turtle-utf8-char *trace-output* char))
71
(defun write-turtle-iri-char (char stream)
72
(labels ((encode-unicode-char-escape (char)
73
(encode-unicode-charcode-escape (char-code char)))
74
(encode-unicode-charcode-escape (code)
75
(format stream "\\u~4,'0x" code))
76
(encode-urlencode-char-escape (char)
77
(encode-urlencode-charcode-escape (char-code char)))
78
(encode-urlencode-charcode-escape (code)
79
(format stream "%~2,'0x" code)))
82
(encode-urlencode-char-escape char))
83
((#\< #\> #\" #\| #\^ #\` #\\)
84
(encode-unicode-char-escape char))
85
(t (let ((code (char-code char)))
86
(if (<= code #x1f) ; C0 control : http://en.wikipedia.org/wiki/Control_character
87
(encode-unicode-charcode-escape code)
88
(write-char char stream)))))))
90
(defun %write-turtle-iri-string (%string stream byte-count)
91
"Emit an external character data string to an utf-8 encoded stream with turtle escaping.
92
This escapes the string terminator, whitespace-format, and control characters.
93
All else is passed to the stream for utf-8 encoding.
94
Perform lax unicode decoding"
95
(unless byte-count (setf byte-count most-positive-fixnum))
96
(let ((dsu:*utf8-iso8859-allowed* t)
97
(dsu:*utf8-surrogates-allowed* t)
98
(decoder (load-time-value (dsu:content-encoding-byte-decoder (dsu:content-encoding :utf-8))))
100
(declare (type (function (function cffi:foreign-pointer) (or character null)) decoder)
101
(type cffi:foreign-pointer %string)
102
(type fixnum byte-count)
103
(optimize (speed 3) (safety 0)))
104
(flet ((get-byte (%string)
105
(declare (type cffi:foreign-pointer %string))
106
(when (< i byte-count)
107
(let ((byte (cffi:mem-aref %string :uint8 i)))
109
(when (plusp byte) byte)))))
110
(loop for char = (funcall decoder #'get-byte %string)
112
do (write-turtle-iri-char char stream)))))
115
(defun write-turtle-iri-string-id (string-id stream)
116
(flet ((%write-maybe-null-terminated-string (k raw-value)
118
(let* ((size (rlmdb.i::%mdb-val-size raw-value))
119
(%data (rlmdb.i::%mdb-val-data raw-value))
120
(terminated-size (if (zerop (cffi:mem-aref %data :uint8 (1- size))) (1- size) size)))
121
(%write-turtle-iri-string %data stream terminated-size)
123
(declare (dynamic-extent #'%write-maybe-null-terminated-string))
124
(rlmdb::call-with-shard-string string-id #'%write-maybe-null-terminated-string)))
127
(defun cl-user::format-turtle-iri-string-id (stream string-id &optional option arg)
128
(declare (ignore option arg))
129
(write-turtle-iri-string-id string-id stream))
131
(defun cl-user::%format-turtle-iri-string (stream %string &optional option arg byte-count)
132
(declare (ignore option arg))
133
(assert (integerp byte-count))
134
(%write-turtle-iri-string %string stream byte-count))
136
(defparameter *string-datatype-string-id* nil)
137
(defun string-datatype-string-id ()
138
(or *string-datatype-string-id*
139
(setf *string-datatype-string-id* (get-string-datatype-string-id))))
140
(defun get-string-datatype-string-id ()
141
(rlmdb::string-dictionary-get (symbol-uri-namestring |xsd|:|string|)))
143
(defparameter *is-string-datatype-registry* (make-registry :test 'equal))
145
(defun is-string-datatype-string-id (string-id)
146
(multiple-value-bind (is is-known) (gethash string-id *is-string-datatype-registry*)
149
(with-locked-registry (*is-string-datatype-registry*)
150
;; under race, this will update the table twice...
151
(setf (gethash string-id *is-string-datatype-registry*)
152
(equal (symbol-uri-namestring |xsd|:|string|)
153
(rlmdb::string-dictionary-get string-id)))))))
157
(defun encode-turtle-term-number (term-number stream)
158
"Given a term number and a stream, encode the stored term properties
159
(type, string, immediate value) to the given stream."
160
(declare (type fixnum term-number))
161
(let* ((%term (rlmdb:shard-term-fetch term-number))
162
(term-type (rlmdb:%shard-term-type %term))
163
(%term-data (rlmdb:%shard-term-data %term)))
164
(declare (type cffi:foreign-pointer %term)
165
(type symbol term-type)
166
(type cffi:foreign-pointer %term-data))
169
(ecase (rlmdb:%shard-term-subtype-node-subtype %term)
171
(format stream "_:~@[~a_~]genid~d"
173
(rlmdb:%shard-term-data-node-genid %term-data)))
175
(format stream "_:~@[~a_~]~V/%format-turtle-string/~d"
177
;; need a byte-count limited version of stream-write-turtle-external-utf8-string
178
4 (rlmdb:%shard-term-data-node-gensym-prefix %term-data)
179
(rlmdb:%shard-term-data-node-gensym-suffix %term-data)))
181
(format stream "_:~@[~a_~]~V/%format-turtle-string/"
183
8 (rlmdb:%shard-term-data-node-label %term-data)))
185
(format stream "_:~@[~a_~]~/format-turtle-string-id/"
187
(rlmdb:%shard-term-data-node-label-offset %term-data)))))
189
(format stream "<~/format-turtle-iri-string-id/>"
190
(rlmdb:%shard-term-data-uri-string-offset %term-data)))
192
(ecase (rlmdb:%shard-term-subtype-string-subtype %term)
194
(format stream "\"~V/%format-turtle-string/\""
197
(let* ((string-offset (rlmdb:%shard-term-data-string-value-offset %term-data))
198
(language-offset (rlmdb:%shard-term-data-string-language-offset %term-data)))
199
(if (plusp language-offset)
200
(format stream "\"~/format-turtle-string-id/\"@~/format-turtle-string-id/" string-offset language-offset)
201
(format stream "\"~/format-turtle-string-id/\"" string-offset))))))
203
(let ((lexical-form-id (rlmdb:%shard-term-data-literal-value-offset %term-data))
204
(datatype-string-id (rlmdb:%shard-term-data-literal-datatype-offset %term-data)))
205
(if (is-string-datatype-string-id datatype-string-id)
206
(format stream "\"~/format-turtle-string-id/\""
208
(format stream "\"~/format-turtle-string-id/\"^^<~/format-turtle-iri-string-id/>"
209
lexical-form-id datatype-string-id))))
211
;; must return it directly to avoid anomolous appearances
212
(format stream "\"~:[false~;true~]\"^^<http://www.w3.org/2001/XMLSchema#boolean>"
213
(rlmdb:%shard-term-data-boolean %term-data)))
215
(encode-turtle-object (rlmdb:%shard-term-data-integer %term-data) stream))
217
(ecase (rlmdb:%shard-term-subtype-decimal-subtype %term)
219
(let* ((options (rlmdb:%shard-term-options %term))
220
(minusp (zerop (logand options #x01)))
221
(scale (ash options -1))
222
(value (rlmdb:%shard-term-data-integer %term-data)))
223
(when minusp (setf value (- value)))
224
(setf scale (if (zerop value) 1 (expt 10 scale)))
226
#+sbcl (sb-kernel::%make-ratio value scale)
227
#-sbcl (/ value scale))
228
(encode-turtle-object value stream)))
230
(let* ((integer (rlmdb:%shard-term-data-decimal-integer %term-data))
231
(fraction (if (zerop integer) 1 (rlmdb:%shard-term-data-decimal-fraction %term-data))))
232
(if (typep 1/3 *expand-literal-values*)
233
(format stream "\"~d.~d\"^^<http://www.w3.org/2001/XMLSchema#decimal>" integer fraction)
234
(format stream "~d.~d" integer fraction))))))
236
(encode-turtle-object (rlmdb:%shard-term-data-float %term-data) stream))
238
(encode-turtle-object (rlmdb:%shard-term-data-double %term-data) stream))
240
(encode-turtle-object (timeline-location-date-time (rlmdb:%shard-term-data-time %term-data)) stream))
242
(encode-turtle-object (timeline-location-date (rlmdb:%shard-term-data-time %term-data)) stream))
244
(encode-turtle-object (timeline-location-time (rlmdb:%shard-term-data-time %term-data)) stream))
249
;;; wrap the top-level encoding operator to establish an open string database
250
;;; replace the field encoding operator with one which reads direct from there.
252
(defmethod write-rdf-turtle :around ((results solution-generator) (stream stream))
253
(rlmdb::with-string-database (sdb)
256
(defun write-rdf-field-turtle (last-subject page stream &optional (index 0) (start 0) end)
257
(assert (= (length (array-dimensions page)) 2) ()
258
"invalid result array dimensions: ~a" (array-dimensions page))
259
(unless (typep (array-dimension page 1) '(integer 3 4)) ()
260
(spocq.e:request-error "Invalid turtle field dimensions: ~s ..." (array-dimensions page)))
261
(flet ((emit (term-number)
263
(-1 (write-string "<urn:dydra:default>" stream))
264
(-2 (write-string "<urn:dydra:named>" stream))
265
(t (encode-turtle-term-number term-number stream)))
266
(write-char #\space stream)))
267
(dotimes (page-index (array-dimension page 0))
268
(when (>= index start)
269
(when (and end (>= index end))
271
(let ((subject (aref page page-index 0))
272
(predicate (aref page page-index 1))
273
(object (aref page page-index 2)))
274
(when (not (zerop (* subject predicate object)))
276
(cond ((= last-subject subject)
277
(format stream " ;~% "))
279
(format stream " .~%")
282
(setf last-subject subject)
290
(test-sparql "describe ?s where {?s ?p ?o}" :repository-id "james/test2"
291
:response-content-type mime:text/turtle)
292
(test-sparql "describe ?s where {?s ?p ?o}" :repository-id "james/test2"
293
:response-content-type mime:application/n-quads)
295
(defun dump-term-numbers (&key (start 1) (stream *standard-output*) (term-count (rlmdb:%term-record-count)))
296
(rlmdb::with-string-database (sdb)
297
(loop for i from start below term-count
298
with spocq.i::*encode-object-subtypes* = t
299
do (block :encode-term-number
300
(handler-bind ((error (lambda (c) (cerror "continue." c) (return-from :encode-term-number c))))
301
(encode-turtle-term-number i stream)
303
(with-open-file (stream "/tmp/terms.out" :direction :output :if-exists :supersede :if-does-not-exist :create)
304
(dump-term-numbers :stream stream :limit 10000))
305
(with-open-file (stream "/tmp/terms.out" :direction :output :if-exists :supersede :if-does-not-exist :create)
306
(dump-term-numbers :stream stream :start 13525000))
308
;; time 490s w/ declarations 517 w-o
309
(with-open-file (stream "/tmp/terms.out" :direction :output :if-exists :supersede :if-does-not-exist :create)
310
(dump-term-numbers :stream stream))
311
(rlmdb::with-string-database (sdb) (encode-turtle-term-number 114161370 *trace-output*))
312
(write-turtle-string-id 116503701 *trace-output*)
314
(%write-turtle-string #.(SB-SYS:INT-SAP #X7CB816BE4C04) *trace-output* 96)