diff --git a/doc/tech_report.tex b/doc/tech_report.tex new file mode 100644 index 0000000000000000000000000000000000000000..47dcf5b2df011de82c96cbd0bb8b34cc21c30dbd --- /dev/null +++ b/doc/tech_report.tex @@ -0,0 +1,314 @@ +% *** an embryo of a technical report describing the labcomm design rationale and implementation *** + +\documentclass[a4paper]{article} +%\usepackage{verbatims} +%\usepackage{todo} + +\begin{document} +\title{LabComm tech report} +\author{Anders Blomdell and Sven Gesteg\aa{}rd Robertz } +\date{embryo of draft, \today} + +\maketitle + +\begin{abstract} + +LabComm is a binary protocol suitable for transmitting and storing samples of +process data. It is self-describing and independent of programming language, +processor, and network used (e.g., byte order). It is primarily intended +for situations where the overhead of communication has to be kept at a minimum, +hence LabComm only requires one-way communication to operate. The one-way +operation also has the added benefit of making LabComm suitable as a storage +format. + +LabComm provides self-describing channels, as communication starts with the +transmission of an encoded description of all possible sample types that can +occur, followed by any number of actual samples in any order the sending +application sees fit. + +The LabComm system is based on a binary protocol +and a compiler that generates encoder/decoder routines for popular languages +including C, Java, and Python. + +The LabComm compiler accepts type and sample declarations in a small language +that is similar to C or Java type-declarations. +\end{abstract} +\section{Introduction} + +%[[http://rfc.net/rfc1057.html|Sun RPC]] +%[[http://asn1.org|ASN1]]. + +LabComm draws its inspiration from Sun RPC~\cite{SunRPC} +and ASN.1~\cite{ANS1}. LabComm is primarily intended for situations +where the overhead of communication has to be kept at a minimum, hence LabComm +only requires one-way communication to operate. 
The one-way operation also has +the added benefit of making LabComm suitable as a storage format. + +\section{Communication model} + +LabComm provides self-describing communication channels, by always transmitting +a machine-readable description of the data before actual data is sent. +Therefore, communication on a LabComm channel has two phases: + +\begin{enumerate} +\item the transmission of signatures (an encoded description including data +types and names, see appendix~\ref{sec:ProtocolGrammar} for details) for all sample types +that can be sent on the channel +\item the transmission of any number of actual samples in any order +\end{enumerate} + +During operation, LabComm will ensure (i.e., monitor) that a communication +channel is fully configured, meaning that both ends agree on which messages +may be passed over that channel. If an unregistered sample type is sent or +received, the LabComm encoder or decoder will detect it and take action. + +The roles in setting up, and maintaining, the configuration of a channel are as follows: + +\paragraph{The application software} (or higher-level protocol) is required to + +\begin{itemize} +\item register all samples to be sent on a channel with the encoder +\item register handlers for all samples to be received on a channel with the decoder +\end{itemize} + +\paragraph{The transmitter (encoder)} + +\begin{itemize} + \item ensures that the signature of a sample is transmitted on the channel before samples are + written to that channel +\end{itemize} + +\paragraph{The receiver (decoder)} + +\begin{itemize} + \item checks, for each signature, that the application has registered a handler for that sample type + \item if an unhandled signature is received, pauses the channel and informs the application +\end{itemize} + +\section{The LabComm language} + +The following examples do not cover the entire language +specification (see appendix~\ref{language_grammar}), but might serve as a +gentle introduction to the LabComm 
language. + +\subsection{Primitive types} + +\begin{verbatim} + sample boolean a_boolean; + sample byte a_byte; + sample short a_short; + sample int an_int; + sample long a_long; + sample float a_float; + sample double a_double; + sample string a_string; +\end{verbatim} + +\subsection{Arrays} + +\begin{verbatim} + sample int fixed_array[3]; + sample int variable_array[_]; // Note 1 + sample int fixed_array_of_array[3][4]; // Note 2 + sample int fixed_rectangular_array[3, 4]; // Note 2 + sample int variable_array_of_array[_][_]; // Notes 1 & 2 + sample int variable_rectangular_array[_, _]; // Notes 1 & 2 +\end{verbatim} + +\begin{enumerate} +\item In contrast to C, LabComm supports both fixed-size and variable-size (denoted +by \verb+_+) arrays. + +\item In contrast to Java, LabComm supports multidimensional arrays and not +only arrays of arrays. + +\end{enumerate} + +\subsection{Structures} + +\begin{verbatim} + sample struct { + int an_int_field; + double a_double_field; + } a_struct; +\end{verbatim} + +\section{User-defined types} + +\begin{verbatim} + typedef struct { + int field_1; + byte field_2; + } user_type[_]; + sample user_type a_user_type_instance; + sample user_type another_user_type_instance; +\end{verbatim} + +\section{User actions} + +User actions (similar to \texttt{ioctl()}) allow the application or a higher-level +protocol to communicate with the underlying transport layer through the LabComm +encoder. A special case of this is a specific action informing the underlying +transport that a signature is being sent (to allow handshaking). + + +\section{LabComm is not...} + +\begin{itemize} +\item a protocol for two-way connections +\item intrinsically supporting reliable communication +\item providing semantic service-descriptions +\end{itemize} + +But + +\begin{itemize} +\item it is suitable for the individual channels of a structured connection +\item the user action mechanism allows using features of different transport layers + through LabComm (i.e., it allows 
encapsulation of the transport layer) +\item the names of samples can be chosen and mapped according to a suitable taxonomy or ontology +\end{itemize} + + + +\section{Example and its encoding} + +With the following \verb+example.lc+ file: + +\begin{verbatim} +sample struct { + int sequence; + struct { + boolean last; + string data; + } line[_]; +} log_message; +sample float data; +\end{verbatim} + +and this \verb+example_encoder.c+ file: + +\begin{verbatim} +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <labcomm_fd_reader.h> +#include <labcomm_fd_writer.h> +#include "example.h" + +int main(int argc, char *argv[]) { + int fd; + struct labcomm_encoder *encoder; + int i, j; + + fd = open("example.encoded", O_WRONLY|O_CREAT|O_TRUNC, 0644); + encoder = labcomm_encoder_new(labcomm_fd_writer, &fd); + labcomm_encoder_register_example_log_message(encoder); + labcomm_encoder_register_example_data(encoder); + for (i = 0 ; i < argc ; i++) { + example_log_message message; + + message.sequence = i + 1; + message.line.n_0 = i; + message.line.a = malloc(message.line.n_0*sizeof(message.line)); + for (j = 0 ; j < i ; j++) { + message.line.a[j].last = (j == message.line.n_0 - 1); + message.line.a[j].data = argv[j + 1]; + } + labcomm_encode_example_log_message(encoder, &message); + free(message.line.a); + } + for (i = 0 ; i < argc ; i++) { + float f = i; + labcomm_encode_example_data(encoder, &f); + } +} +\end{verbatim} + +Running \verb+./example_encoder one two+ will yield the following result in \verb+example.encoded+: + + +\begin{verbatim} +00000000 02 40 0b 6c 6f 67 5f 6d 65 73 73 61 67 65 11 02 |.@.log_message..| +00000010 08 73 65 71 75 65 6e 63 65 23 04 6c 69 6e 65 10 |.sequence#.line.| +00000020 01 00 11 02 04 6c 61 73 74 20 04 64 61 74 61 27 |.....last .data'| +00000030 02 41 04 64 61 74 61 25 40 00 00 00 01 00 40 00 |.A.data%@.....@.| +00000040 00 00 02 01 01 03 6f 6e 65 40 00 00 00 03 02 00 |......one@......| +00000050 03 6f 6e 65 01 03 74 77 6f 41 00 00 
00 00 41 3f |.one..twoA....A?| +00000060 80 00 00 41 40 00 00 00 |...A@...| +00000068 +\end{verbatim} + + +\section{Ideas/Discussion} + +The LabComm language is more expressive than its target languages regarding data types. +E.g., LabComm can declare both arrays of arrays and matrices, where Java only has arrays of arrays. +In the generated Java code, a LabComm matrix is implemented as an array of arrays. + +Another case (not yet included) is unsigned types, which Java doesn't have. If we include +unsigned long in LabComm, it will have a larger range of values than can be expressed using +Java primitive types. However, it is unlikely that the entire range is actually used, so one +way of supporting the common cases is to include run-time checks for overflow in the Java encoders +and decoders. + +\appendix + +\section{The LabComm protocol} +\label{sec:ProtocolGrammar} + +\begin{verbatim} +<packet> := ( <type_decl> | <sample_decl> | <sample_data> )* +<type_decl> := 0x01 ''(packed)'' <user_id> <string> <type> +<sample_decl> := 0x02 ''(packed)'' <user_id> <string> <type> +<user_id> := 0x60..0xffffffff ''(packed)'' +<string> := <string_length> <char>* +<string_length> := 0x00..0xffffffff ''(packed)'' +<char> := any UTF-8 char +<type> := ( <basic_type> | <user_id> | <array_decl> | <struct_decl> ) +<basic_type> := ( <boolean_type> | <byte_type> | <short_type> | + <integer_type> | <long_type> | <float_type> | + <double_type> | <string_type> ) +<boolean_type> := 0x20 ''(packed)'' +<byte_type> := 0x21 ''(packed)'' +<short_type> := 0x22 ''(packed)'' +<integer_type> := 0x23 ''(packed)'' +<long_type> := 0x24 ''(packed)'' +<float_type> := 0x25 ''(packed)'' +<double_type> := 0x26 ''(packed)'' +<string_type> := 0x27 ''(packed)'' +<array_decl> := 0x10 ''(packed)'' <number_of_indices> <indices> <type> +<number_of_indices> := 0x00..0xffffffff ''(packed)'' +<indices> := ( <variable_index> | <fixed_index> )* +<variable_index> := 0x00 ''(packed)'' +<fixed_index> := 0x01..0xffffffff 
''(packed)'' +<struct_decl> := 0x11 ''(packed)'' <number_of_fields> <field>* +<number_of_fields> := 0x00..0xffffffff ''(packed)'' +<field> := <string> <type> +<sample_data> := <user_id> <packed_sample_data> +<packed_sample_data> := is sent in network order, sizes are as follows: + +||Type ||Encoding/Size || +||---------------||------------------------------------------------------|| +||boolean || 8 bits || +||byte || 8 bits || +||short || 16 bits || +||integer || 32 bits || +||long || 64 bits || +||float || 32 bits || +||double || 64 bits || +||string || length ''(packed)'', followed by UTF-8 encoded string || +||array || each variable index ''(packed)'', || +|| || followed by encoded elements || +||struct || concatenation of encoding of each element || +|| || in declaration order || +\end{verbatim} + +Type fields, user IDs, number of indices and lengths are sent in a packed, or +variable-length, form. An integer is sent as a sequence of bytes where the +lower seven bits contain a chunk of the actual number and the high bit +indicates if more chunks follow. The sequence of chunks is sent with the least +significant chunk first. (The numbers encoded in this form are indicated above +with \emph{(packed)}.) + +\end{document}