Compare revisions

d3921882 · d3921882 · e2c2b9d4 · d3921882 · d3921882 · d3921882
--- a/compiler/tools/beaver-cc.jar
+++ b/compiler/tools/beaver-cc.jar
--- a/compiler/tools/beaver-rt.jar
+++ b/compiler/tools/beaver-rt.jar
--- a/compiler/tools/beaver.jar
+++ b/compiler/tools/beaver.jar
--- a/doc/.gitignore
+++ b/doc/.gitignore
+tech_report.aux
+tech_report.bbl
+tech_report.blg
+tech_report.fdb_latexmk
+tech_report.fls
+tech_report.log
+tech_report.pdf
--- a/doc/refs.bib
+++ b/doc/refs.bib
+@TechReport{SunRPC,
+  author =    {},
+  title =     {RFC 1057: Remote Procdeure Call Protocol Specification, Version 2},
+  institution =  {Sun Microsystems},
+  year =         {1988},
+  OPTkey =       {},
+  OPTtype =      {},
+  OPTnumber =    {},
+  OPTaddress =   {},
+  OPTmonth =     {},
+  OPTnote =      {},
+  OPTannote =    {}
+}
+@Misc{ASN1,
+  author =       {ITU-T},
+  title =        {ASN.1 project web page},
+  howpublished  ={\verb+http://www.itu.int/en/ITU-T/asn1/+},
+  OPTnote =    {}
+}
+ 
+@article{jastadd,
+ author = {Ekman, Torbj\"{o}rn and Hedin, G\"{o}rel},
+ title = {The Jastadd Extensible Java Compiler},
+ journal = {SIGPLAN Not.},
+ issue_date = {October 2007},
+ volume = {42},
+ number = {10},
+ month = oct,
+ year = {2007},
+ issn = {0362-1340},
+ pages = {1--18},
+ numpages = {18},
+ url = {http://doi.acm.org/10.1145/1297105.1297029},
+ doi = {10.1145/1297105.1297029},
+ acmid = {1297029},
+ publisher = {ACM},
+ address = {New York, NY, USA},
+ keywords = {OOP, compilers, declarative frameworks, extensibility, java, modularity},
+}
--- a/doc/tech_report.tex
+++ b/doc/tech_report.tex
+% *** en embryo of a technical report describing the labcomm design rationale and implementation ***
+
+\documentclass[a4paper]{article}
+\usepackage{listings}
+%\usepackage{verbatims}
+%\usepackage{todo}
+
+\begin{document}
+\title{Labcomm tech report}
+\author{Anders Blomdell and Sven Gesteg\aa{}rd Robertz }
+\date{draft, \today}
+
+\maketitle
+
+\begin{abstract}
+
+LabComm is a binary protocol suitable for transmitting and storing samples of
+process data. It is self-describing and independent of programming language,
+processor, and network used (e.g., byte order, etc).  It is primarily intended
+for situations where the overhead of communication has to be kept at a minimum,
+hence LabComm only requires one-way communication to operate. The one-way
+operation also has the added benefit of making LabComm suitable as a storage
+format.
+
+LabComm provides self-describing channels, as communication starts with the
+transmission of an encoded description of all possible sample types that can
+occur, followed by any number of actual samples in any order the sending
+application sees fit.
+
+The LabComm system is based on a binary protocol and
+and a compiler that generates encoder/decoder routines for popular languages
+including C, Java, and Python. There is also a standard library for the
+languages supported by the compiler, containing generic routines for
+encoding and decoding types and samples, and interaction with
+application code.
+
+The LabComm compiler accepts type and sample declarations in a small language
+that is similar to C or Java type-declarations.
+\end{abstract}
+\section{Introduction}
+
+%[[http://rfc.net/rfc1057.html|Sun RPC]]
+%[[http://asn1.org|ASN1]].
+
+LabComm has got it's inspiration from Sun RPC~\cite{SunRPC}
+and ASN1~\cite{ASN1}. LabComm is primarily intended for situations
+where the overhead of communication has to be kept at a minimum, hence LabComm
+only requires one-way communication to operate. The one-way operation also has
+the added benefit of making LabComm suitable as a storage format.
+
+Two-way comminication adds complexity, in particular for hand-shaking
+during establishment of connections, and the LabComm library provides
+support for (for instance) avoiding deadlocks during such hand-shaking.
+
+\pagebreak
+\section{Communication model}
+
+LabComm provides self-describing communication channels, by always transmitting
+a machine readable description of the data before actual data is
+sent\footnote{Sometimes referred to as \emph{in-band} self-describing}.
+Therefore, communication on a LabComm channel has two phases
+
+\begin{enumerate}
+\item the transmission of signatures (an encoded description including data
+types and names, see appendix~\ref{sec:ProtocolGrammar} for details) for all sample types
+that can be sent on the channel
+\item the transmission of any number of actual samples in any order
+\end{enumerate}
+
+During operation, LabComm will ensure (i.e., monitor) that a communication
+channel is fully configured, meaning that both ends agree on what messages that
+may be passed over that channel.  If an unregistered sample type is sent or
+received, the LabComm encoder or decoder will detect it and take action.
+In more dynamic applications, it is possible to reconfigure a channel in order to add,
+remove, or change the set of registered sample types. This is discussed
+in Section~\ref{sec:reconfig}.
+
+The roles in setting up, and maintaining, the configuration of a channel are as follows:
+
+\paragraph{The application software} (or higher-level protocol) is required to
+
+\begin{itemize}
+\item register all samples to be sent on a channel with the encoder
+\item register handlers for all samples to be received  on a channel with the decoder
+\end{itemize}
+
+\paragraph{The transmitter (encoder)}
+
+\begin{itemize}
+ \item ensures that the signature of a sample is transmitted on the channel before samples are
+       written to that channel
+\end{itemize}
+
+\paragraph{The receiver (decoder)}
+
+\begin{itemize}
+ \item checks, for each signature, that the application has registered a handler for that sample type
+ \item if an unhandled signature is received, pauses the channel and informs the application
+\end{itemize}
+
+The fundamental communication model applies to all LabComm channels and
+deals with the individual unidirectional channels. In addition to that,
+the labcomm libraries support the implementation of higher-level
+handshaking and establishment of bidirectional channels both through
+means of interacting with the underlying transport layer (e.g., for
+marking packets containing signatures as \emph{important}, for
+transports that handle resending of dropped packets selectively), or
+requesting retransmission of signatures.
+
+In order to be both lean and generic, LabComm does not provide a
+complete protocol for establishing and negotiating bidirectional
+channels, but does provide support for building such protocols on top
+of LabComm.
+\subsection{Reconfiguration}
+\label{sec:reconfig}
+
+The fundamental communication model can be generalized to the life-cycle
+of a concrete communication channel, including the transport layer,
+between two end-points. Then, the communication phases are
+\begin{enumerate}
+  \item \emph{Establishment} of communication channel at the transport layer
+  \item \emph{Configuration} of the LabComm channel (registration of sample
+    types)
+  \item \emph{Operation} (transmission of samples)
+\end{enumerate}
+where it is possible to \emph{reconfigure} a channel by transitioning
+back from phase 3 to phase 2. That allows dynamic behaviour, as a sample
+type can be added or redefined at run-time. It also facilitates error
+handling in two-way channels.
+
+One example of this, more dynamic, view of a labcomm channel is that the
+action taken when an unregistered sample is sent or received is to
+revert back to the configuration phase and redo the handshaking to
+ensure that both sides agree on the set of sample types (i.e.,
+signatures) that are currently configured for the channel.
+
+From the system perspective, the LabComm protocol is involved in
+phases 2 and 3. The establishement of the \emph{transport-layer}
+channels is left to external application code. In the Labcomm library,
+that application code is connected to the LabComm routines through
+the \emph{reader} and \emph{writer} interfaces,
+with default implementations for sockets or file descriptors (i.e.,
+files and streams).
+
+
+\section{The Labcomm language}
+
+The LabComm language is used to describe data types, and from such data
+descriptions, the compiler generates code for encoding and decoding
+samples. The language is quite similar to C struct declarations, with
+some exceptions. We will now introduce the language through a set of
+examples.
+
+These examples do not cover the entire language
+specification (see appendix~\ref{sec:LanguageGrammar} for the complete
+grammar), but serve as a gentle introduction to the LabComm
+language covering most common use-cases.
+
+\subsection{Primitive types}
+
+\begin{verbatim}
+  sample boolean a_boolean;
+  sample byte a_byte;
+  sample short a_short;
+  sample int an_int;
+  sample long a_long;
+  sample float a_float;
+  sample double a_double;
+  sample string a_string;
+\end{verbatim}
+
+\subsection{The void type}
+
+There is a type, \verb+void+, which can be used to send
+a sample that contains no data. 
+
+\begin{verbatim}
+typedef void an_empty_type;
+
+sample an_empty_type no_data1;
+sample void no_data2;
+\end{verbatim}
+
+\verb+void+ type can may not be used as a field in a struct or
+the element type of an array.
+
+
+\subsection{Arrays}
+
+\begin{verbatim}
+  sample int fixed_array[3];
+  sample int variable_array[_];                // Note 1
+  sample int fixed_array_of_array[3][4];       // Note 2
+  sample int fixed_rectangular_array[3, 4];    // Note 2
+  sample int variable_array_of_array[_][_];    // Notes 1 & 2
+  sample int variable_rectangular_array[_, _]; // Notes 1 & 2
+\end{verbatim}
+
+\begin{enumerate}
+\item In contrast to C, LabComm supports both fixed and variable (denoted
+by~\verb+_+) sized arrays.
+
+\item In contrast to Java, LabComm supports multidimensional arrays and not
+only arrays of arrays.
+
+\end{enumerate}
+
+\subsection{Structures}
+
+\begin{verbatim}
+  sample struct {
+    int an_int_field;
+    double a_double_field;
+  } a_struct;
+\end{verbatim}
+
+\subsection{Sample type refereces}
+
+In addition to the primitive types, a sample may contain
+a reference to a sample type. References are declared using
+the \verb+sample+ keyword.
+
+Examples:
+
+\begin{verbatim}
+sample sample a_ref;
+
+sample sample ref_list[4];
+
+sample struct { 
+    sample ref1;
+    sample ref2;
+    int x;
+    int y;
+} refs_and_ints;
+\end{verbatim}
+
+Sample references are need to be registered on both encoder and decoder
+side, using the functions
+
+\begin{verbatim}
+int labcomm_decoder_sample_ref_register(
+    struct labcomm_decoder *decoder\nonumber
+    const struct labcomm_signature *signature);
+
+int labcomm_encoder_sample_ref_register(
+    struct labcomm_encoder *encoder\nonumber
+    const struct labcomm_signature *signature);
+\end{verbatim}
+
+The value of an unregistered sample reference will be decoded as \verb+null+.
+
+\subsection{User defined types}
+
+User defined types are declared with the \verb+typedef+ reserved word,
+and can then be used in type and sample declarations.
+
+\begin{verbatim}
+  typedef struct {
+    int field_1;
+    byte field_2;
+  } user_type[_];
+  sample user_type a_user_type_instance;
+  sample user_type another_user_type_instance;
+\end{verbatim}
+
+\section{The LabComm system}
+
+The LabComm system consists of a compiler for generating code from the data
+descriptions, and libraries providing LabComm communication facilities in,
+currently, C, Java, Python, C\#, and RAPID\footnote{excluding variable
+size samples, as RAPID has limited support for dynamic memory allocation}.
+
+
+\subsection{The LabComm compiler}
+
+The LabComm compiler generates code for the declared samples, including marshalling and
+demarshalling code, in the supported target languages.
+
+The compiler itself is implemented in Java using the JastAdd~\cite{jastadd} compiler compiler.
+
+\subsection{The LabComm library}
+
+The LabComm libraries contain functionality for the end-to-end transmission
+of samples. They are divided into two layers, where the upper layer implements
+the general encoding and decoding of samples, and the lower one deals with
+the transmission of the encoded byte stream on a particular transport layer.
+
+Thus, the LabComm communication stack looks like this:
+\begin{figure}[h!]
+\begin{verbatim}
+    _______________________
+    |     Application     |
+    +---------------------+
+    | encoder  | decoder  |    to/from labcomm encoded byte stream
+    +----------+----------+
+    | writer   | reader   |    transmit byte stream over particular transport
+    +----------+----------+
+    | transport layer / OS|
+    +---------------------+
+\end{verbatim}
+\end{figure}
+\subsubsection{LabComm actions}
+
+(similar to ioctl())
+The encoder/writer and decoder/reader interfaces consist of a set of actions
+
+One example of this is that there is a a separate writer action for
+transmitting signatures, allowing the writer to treat a signature differently
+from encoded samples, e.g., to allow handshaking during channel setup.
+
+User actions allow the application or a higher level
+protocol to communicate with the underlying transport layer through the LabComm
+encoder.
+
+One example is reliable communication, which is controlled from the application
+but needs to be implemented for each transport at at the reader/writer level.
+(Or not, e.g., TCP)
+
+\section{LabComm is not...}
+
+\begin{itemize}
+\item a protocol for two-way connections
+\item intrinsically supporting reliable communication
+\item providing semantic service-descriptions
+\end{itemize}
+
+But
+
+\begin{itemize}
+\item it is suitable for the individual channels of a structured connection
+\item the user action mechanism allows using feature of different transport layers
+  through labcomm (i.e., it allows encapsulation of the transport layer)
+\item the names of samples can be chosen and mapped according to a suitable taxonomy or ontology
+\end{itemize}
+
+
+
+\section{Example and its encoding}
+
+With the following `example.lc` file:
+
+\lstinputlisting[basicstyle=\footnotesize\ttfamily]{../examples/wiki_example/example.lc}
+and this \verb+example_encoder.c+ file
+\lstinputlisting[basicstyle=\footnotesize\ttfamily,language=C]{../examples/wiki_example/example_encoder.c}
+
+\newpage
+
+Running \verb+./example_encoder one two+, will yield the following result in example.encoded:
+\begin{verbatim}
+00000000  01 0c 0b 4c 61 62 43 6f  6d 6d 32 30 31 34 02 30 |...LabComm2014.0|
+00000010  40 0b 6c 6f 67 5f 6d 65  73 73 61 67 65 22 11 02 |@.log_message"..|
+00000020  08 73 65 71 75 65 6e 63  65 23 04 6c 69 6e 65 10 |.sequence#.line.|
+00000030  01 00 11 02 04 6c 61 73  74 20 04 64 61 74 61 27 |.....last .data'|
+00000040  02 08 41 04 64 61 74 61  01 25 40 04 00 00 00 01 |..A.data.%@.....|
+00000050  00 40 09 00 00 00 02 01  01 03 6f 6e 65 40 0e 00 |.@........one@..|
+00000060  00 00 03 02 00 03 6f 6e  65 01 03 74 77 6f 41 04 |......one..twoA.|
+00000070  00 00 00 00 41 04 3f 80  00 00 41 04 40 00 00 00 |....A.?...A.@...|
+00000080
+\end{verbatim}
+
+i.e.,
+\begin{verbatim}
+<version> <length: 12> <string: <len: 11> <"LabComm2014">>
+<sample_decl> <length: 48 
+              <user_id: 0x40> 
+              <string: <len: 11> <"log_message">
+  <signature_length: 34>
+  <struct_decl:
+    <number_of_fields: 2>
+    <string: <len: 8> <"sequence"> <type: <integer_type>>
+    <string: <len: 4> <"line">> <type: <array_decl
+      <number_indices: 1> <variable_index>
+      <type: <struct_decl:
+        <number_of_fields:2>
+        <string: <len: 4> <"last">> <type: <boolean_type>>
+        <string: <len: 4> <"data">> <type: <string_type>>
+      >>
+   >
+>
+<sample_decl> <length: 8> <user_id: 0x41> <string: <len: 4> <"data">>
+  <signature_length: 1> <float_type>
+<sample_data> <user_id: 40> <length: 4>  <packed_sample_data>
+<sample_data> <user_id: 40> <length: 9>  <packed_sample_data>
+<sample_data> <user_id: 40> <length: 14>  <packed_sample_data>
+\end{verbatim}
+
+\section{Type and sample declarations}
+
+LabComm has two constructs for declaring sample types, \emph{sample
+declarations} and \emph{type declarations}. A sample declaration is used
+for the concrete sample types that may be transmitted, and is always
+encoded as a \emph{flattened} signature. That means that a sample
+containing user types, like
+
+\begin{verbatim}
+typedef struct {
+  int x;
+  int y;
+} point;
+
+sample struct {
+  point start;
+  point end;
+} line;
+\end{verbatim}
+
+is flattened to 
+
+\begin{verbatim}
+sample struct {
+  struct {
+    int x;
+    int y;
+  } start;
+  struct {
+    int x;
+    int y;
+  } end;
+} line;
+\end{verbatim}
+
+Sample declarations are always sent, and is the fundamental identity of
+a type in LabComm. 
+
+Type declarations is the hierarchical counterpart to sample
+declarations: here, fields of user types are encoded as a reference to
+the type instead of being flattened. As the flattened sample decl is the
+fundamental identity of a type, type declarations can be regarded as
+meta-data, describing the internal structure of a sample. They are
+intended to be read by higher-level software and human system developers
+and integrators.
+
+Sample declarations and type declarations have separate name-spaces in
+the sense that the numbers assigned to them by a labcomm encoder 
+come from two independent number series. To identify which
+\verb+TYPE_DECL+ a particular \verb+SAMPLE_DECL+ corresponds to, the
+\verb+TYPE_BINDING+ packet is used.
+
+For sample types that do not depend on any typedefs, no \verb+TYPE_DECL+
+is sent, and the \verb+TYPE_BINDING+ binds the sample id to the special
+value zero to indicate that the type definition is identical to the
+flattened signature.
+
+\subsection{Example}
+
+The labcomm declaration
+\lstinputlisting[basicstyle=\footnotesize\ttfamily]{../examples/user_types/test.lc}
+can be is encoded as
+\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
+TYPE_DECL 0x40 "coord" <int> val
+TYPE_DECL 0x41 "point" <struct> <2 fields> 
+                                "x" <type: 0x40> 
+                                "y" <type: 0x40>
+TYPE_DECL 0x42 "line" <struct> <2 fields> 
+                                "start" <type: 0x41> 
+                                "end" <type: 0x41>
+TYPE_DECL 0x43 "foo" <struct> <3 fields> 
+                                "a" <int> 
+                                "b" <int> 
+                                "c" <boolean>
+TYPE_DECL 0x44 "twolines" <struct> <3 fields> 
+                                "l1" <type:0x42> 
+                                "l2" <type:0x42> 
+                                "f" <type:0x43>
+
+SAMPLE_DECL 0x40 "twolines" <flat signature>
+
+TYPE_BINDING 0x40 0x44
+\end{lstlisting}
+
+Note that the id 0x40 is used both for the \verb+TYPE_DECL+ of
+\verb+coord+ and the \verb+SAMPLE_DECL+ of \verb+twoline+, and that the
+\verb+TYPE_BINDING+ binds the sample id \verb+0x40+ to the type id
+\verb+0x44+.
+
+\subsection{Run-time behaviour}
+
+When a sample type is registered on an encoder, a \verb+SAMPLE_DECL+
+(i.e., the flat signature) is always generated on that encoder channel.
+
+If the sample depends on user types (i.e., typedefs), \verb+TYPE_DECL+
+packets are encoded, recursively, for the dependent types and a 
+corresponding \verb+TYPE_BINDING+ is encoded.
+
+If a \verb+TYPE_DECL+ is included via multiple sample types, or
+dependency paths, an encoder may choose to only encode it once, but is
+not required to do so. However, if multiple \verb+TYPE_DECL+ packets are
+sent for the same \verb+typedef+, the encoder must use the same
+\verb+type_id+.
+
+\subsection{Decoding in-band type descriptions}
+
+In LabComm, the in-band data descriptions are equivalent to \footnote{in
+the sense that they contain all information needed to recreate} the data
+description source (i.e., the ``.lc-file'').
+%
+As the type declarations (a.k.a. \emph{signatures}) are written before
+sample data on every channel, they can be used to interpret data with
+an unknown (by the receiver) format.
+
+The libraries provide functionality to subscribe to (i.e., register a
+\emph{handler} for) sample and type declarations.
+
+On the low level, the handler receives an instance of the signature
+data structure corresponding to the received declaration.
+
+
+For higher-level processing, the Java library provides the
+\verb+ASTbuilder+ class, which builds an abstract syntax tree in
+the internal representation of the LabComm compiler.
+That enables the user to use the complete functionality of the
+LabComm compiler, e.g. code generation,  on declarations received in a
+LabComm stream. 
+
+
+In combination with on-the-fly compilation and class-loading (or
+linking) that makes it possible to dynamically create handlers for
+previously unknown data types. Thereby, it enables dynamic configuration
+of LabComm endpoints in static languages without the overhead of
+interpreting signatures (at the one-time cost of code generation and
+compilation).
+
+
+
+
+
+
+
+\section{Ideas/Discussion}:
+
+The labcomm language is more expressive than its target languages regarding data types.
+E.g., labcomm can declare both arrays of arrays and matries where Java only has arrays of arrays
+In the generated Java code, a labcomm matrix is implemented as an array of arrays.
+
+Another case (not yet included) is unsigned types, which Java doesn't have. If we include
+unsigned long in labcomm, that has a larger range of values than is possible to express using
+Java primitive types. However, it is unlikely that the entire range is actually used, so one
+way of supporting the common cases is to include run-time checks for overflow in the Java encoders
+and decoders.
+
+\section{Related work}
+  
+Two in-band self-descibing communication protocols are Apache
+Avro\cite{avro} and EDN, the extensible data notation developed for
+Clojure and Datomic\cite{EDN}.
+
+EDN encodes \emph{values} as UTF-8 strings. The documentation says
+``edn is a system for the conveyance of values. It is not a type system,
+and has no schemas.'' That said, it is \emph{extensible} in the sense
+that it has a special \emph{dispatch charachter}, \verb+#+, which can  
+be used to add a \emph{tag} to a value. A tag indicates a semantic
+interpretation of a value, and that allows the reader to support
+handlers for specific tags, enabling functionality similar to that of
+labcomm.
+
+\subsection{Apache Avro}
+
+Apache Avro is similar to LabComm in that it has a textual language
+for declaring data, a binary protocol for transmitting data, and code
+generation for several languages.
+
+Avro is a larger system, including RPC \emph{protocols}, support for
+using different \emph{codecs} for data compression, and \emph{schema
+resolution} to support handling schema evolution and transparent 
+interoperability between different versions of a schema.
+
+\subsubsection*{Data types} 
+
+In the table, the Avro type names are listed, and matched to the
+corresponding LabComm type:
+
+\begin{tabular}{|l|c|c|}
+\hline
+  Type &            Labcomm  &               Avro \\
+  \hline Primitive types \\ \hline
+
+int    &         4 bytes     &           varint  \\
+long   &         8 bytes     &           varint  \\
+float  &         4 bytes     &           4 bytes \\
+long   &         8 bytes     &           8 bytes \\
+string &         varint + utf8[]   &     varint + utf8[] \\ 
+bytes  &         varint + byte[]   &     varint + byte[]\\
+
+  \hline Complex types  \\ \hline
+
+struct/record &  concat of fields     &  concat of fields \\ 
+arrays        &  varIdx[] : elements  &  block[]          \\
+map           &    n/a                &  block[]          \\
+union         &   n/a                 & (varint idx) : value \\
+fixed         &   byte[n]             &  the number of bytes declared in
+the schema\\
+\hline
+\end{tabular}
+
+  where 
+
+\begin{verbatim}  
+  block ::= (varint count) : elem[count]      [*1]
+  count == 0 --> no more blocks
+
+
+[*1] for arrays, count == 0 --> end of array
+     if count < 0, there are |count| elements
+     preceded by a varint block_size to allow
+     fast skipping
+\end{verbatim}  
+
+In maps, keys are strings, and values  according to the schema.
+
+In unions, the index indicates the kind of value and the
+value is encoded according to the schema.
+
+Note that the Avro data type \verb+bytes+ corresponds to the
+LabComm declaration \verb+byte[_]+, i.e. a varaible length byte array.
+
+\subsubsection*{the wire protocol}
+
+\begin{tabular}{|l|c|c|}
+  \hline
+  What & LabComm & Avro \\ \hline
+  Data description & Binary signature & JSON schema \\
+  Signature sent only once pre connection& posible & possible \\
+  Signature sent with each sample & possible & possible \\
+  Data encoding & binary & binary \\
+  \hline
+\end{tabular}
+
+
+Both avro and labcomm use varints when encoding data, similar in that
+they both send a sequence of bytes containing 7 bit chunks (with the
+eight bit signalling more chunks to come), but they differ in range,
+endianness and signedness.
+
+\begin{verbatim}
+                LabComm                 Avro
+                unsigned 32 bit         signed zig-zag coding
+                most significant chunk  least significant chunk
+                first                   first
+
+                0   ->  00               0  ->  00
+                1   ->  01              -1  ->  01
+                2   ->  02               1  ->  02
+                    ...                 -2  ->  03
+                                         2  ->  04
+                                            ...
+                127 ->  7f              -64 ->  7f
+                128 ->  81 00            64 ->  80 01
+                129 ->  81 01           -65 ->  81 01
+                130 ->  81 02            65 ->  82 01
+                    ...                     ...   
+\end{verbatim}
+
+\paragraph{Avro Object Container Files} can be seen as a counterpart
+  to a LabComm channel: 
+Avro includes a simple object container file format. A file has a
+schema, and all objects stored in the file must be written according to
+that schema, using binary encoding. Objects are stored in blocks that
+may be compressed. Syncronization markers are used between blocks to
+permit efficient splitting of files, and enable detection of 
+corrupt blocks.
+
+
+The major difference is the sync markers that LabComm does not have, as
+LabComm assumes that, while the transport may drop packets, there will
+be no bit errors in a received packet. If data integrity is required,
+that is delegated to the reader and writer for the particular transport.
+
+\subsubsection{Representation of hierarchical data types}
+
+For a type that contains fields of other user types, like
+\begin{verbatim}
+typedef struct {
+  int x;
+  int y;
+} Point;
+
+sample struct {
+  Point start;
+  Point end;
+} line;
+\end{verbatim}
+
+LabComm encodes both the flattened signature and the 
+typedef which allows the hierarchical type structure to be
+reconstructed.
+%
+The avro encoding is quite similar. 
+The \verb+Line+ example, corresponds to the two schemas
+\begin{verbatim}
+{"namespace": "example.avro",
+ "type": "record",
+ "name": "Point",
+ "fields": [
+     {"name": "x", "type": "int"},
+     {"name": "y", "type": "int"}
+ ]
+}
+\end{verbatim}
+and
+\begin{verbatim}
+{"namespace": "example.avro",
+ "type": "record",
+ "name": "Line",
+ "fields": [
+     {"name": "start", "type": "Point"},
+     {"name": "end",   "type": "Point"}
+ ]
+}
+\end{verbatim}
+which is encoded in an Object Container File as
+\begin{verbatim}
+{"type":"record",
+ "name":"Line",
+ "namespace":"example.avro",
+ "fields":[{"name":"start",
+            "type":{"type":"record",
+                    "name":"Point",
+                    "fields":[{"name":"x","type":"int"},
+                              {"name":"y","type":"int"}]}},
+           {"name":"end",
+            "type":"Point"}
+ ]
+}
+\end{verbatim}
+\subsubsection{Fetures not in LabComm} 
+
+Avro has a set of features with no counterpart in LabComm. They include
+
+\paragraph{Codecs.}
+
+Avro has multiple codecs (for compression of the data):
+
+    \begin{verbatim}
+    Required Codecs:
+    - null : The "null" codec simply passes through data uncompressed.
+
+    - deflate : The "deflate" codec writes the data block using the deflate
+                algorithm as specified in RFC 1951, and typically implemented using the
+                zlib library. Note that this format (unlike the "zlib format" in RFC
+                1950) does not have a checksum.
+
+    Optional Codecs
+
+    - snappy:   The "snappy" codec uses Google's Snappy compression library. Each
+                compressed block is followed by the 4-byte, big-endian CRC32 checksum of
+                the uncompressed data in the block.
+
+    \end{verbatim}
+
+  \paragraph{Schema Resolution.} The main objective of LabComm is to
+    ensure correct operation at run-time. Therefore, a LabComm decoder
+    requires the signatures for each handled sample to match exactly.
+
+    Avro, on the other hand, supports the evolution of schemas and
+    provides support for reading data where the ordering of fields
+    differ (but names and types are the same), numerical types differ
+    but can be
+    \emph{promoted} (E.g., \verb+int+ can be promoted to \verb+long+,
+    \verb+float+, or \verb+double+.), and record fields have been added
+    or removed (but are nullable or have default values).
+
+    \paragraph{Schema fingerprints.} Avro defines a \emph{Parsing
+    Canonical Form} to define when two JSON schemas are ``the same''.
+    To reduce the overhead when, e.g., tagging data with the schema
+    there is support for creating a \emph{fingerprint} using 64/128/256
+    bit hashing, in combination with a centralized repository for
+    fingerprint/schema pairs.
+
+\bibliography{refs}{}
+\bibliographystyle{plain}
+
+\appendix
+\newpage
+
+\section{The LabComm language}
+\label{sec:LanguageGrammar}
+
+\subsection{Abstract syntax}
+\begin{verbatim}
+Specification ::= Decl*;
+
+abstract Decl ::= DataType <Name:String>;
+TypeDecl : Decl;
+SampleDecl : Decl;
+
+Field ::= DataType <Name:String>;
+
+abstract DataType;
+VoidType          : DataType;
+SampleRefType     : DataType;
+PrimType          : DataType ::= <Name:String> <Token:int>;
+UserType          : DataType ::= <Name:String>;
+StructType        : DataType ::= Field*;
+ParseArrayType    : DataType ::= DataType Dim*;
+abstract ArrayType : DataType ::= DataType Exp*;
+VariableArrayType : ArrayType;
+FixedArrayType    : ArrayType;
+
+Dim ::= Exp*;
+
+abstract Exp;
+IntegerLiteral : Exp ::= <Value:String>;
+VariableSize : Exp;
+\end{verbatim}
+
+\newpage
+\section{The LabComm protocol}
+\label{sec:ProtocolGrammar}
+
+Each LabComm2014 packet has the layout
+\begin{verbatim}
+<id> <length> <data...>
+\end{verbatim}
+where \verb+length+ is the number of bytes of the \verb+data+ part
+(i.e., excluding the \verb+id+ and \verb+length+ fields), and 
+the \verb+id+ gives the layout of the \verb+data+ part as defined 
+in \ref{sec:ConcreteGrammar}
+\subsection{Data encoding}
+LabComm primitive types are encoded as fixed width values, sent in
+network order.  Type fields, user IDs, number of indices and lengths are
+sent in a variable length (\emph{varint}) form.  A varint integer value
+is sent as a sequence of bytes where the lower seven bits contain a
+chunk of the actual number and the high bit indicates if more chunks
+follow. The sequence of chunks are sent with the least significant chunk
+first.  
+
+The built-in data types are encoded as follows:
+\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
+||Type       ||Encoding/Size                                      ||
+||-----------||---------------------------------------------------||
+||boolean    ||  8 bits                                           ||
+||byte       ||  8 bits                                           ||
+||short      || 16 bits                                           ||
+||integer    || 32 bits                                           ||
+||long       || 64 bits                                           ||
+||float      || 32 bits                                           ||
+||double     || 64 bits                                           ||
+||sample_ref || 32 bits                                           ||
+||string     || length (varint), followed by UTF8 encoded string  ||
+||array      || each variable index (varint),                     ||
+||           || followed by encoded elements                      ||
+||struct     || concatenation of encoding of each element         ||
+||           || in declaration order                              ||
+\end{lstlisting}
+\pagebreak
+\subsection{Protocol grammar}
+\label{sec:ConcreteGrammar}
+\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
+<packet>          := <id> <length> ( <version>      | 
+                                     <type_decl>    | 
+                                     <sample_decl>  |
+                                     <sample_ref>   |
+                                     <type_binding> |
+                                     <sample_data> )
+<version>         := <string>
+<sample_decl>     := <sample_id> <string> <signature>
+<sample_ref>      := <sample_id> <string> <signature>
+<type_decl>       := <type_id> <string> <signature>
+<type_binding>    := <sample_id> <type_id>
+<user_id>         := 0x40..0xffffffff  
+<sample_id> : <user_id>
+<type_id>   : <user_id>
+<string>          := <string_length> <char>*
+<string_length>   := 0x00..0xffffffff  
+<char>            := any UTF-8 char
+<signature>       := <length> <type>
+<type>            := <length> ( <basic_type>  | 
+                                <array_decl>  | 
+                                <struct_decl> | 
+                                <type_id> )
+<basic_type>      := ( <void_type> | <boolean_type> | <byte_type> | <short_type> |
+                     <integer_type> | <long_type> | <float_type> |
+                     <double_type> | <string_type> | <sample_ref_type>)
+
+<void_type>       := <struct_decl> 0 //void is encoded as empty struct
+<boolean_type>    := 0x20 
+<byte_type>       := 0x21 
+<short_type>      := 0x22 
+<integer_type>    := 0x23 
+<long_type>       := 0x24 
+<float_type>      := 0x25 
+<double_type>     := 0x26 
+<string_type>     := 0x27 
+<sample_ref_type> := 0x28 
+<array_decl>      := 0x10  <nbr_of_indices> <indices> <type>
+<nbr_of_indices>  := 0x00..0xffffffff  
+<indices>         := ( <variable_index> | <fixed_index> )*
+<variable_index>  := 0x00  
+<fixed_index>     := 0x01..0xffffffff  
+<struct_decl>     := 0x11  <nbr_of_fields> <field>*
+<nbr_of_fields>   := 0x00..0xffffffff  
+<field>           := <string> <type>
+<sample_data>     := packed sample data sent in network order, with
+                  primitive type elements encoded according to
+                  the sizes above
+\end{lstlisting}
+where the \verb+<id>+ in \verb+<packet>+ signals the type of payload,
+and may be either a \verb+<sample_id>+ or a system packet id.
+The labcomm sytem packet ids are:
+\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
+version:      0x01 
+sample_decl:  0x02 
+sample_ref:   0x03 
+type_decl:    0x04 
+type_binding: 0x05          
+\end{lstlisting}
+Note that since the signature transmitted in a \verb+<sample_def>+ is
+flattened, the \verb+<type>+ transmitted in a \verb+<sample_def>+ may
+not contain any \verb+<type_id>+ fields.
+\end{document}
--- a/examples/Makefile
+++ b/examples/Makefile
+SUBDIRS=duck_typing dynamic jgrafchart robot simple tcp \
+	twoway user_types wiki_example
+
+.PHONY: all
+
+all:
+	echo More to be done...
+	$(MAKE) -C twoway all
+
+UNAME_S=$(shell uname -s)
+
+.PHONY: test
+test:
+	echo More to be done...
+ifeq ($(UNAME_S),Darwin)
+	$(MAKE) -C user_types all 
+else	
+	cd simple ; sh compile.sh && sh run.sh
+	$(MAKE) -C  wiki_example test
+	$(MAKE) -C user_types all 
+endif	
+	$(MAKE) -C duck_typing test
+	$(MAKE) -C twoway test 
+
+.PHONY: clean
+clean: $(SUBDIRS:%=clean-%)
+clean-%:
+	$(MAKE) -C $* clean 
+
+.PHONY: distclean
+distclean: $(SUBDIRS:%=distclean-%)
+distclean-%:
+	$(MAKE) -C $* distclean 
--- a/examples/duck_typing/.gitignore
+++ b/examples/duck_typing/.gitignore
+gen
--- a/examples/duck_typing/Makefile
+++ b/examples/duck_typing/Makefile
+LABCOMM_JAR=../../compiler/labcomm2014_compiler.jar
+LABCOMM=java -jar $(LABCOMM_JAR) 
+
+all: gen/animal.py
+
+.PHONY: test
+test: gen/animal.py
+	PYTHONPATH=../../lib/python:gen ./duck_typing.py
+
+
+gen/.dir:
+	mkdir -p $@
+
+gen/%.py: %.lc | gen/.dir
+	$(LABCOMM) --python=$@ $<
+
+.PHONY: clean distclean
+clean distclean:
+	rm -rf gen
+
--- a/examples/duck_typing/animal.lc
+++ b/examples/duck_typing/animal.lc
+typedef struct {
+  string says;
+} animal;
+
+sample animal cow;
+sample animal dog;
+sample struct {
+  string says;
+} duck;
--- a/examples/duck_typing/duck_typing.py
+++ b/examples/duck_typing/duck_typing.py
+#!/usr/bin/python
+
+import labcomm2014
+import animal
+import StringIO
+
+class Animal:
+    def __init__(self):
+        self.says = None
+
+if __name__ == '__main__':
+    buf = StringIO.StringIO()
+    encoder = labcomm2014.Encoder(labcomm2014.StreamWriter(buf))
+    encoder.add_decl(animal.cow.signature)
+    encoder.add_decl(animal.dog.signature)
+    encoder.add_decl(animal.duck.signature)
+    theAnimal = Animal()
+    theAnimal.says = 'Moo'
+    encoder.encode(theAnimal, animal.cow.signature)
+    theAnimal.says = 'Bow-Wow'
+    encoder.encode(theAnimal, animal.dog.signature)
+    theAnimal.says = 'Quack'
+    encoder.encode(theAnimal, animal.duck.signature)
+    buf.seek(0)
+    decoder = labcomm2014.Decoder(labcomm2014.StreamReader(buf))
+    try:
+        while True:
+            value,decl = decoder.decode()
+            if value:
+                print decl.name, 'says', value
+                pass
+            pass
+        pass
+    except EOFError:
+        pass
+    pass
--- a/examples/dynamic/Makefile
+++ b/examples/dynamic/Makefile
+all:
+	sh dynamic.sh
+	sh dynamic_type.sh
+	sh test.sh
+	sh test_type.sh
+
+clean:
+	-rm test/*.class
+	-rm encoded_data
+	-rm dynamic_out
+
+distclean: clean
--- a/examples/dynamic/README
+++ b/examples/dynamic/README
+This directory contains a small example of how to generate and compile a
+labcomm endpoint on the fly.
+
+
+the script static.sh builds and runs a very simple static labcomm demo
+
+
+the script dynamic.sh builds and runs an example where labcomm is generated and
+woven togerther with user-defined handler code. 
+
+
+the test.sh script builds and runs the TestLabCommGen, which illustrates the
+on-the-fly compilation to RAM, reading the labcomm declarations and handlers
+from file
+
+
+The handlers declaration (in handlers.txt, handlers2.txt) is experimental, and has the
+following format:
+
+<sample name>:handler(<data type> <variable name>) { <handler method code> }###
+
+where the end marker (}###) is a kludge to avoid having to parse the method
+body while still allowing it to contain blocks. Thus, having the sequence
+"}###" in the method body breaks this. Caveat hacker!
+
+An example handlers declaration:
+
+foo:handler(foo value) {
+	test.HandlerContext ctx = (test.HandlerContext)context;
+	System.out.println("foo handler from handlers2.txt");
+	System.out.println("using context "+ctx.str);
+        ctx.x = value.x + 1000;
+	ctx.y = value.y + 1000;
+	ctx.z = value.z + 1000;
+	System.out.println(value.x);
+	System.out.println(value.y);
+	System.out.println(value.z);
+	for(int i=0; i<value.x; i++){
+		System.out.print("."+(value.x-i));
+	}
+	System.out.println();
+}###
+bar:handler(int value) {
+	System.out.println("bar:"+value);
+	test.HandlerContext ctx = (test.HandlerContext)context;
+	ctx.bar = value + 1000;
+}###
+
+Note that parameters differ: the value for foo is an object but for bar it is a
+primitive type (int) value.
+
+
+The context in the example is a kludge or placeholder for tying the
+user-provided handler methods to the environment they are executed in, and
+allowing state to be kept between invokations. The class containing the handler
+methods have an attribute, Object context, and its initialization is currently
+hard coded. The permanent solution is TBD.
+
+TODO: Currently, for UserType samples (i.e., if typedef is used in the .lc file)
+the type of the value parameter to encode and handler is of the UserType and not
+the sample class. Should this be kept, or changed so that the sample extends the type.
+An argument against is that for primitive types, the type of the argument is 
+the primitive type, and not the class, so that has to be taken care of anyhow.
--- a/examples/dynamic/dynamic.sh
+++ b/examples/dynamic/dynamic.sh
+#dummy script to test the on-the-fly compilation
+
+java -jar ../../compiler/labcomm2014_compiler.jar --java=gen --javapackage=gen simple.lc
+
+javac -cp .:gen:../../lib/java/labcomm2014.jar gen/*.java
+
+# compile static encoder and decoder 
+javac -cp .:gen:../../lib/java/labcomm2014.jar test/StaticEncoder.java
+javac -cp .:gen:../../lib/java/labcomm2014.jar test/StaticDecoder.java 
+
+# compile dynamic part 
+javac -cp .:../../compiler/labcomm2014_compiler.jar:../../lib/java/labcomm2014.jar:../../lib/tools/beaver.jar:../../lib/tools/beaver-rt.jar:../../lib/tools/jastadd2.jar:../../lib/tools/JFlex.jar:../../lib/tools/proj.jar  test/DynamicPart.java 
+
+javac test/HandlerContext.java
+
+# run static encoder 
+java -cp .:gen:../../lib/java//labcomm2014.jar test.StaticEncoder encoded_data
+
+# run dynamic part 
+java -cp .:../../compiler/labcomm2014_compiler.jar:../../lib/java/labcomm2014.jar:../../lib/tools/beaver.jar:../../lib/tools/beaver-rt.jar:../../lib/tools/jastadd2.jar:../../lib/tools/JFlex.jar:../../lib/tools/proj.jar  test.DynamicPart simple.lc handlers2.txt encoded_data dynamic_out
+
+
+# run static decoder 
+java -cp .:gen:../../lib/java//labcomm2014.jar test.StaticDecoder dynamic_out
--- a/examples/dynamic/dynamic_type.sh
+++ b/examples/dynamic/dynamic_type.sh
+#dummy script to test the on-the-fly compilation
+
+java -jar ../../compiler/labcomm2014_compiler.jar --java=gen --javapackage=gen simple.lc
+
+javac -cp .:gen:../../lib/java/labcomm2014.jar gen/*.java
+
+# compile static encoder and decoder 
+javac -cp .:gen:../../lib/java/labcomm2014.jar test/StaticEncoder.java
+javac -cp .:gen:../../lib/java/labcomm2014.jar test/StaticDecoder.java 
+
+# compile dynamic part 
+javac -cp .:../../compiler/labcomm2014_compiler.jar:../../lib/java/labcomm2014.jar:../../lib/tools/beaver.jar:../../lib/tools/beaver-rt.jar:../../lib/tools/jastadd2.jar:../../lib/tools/JFlex.jar:../../lib/tools/proj.jar  test/DynamicPart.java 
+
+javac test/HandlerContext.java
+
+# run static encoder 
+java -cp .:gen:../../lib/java//labcomm2014.jar test.StaticEncoder encoded_data
+
+# run dynamic part 
+java -cp .:../../compiler/labcomm2014_compiler.jar:../../lib/java/labcomm2014.jar:../../lib/tools/beaver.jar:../../lib/tools/beaver-rt.jar:../../lib/tools/jastadd2.jar:../../lib/tools/JFlex.jar:../../lib/tools/proj.jar  test.DynamicPart simple_type.lc handlers_type.txt encoded_data dynamic_out
+
+
+# run static decoder 
+java -cp .:gen:../../lib/java//labcomm2014.jar test.StaticDecoder dynamic_out
--- a/examples/dynamic/handlers.txt
+++ b/examples/dynamic/handlers.txt
+foo:handler(foo value) {
+	System.out.println("foo handler from handlers.txt");
+	System.out.println(value.x);
+	System.out.println(value.y);
+	System.out.println(value.z);
+	for(int i=0; i<value.x; i++){
+		System.out.print("."+(value.x-i));
+	}
+	System.out.println();
+}###
+bar:handler(int value) {
+	System.out.println("bar:"+value);
+}###
--- a/examples/dynamic/handlers2.txt
+++ b/examples/dynamic/handlers2.txt
+foo:handler(foo value) {
+	test.HandlerContext ctx = (test.HandlerContext)context;
+	System.out.println("foo handler from handlers2.txt");
+	System.out.println("using context "+ctx.str);
+        ctx.x = value.x + 1000;
+	ctx.y = value.y + 1000;
+	ctx.z = value.z + 1000;
+	System.out.println(value.x);
+	System.out.println(value.y);
+	System.out.println(value.z);
+	for(int i=0; i<value.x; i++){
+		System.out.print("."+(value.x-i));
+	}
+	System.out.println();
+}###
+bar:handler(int value) {
+	System.out.println("bar:"+value);
+	test.HandlerContext ctx = (test.HandlerContext)context;
+	ctx.bar = value + 1000;
+}###
--- a/examples/dynamic/handlers_type.txt
+++ b/examples/dynamic/handlers_type.txt
+foo:handler(foo_t value) {
+	test.HandlerContext ctx = (test.HandlerContext)context;
+	System.out.println("foo handler from handlers_type.txt");
+	System.out.println("using context "+ctx.str);
+        ctx.x = value.x + 1000;
+	ctx.y = value.y + 1000;
+	ctx.z = value.z + 1000;
+	System.out.println(value.x);
+	System.out.println(value.y);
+	System.out.println(value.z);
+	for(int i=0; i<value.x; i++){
+		System.out.print("."+(value.x-i));
+	}
+	System.out.println();
+}###
+bar:handler(int value) {
+	System.out.println("bar:"+value);
+	test.HandlerContext ctx = (test.HandlerContext)context;
+	ctx.bar = value + 1000;
+}###
--- a/examples/dynamic/simple.lc
+++ b/examples/dynamic/simple.lc
+sample struct {
+  int x;
+  int y;
+  int z;
+} foo;
+
+sample int bar;
+
--- a/examples/dynamic/simple_type.lc
+++ b/examples/dynamic/simple_type.lc
+typedef struct {
+  int x;
+  int y;
+  int z;
+} foo_t;
+
+sample foo_t foo;
+
+sample int bar;
+
No results found