-
Notifications
You must be signed in to change notification settings - Fork 165
/
LBNF-report.tex
1091 lines (893 loc) · 38.4 KB
/
LBNF-report.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass[10pt]{article}
\usepackage{setspace}
\setlength{\leftmargin}{30mm}
\setlength{\oddsidemargin}{0mm}
%\setlength{\evensidemargin}{0mm}
\setlength{\evensidemargin}{-2mm}
\setlength{\topmargin}{-16mm}
\setlength{\textheight}{240mm}
\setlength{\textwidth}{158mm}
\newcommand{\bequ}{\begin{quote}}
\newcommand{\enqu}{\end{quote}}
\newcommand{\begit}{\begin{itemize}}
\newcommand{\enit}{\end{itemize}}
\newcommand{\LBNF}{LBNF}
% commands from LBNF documentation
\newcommand{\emptyP}{\mbox{$\epsilon$}}
\newcommand{\terminal}[1]{\mbox{{\texttt {#1}}}}
\newcommand{\nonterminal}[1]{\mbox{$\langle \mbox{{\sl #1 }} \! \rangle$}}
\newcommand{\arrow}{\mbox{::=}}
\newcommand{\delimit}{\mbox{$|$}}
\newcommand{\reserved}[1]{\mbox{{\texttt {#1}}}}
\newcommand{\literal}[1]{\mbox{{\texttt {#1}}}}
\newcommand{\symb}[1]{\mbox{{\texttt {#1}}}}
\title{{\bf The Labelled BNF Grammar Formalism}}
\author{Markus Forsberg, Aarne Ranta \\
Department of Computing Science \\
Chalmers University of Technology and the University of Gothenburg \\
SE-412 96 Gothenburg, Sweden\\
\{markus, aarne\}@cs.chalmers.se}
\date{For BNF Converter Version 2.2, February 11, 2005}
\begin{document}
\maketitle
%\begin{doublespace}
\section{Introduction}
This document defines
the grammar formalism \textit{Labelled BNF} (LBNF),
which is used in the compiler construction tool
\textit{BNF Converter}.
Given a grammar written in LBNF,
the BNF Converter produces a complete compiler
front end (up to, but excluding, type checking),
i.e.\ a lexer, a parser, and an abstract
syntax definition. Moreover, it produces a pretty-printer
and a language specification in \LaTeX, as well as
a template file for the compiler back end.
Since LBNF is purely declarative,
these files can be generated in any programming language that supports
appropriate compiler front-end tools. As of Version 2.0, code can be generated
in Haskell, Java, C++, and C.
This document describes the LBNF formalism independently of code generation,
and is aimed to serve as a manual for grammar writers.
\section{A first example of LBNF grammar}
As the first example of LBNF,
consider a triple of rules defining addition expressions with ``1'':
\begin{verbatim}
EPlus. Exp ::= Exp "+" Num ;
ENum. Exp ::= Num ;
NOne. Num ::= "1" ;
\end{verbatim}
Apart from the \textit{labels}, {\tt EPlus}, {\tt ENum}, and {\tt NOne},
the rules are
ordinary BNF rules, with terminal symbols enclosed in
double quotes and nonterminals written without quotes.
The labels serve as \textit{constructors} for
syntax trees.
From an LBNF grammar, the BNF Converter extracts
an \textit{abstract syntax}\ and
a \textit{concrete syntax}.
In Haskell, for instance,
the abstract syntax is implemented as a system of
datatype definitions
\begin{verbatim}
data Exp = EPlus Exp Exp | ENum Num
data Num = NOne
\end{verbatim}
For other languages---C, C++, and Java---an equivalent
representation is given, following
the methodology defined in Appel's books series
{\em Modern compiler implementation in ML/Java/C}\footnote{
Cambridge University Press, 1998.}.
The concrete syntax is implemented by the
lexer, parser and pretty-printer algorithms,
which are defined in other generated program modules.
\section{LBNF in a nutshell}
\subsection{Basic LBNF}
Briefly, an LBNF grammar is a BNF grammar where every rule is given a label.
The label is used for constructing a syntax tree whose subtrees are
given by the nonterminals of the rule, in the same order.
More formally, an LBNF grammar consists of a collection of rules,
which have the following form (expressed by a regular expression;
Appendix gives a complete BNF definition of the notation):
\bequ
Ident "{\tt .}" Ident "{\tt ::=}" (Ident $\mid$ String)* "{\tt;}" ;
\enqu
The first identifier is the \textit{rule label}, followed by the
\textit{value category}. On the right-hand side of the production
arrow ({\tt ::=}) is the list of production items. An item is either
a quoted string (\textit{terminal}) or a category symbol (\textit{non-terminal}).
A rule whose value category is $C$ is also called a \textit{production}\ for $C$.
Identifiers, that is, rule names and category symbols, can be
chosen {\em ad libitum}, with the restrictions imposed by the target language. To
satisfy Haskell, and C and Java as well, the following rule is imposed
\bequ
An identifier is a nonempty sequence of letters, starting with a
capital letter.
\enqu
\subsection{Additional features}
Basic LBNF as defined above
is clearly sufficient for defining any context-free language. However,
it is not always convenient to define a programming language purely with BNF rules.
Therefore, some additional features are added to LBNF: abstract syntax conventions,
lexer rules, pragmas, and macros. These features are treated in the
subsequent sections.
Section~\ref{conventions} explains {\em abstract syntax conventions}.
Creating an abstract syntax by adding a node type for every BNF rule
may sometimes become too detailed,
or cluttered with extra structures.
To remedy this, we have identified the most common problem cases,
and added to LBNF
some extra conventions to handle them.
Section~\ref{lexer} explains {\em lexer rules}.
Some aspects of a language belong to its lexical
structure rather than its grammar, and are more naturally described by
regular expressions than by BNF rules. We have therefore
added to LBNF two rule formats to define the lexical structure:
{\em tokens} and {\em comments}.
Section~\ref{pragmas} explains {\em pragmas}.
Pragmas are rules instructing the BNFC grammar compiler to treat
some rules of the grammar in certain special ways: to reduce the number
of {\em entrypoints} or to treat some syntactic forms as {\em internal} only.
Section~\ref{macros} explains {\em macros}.
Macros are syntactic sugar for potentially
large groups of rules and help to write grammars
concisely. This is both for the writer's and the reader's convenience;
among other things, macros naturally force certain groups of rules to
go together, which could otherwise be spread arbitrarily in the grammar.
Section~\ref{layout} explains {\em layout syntax},
which is a non-context-free feature present in some programming
languages. LBNF has a set of rule formats for defining a
limited form of layout syntax. It works as a preprocessor that
translates layout syntax into explicit structure markers.
\section{Abstract syntax conventions}
\label{conventions}
\subsection{Predefined basic types}
The first convention are predefined basic types.
Basic types, such as integer and character, can of course be
defined in a labelled BNF, for example:
\begin{verbatim}
Char_a. Char ::= "a" ;
Char_b. Char ::= "b" ;
\end{verbatim}
This is, however, cumbersome and inefficient. Instead, we have decided to
extend our formalism with predefined basic types, and represent their
grammar as a part of lexical structure. These types are the following,
as defined by LBNF regular expressions (see \ref{reg} for
the regular expression syntax):
\bequ
{\tt Integer} of integers, defined
\verb6digit+6
{\tt Double} of floating point numbers, defined
\verb6digit+ '.' digit+ ('e' '-'? digit+)?6
{\tt Char} of characters (in single quotes), defined
\verb6'\'' ((char - ["'\\"]) | ('\\' ["'\\nt"])) '\''6
{\tt String} of strings (in double quotes), defined
\verb6'"' ((char - ["\"\\"]) | ('\\' ["\"\\nt"]))* '"'6
{\tt Ident} of identifiers, defined
\verb6letter (letter | digit | '_' | '\'')*6
\enqu
In the abstract syntax, these types are represented as corresponding
types of each language, except {\tt Ident}, for which no such type
exists. It is treated by a {\tt newtype} in Haskell,
\begin{verbatim}
newtype Ident = Ident String
\end{verbatim}
as {\tt String} in Java, and as a {\tt typedef} to
{\tt char*} in C and C++.
As the names of the types suggest,
the lexer produces high-precision variants,
for integers and floats.
Authors of applications can
truncate these numbers later if they want to have
low precision instead.
Predefined categories may not have explicit productions in the
grammar, since this would violate their predefined meanings.
\subsection{Semantic dummies}
Sometimes the concrete syntax of a language includes rules that make
no semantic difference. An example is
a BNF rule making the parser accept extra semicolons after statements:
\begin{verbatim}
Stm ::= Stm ";" ;
\end{verbatim}
As this rule is semantically dummy, we do not want to represent it by a
constructors in the abstract syntax.
Instead, we introduce the following convention:
\bequ
A rule label can be an underscore {\tt \_}, which
does not add anything to the syntax tree.
\enqu
Thus we can write the following rule in LBNF:
\begin{verbatim}
_ . Stm ::= Stm ";" ;
\end{verbatim}
Underscores are of course only meaningful as replacements of
one-argument constructors where the value type is the same as the
argument type.
Semantic dummies leave no trace in the pretty-printer.
Thus, for instance, the pretty-printer ``normalizes away''
extra semicolons.
\subsection{Precedence levels}
A common idiom in (ordinary) BNF is to use indexed variants of categories
to express precedence levels:
\begin{verbatim}
Exp3 ::= Integer ;
Exp2 ::= Exp2 "*" Exp3 ;
Exp ::= Exp "+" Exp2 ;
Exp ::= Exp2 ;
Exp2 ::= Exp3 ;
Exp3 ::= "(" Exp ")" ;
\end{verbatim}
The precedence level regulates the order of parsing, including
associativity. Parentheses lift an expression of any level
to the highest level.
A straightforward labelling of the above rules creates a grammar that does
have the desired recognition behavior, as the abstract syntax is cluttered
with type distinctions (between {\tt Exp}, {\tt Exp2}, and {\tt Exp3})
and constructors (from the last three rules) with no semantic content.
The BNF Converter solution is to distinguish among
category symbols those that are just indexed variants of each other:
\bequ
A category symbol can end with an integer index
(i.e.\ a sequence of digits), and is then treated as a type
synonym of the corresponding non-indexed symbol.
\enqu
Thus {\tt Exp2} and {\tt Exp3} are indexed variants of {\tt Exp}.
The plain {\tt Exp} is treated in the same way as {\tt Exp0}.
Transitions between indexed variants are
semantically dummy, and we do not want to represent them by
constructors in the abstract syntax. To do this, we extend the use
of underscores to indexed variants.
The example grammar above can now be labelled as follows:
\begin{verbatim}
EInt. Exp3 ::= Integer ;
ETimes. Exp2 ::= Exp2 "*" Exp3 ;
EPlus. Exp ::= Exp "+" Exp2 ;
_. Exp ::= Exp2 ;
_. Exp2 ::= Exp3 ;
_. Exp3 ::= "(" Exp ")" ;
\end{verbatim}
In Haskell, for instance, the datatype of expressions becomes simply
\begin{verbatim}
data Exp = EInt Integer | ETimes Exp Exp | EPlus Exp Exp
\end{verbatim}
and the syntax tree for {\tt 2*(3+1)} is
\begin{verbatim}
ETimes (EInt 2) (EPlus (EInt 3) (EInt 1))
\end{verbatim}
Indexed categories {\em can} be used for other purposes than
precedence, since the only thing we can formally check is the
type skeleton (see the section \ref{typecheck}).
The parser does not need to know
that the indices mean precedence, but only that indexed
variants have values of the same type.
The pretty-printer, however, assumes that
indexed categories are used for precedence, and may produce
strange results if they are used in some other way.
{\bf Hint}. See Section~\ref{coercions} for a concise way of defining
dummy coercions rules.
\subsection{Polymorphic lists}
It is easy to define monomorphic list types in LBNF:
\begin{verbatim}
NilDef. ListDef ::= ;
ConsDef. ListDef ::= Def ";" ListDef ;
\end{verbatim}
However, compiler writers in languages like
Haskell may want to use predefined
polymorphic lists, because of the language support for these constructs.
LBNF permits the use of Haskell's list constructors
as labels, and list brackets in category names:
\begin{verbatim}
[]. [Def] ::= ;
(:). [Def] ::= Def ";" [Def] ;
\end{verbatim}
As the general rule, we have
\bequ
{\tt[}$C${\tt ]}, the category of lists of type $C$,
{\tt []} and {\tt (:)}, the Nil and Cons rule labels,
{\tt (:[])}, the rule label for one-element lists.
\enqu
The third rule label is used to place an at-least-one restriction,
but also to permit special treatment of one-element lists
in the concrete syntax.
In the \LaTeX\ document (for stylistic reasons) and in the Happy file (for
syntactic reasons), the category name {\tt [X]} is replaced by {\tt ListX}.
In order for this not to cause clashes, {\tt ListX}
may not be at the same time used explicitly in the grammar.
The list category constructor can be iterated: {\tt [[X]]}, {\tt [[[X]]]}, etc
behave in the expected way.
The list notation can also be seen as a variant of the Kleene star and plus, and
hence as an ingredient from Extended BNF.
In other languages than Haskell, monomorphic variants of lists are generated
automatically.
{\bf Hint}. See Section~\ref{terminator} for concise ways of defining
lists by just giving their terminators or separators.
\subsection{The type-correctness of LBNF rules}
\label{typecheck}
It is customary in parser generators to delegate the checking of certain
errors to the target language. For instance, a Happy source file that
Happy processes without complaints can still produce a Haskell file
that is rejected by Haskell. In the same way, the BNF converter
delegates some checking to the generated language (for instance,
the parser conflict check). However, since it is always
the easiest for the programmer to understand error messages
related to the source, the BNF Converter performs some checks,
which are mostly connected with the sanity of the abstract syntax.
The type checker uses a notion of the
\textit{category skeleton} of a rule, which is a pair
\[
(C, A\ldots B)
\]
where $C$ is the unindexed left-hand-side non-terminal and $A\ldots B$
is the sequence of unindexed right-hand-side non-terminals of the rule.
In other words, the category skeleton of a rule expresses the abstract-syntax
type of the semantic action associated to that rule.
We also need the notions of
a \textit{regular category} and
a \textit{regular rule label}.
Briefly, regular labels and categories are the user-defined ones.
More formally,
a regular category is none of
{\tt[}$C${\tt]},{\tt Integer}, {\tt Double}, {\tt Char}, {\tt String}
and {\tt Ident}, or the types defined by {\tt token} rules (Section~\ref{token}).
A regular rule label is none of
{\tt \_}, {\tt []}, {\tt (:)}, and {\tt (:[])}.
The type checking rules are now the following:
\bequ
A rule labelled by {\tt \_} must have a category skeleton of form $(C,C)$.
A rule labelled by {\tt []} must have a category skeleton of form $([C],)$.
A rule labelled by {\tt (:)} must have a category skeleton of form $([C],C[C])$.
A rule labelled by {\tt (:[])} must have a category skeleton of form $([C],C)$.
Only regular categories may have productions with regular rule labels.
Every regular category occurring in the grammar
must have at least one production with a regular rule label.
All rules with the same regular rule label must have the same
category skeleton.
\enqu
The second-last rule corresponds to the absence of empty data types in Haskell.
The last rule could
be strengthened so as to require that all regular rule labels be unique:
this is needed to guarantee error-free pretty-printing.
Violating this strengthened rule currently
generates only a warning, not a type error.
\section{Lexer Definitions}
\label{lexer}
\subsection{The {\tt token} rule}
\label{reg}\label{token}
The {\tt token} rule enables the LBNF programmer
to define new lexical types using
a simple regular expression notation.
For instance, the following rule defines the type of
identifiers beginning with upper-case letters.
\begin{verbatim}
token UIdent (upper (letter | digit | '_')*) ;
\end{verbatim}
The type {\tt UIdent} becomes usable
as an LBNF nonterminal and as a type in the abstract syntax.
Each token type is implemented by a {\tt newtype} in Haskell, as
a {\tt String} in Java, and as a {\tt typedef} to {\tt char*} in C/C++.
The regular expression syntax of LBNF is specified in the Appendix.
The abbreviations with strings in brackets need a word of explanation:
\begin{quote}
\verb6["abc7%"]6 denotes the union of the characters \verb6'a' 'b' 'c' '7' '%'6
\verb6{"abc7%"}6 denotes the sequence of the characters \verb6'a' 'b' 'c' '7' '%'6
\end{quote}
The atomic expressions {\tt upper}, {\tt lower}, {\tt letter}, and {\tt digit}
denote the character classes suggested by their names (letters are isolatin1).
The expression {\tt char} matches any unicode character, and
the ``epsilon'' expression {\tt eps} matches the empty string.
Thus {\tt eps} is equivalent to \verb6{""}6, whereas the empty language is
expressed by {\tt [""]}.
{\bf Note}. The empty language is not available for the Java lexer tool JLex.
\subsection{The {\tt position token} rule}
\label{reg}\label{postoken}
(As of \today, only available for Haskell).
Any {\tt token} rule can be modified by the word
{\tt position}, which has the effect that the
datatype defined will carry position information.
For instance,
\begin{verbatim}
position token PIdent (letter (letter|digit|'_'|'\'')*) ;
\end{verbatim}
creates in Haskell the datatype definition
\begin{verbatim}
newtype PIdent = PIdent ((Int,Int),String)
\end{verbatim}
where the pair of integers indicates the line and column of
the first character of the token. The pretty-printer omits
the position component.
\subsection{The {\tt comment} rule}
\textit{Comments} are segments of source code that include free
text and are not passed to the parser. The natural place
to deal with them is in the lexer. The {\tt comment} rule instructs the
lexer generator to treat certain pieces of text as comments.
The comment rule takes one or two string arguments. The first
string defines how a comment begins.
The second, optional string marks the end of a comment;
if it is not given then the comment is ended by a newline.
For instance, the Java comment convention is defined as follows:
\begin{verbatim}
comment "//" ;
comment "/*" "*/" ;
\end{verbatim}
\section{LBNF Pragmas}
\label{pragmas}
\subsection{Internal pragmas}
Sometimes we want to include in the abstract syntax
structures that are not part of the concrete syntax,
and hence not parsable.
They can be, for instance, syntax trees that are produced by a
type-annotating type checker.
Even though they are not parsable, we may want to
pretty-print them, for instance, in the type checker's
error messages.
To define such an internal constructor, we use a pragma
\begin{verbatim}
"internal" Rule ";"
\end{verbatim}
where {\tt Rule} is a normal LBNF rule. For instance,
\begin{verbatim}
internal EVarT. Exp ::= "(" Ident ":" Type ")";
\end{verbatim}
introduces a type-annotated variant of a variable expression.
\subsection{Entry point pragmas}
The BNF Converter generates, by default, a parser for every category in
the grammar. This is unnecessarily rich in most cases, and makes the parser
larger than needed. If the size of the parser becomes critical,
the \textit{entry points pragma} enables the user
to define which of the parsers are actually exported:
\begin{verbatim}
entrypoints (Ident ",")* Ident ;
\end{verbatim}
For instance, the following pragma defines {\tt Stm} and {\tt Exp} to be
the only entry points:
\begin{verbatim}
entrypoints Stm, Exp ;
\end{verbatim}
\section{LBNF macros}
\label{macros}
\subsection{Terminators and separators}
\label{terminator}
The \verb$terminator$ macro defines a pair of list rules by what
token terminates each element in the list. For instance,
\begin{verbatim}
terminator Stm ";" ;
\end{verbatim}
tells that each statement (\verb$Stm$) is terminated with a semicolon
(\verb$;$). It is a shorthand for the pair of rules
\begin{verbatim}
[]. [Stm] ::= ;
(:). [Stm] ::= Stm ";" [Stm] ;
\end{verbatim}
The qualifier \verb$nonempty$ in the macro makes one-element list
to be the base case. Thus
\begin{verbatim}
terminator nonempty Stm ";" ;
\end{verbatim}
is shorthand for
\begin{verbatim}
(:[]). [Stm] ::= Stm ";" ;
(:). [Stm] ::= Stm ";" [Stm] ;
\end{verbatim}
The terminator can be specified as empty \verb$""$. No token is
introduced then, but e.g.
\begin{verbatim}
terminator Stm "" ;
\end{verbatim}
is translated to
\begin{verbatim}
[]. [Stm] ::= ;
(:). [Stm] ::= Stm [Stm] ;
\end{verbatim}
The \verb$separator$ macro is similar to \verb$terminator$,
except that the separating token is not attached to the last
element. Thus
\begin{verbatim}
separator Stm ";" ;
\end{verbatim}
means
\begin{verbatim}
[]. [Stm] ::= ;
(:[]). [Stm] ::= Stm ;
(:). [Stm] ::= Stm ";" [Stm] ;
\end{verbatim}
whereas
\begin{verbatim}
separator nonempty Stm ";" ;
\end{verbatim}
means
\begin{verbatim}
(:[]). [Stm] ::= Stm ;
(:). [Stm] ::= Stm ";" [Stm] ;
\end{verbatim}
Notice that, if the empty token \verb$""$ is used, there
is no difference between \verb$terminator$ and \verb$separator$.
{\bf Problem}.
The grammar generated from
a \verb$separator$ without \verb$nonempty$
will actually
also accept a list terminating with a semicolon, whereas
the pretty-printer ``normalizes'' it away. This might be considered
a bug, but a set of rules
forbidding the terminating semicolon would be much more
complicated. The \verb$nonempty$ case is strict.
\subsection{Coercions}
\label{coercions}
The \verb$coercions$ macro is a shorthand for a group of rules
translating between precedence levels. For instance,
\begin{verbatim}
coercions Exp 3 ;
\end{verbatim}
is shorthand for
\begin{verbatim}
_. Exp ::= Exp1 ;
_. Exp1 ::= Exp2 ;
_. Exp2 ::= Exp3 ;
_. Exp3 ::= "(" Exp ")" ;
\end{verbatim}
Because of the total coverage of these coercions,
it does not matter if the integer indicating the highest level
(here \verb$3$) is bigger than the highest level actually occurring,
or if there are some other levels without productions in the grammar.
\subsection{Rules}
The \verb$rules$ macro is a shorthand for a set of rules from which
labels are generated automatically. For instance,
\begin{verbatim}
rules Type ::= Type "[" Integer "]" | "float" | "double" | Type "*" ;
\end{verbatim}
is shorthand for
\begin{verbatim}
Type_0. Type ::= Type "[" Integer "]" ;
Type_float. Type ::= "float" ;
Type_double. Type ::= "double" ;
Type_3. Type ::= Type "*" ;
\end{verbatim}
The labels are created automatically. A label starts with
the value category name. If the
production has just one item, which is moreover
possible as a part of an identifier, that item is
used as a suffix.
In other cases, an integer suffix is used. No global checks are
performed when generating these labels. Any label name clashes that
result from them are captured by BNFC type checking on the generated
rules.
Notice that, using the \verb$rules$ macro, it is possible to define an
LBNF grammar without giving any labels. To guarantee the uniqueness
of labels, productions of the each category must be grouped together.
\section{Layout syntax}
\label{layout}
Layout syntax is a means of using indentation to group
program elements. It is used in some languages, e.g.\ Haskell.
Those who do not know what layout syntax is or who do not like
it can skip this section.
The pragmas {\tt layout}, {\tt layout stop}, and {\tt layout toplevel}
define a {\em layout syntax} for a language.
Before these pragmas were added, layout syntax was not definable in
BNFC. The layout pragmas are only
available for the files generated for Haskell-related tools;
if Java, C, or C++ programmers want to handle layout, they can use
the Haskell layout resolver as a preprocessor to their
front end, before the lexer. In Haskell, the layout resolver
appears, automatically, in its most natural place, which is between the
lexer and the parser. The layout pragmas of BNFC are not
powerful enough to handle the full layout rule of Haskell 98,
but they suffice for the ``regular'' cases.
Here is an example, found in the the grammar of the logical framework Alfa.
\begin{verbatim}
layout "of", "let", "where", "sig", "struct" ;
\end{verbatim}
The first line says that {\tt "of", "let", "where", "sig", "struct"}
are {\em layout words}, i.e. start a
{\em layout list}. A layout list is a list of expressions
normally enclosed in curly brackets and separated by semicolons,
as shown by the Alfa example
\begin{verbatim}
ECase. Exp ::= "case" Exp "of" "{" [Branch] "}" ;
separator Branch ";" ;
\end{verbatim}
When the layout resolver finds the token {\tt of} in the code
(i.e. in the sequence of its lexical tokens), it
checks if the next token is an opening curly bracket. If it
is, nothing special is done until a layout word is encountered again.
The parser will expect the semicolons and the closing bracket
to appear as usual.
But, if the token $t$
following {\tt of} is not an opening curly bracket,
a bracket is inserted, and the start column of $t$
is remembered as the position at which the elements of the
layout list must begin. Semicolons are inserted at those
positions. When a token is eventually encountered left
of the position of $t$ (or an end-of-file), a closing bracket
is inserted at that point.
Nested layout blocks are allowed, which means that the layout
resolver maintains a stack of positions. Pushing a position
on the stack corresponds to inserting a left bracket, and
popping from the stack corresponds to inserting a right bracket.
Here is an example of an Alfa source file using layout:
\begin{verbatim}
c :: Nat = case x of
True -> b
False -> case y of
False -> b
Neither -> d
d = case x of True -> case y of False -> g
x -> b
y -> h
\end{verbatim}
Here is what it looks like after layout resolution:
\begin{verbatim}
c :: Nat = case x of {
True -> b
;False -> case y of {
False -> b
};Neither -> d
};d = case x of {True -> case y of {False -> g
;x -> b
};y -> h} ;
\end{verbatim}
There are two more layout-related pragmas. The {\tt layout stop}
pragma, as in
\begin{verbatim}
layout stop "in" ;
\end{verbatim}
tells the resolver that the layout list can be exited with
some stop words, like {\tt in}, which exits a
{\tt let} list. It is no error in the resolver to exit
some other kind of layout list with {\tt in}, but an error will show up
in the parser.
The {\tt layout toplevel} pragma tells that the whole source file
is a layout list, even though no layout word indicates this.
The position is the first column, and the resolver adds a semicolon
after every paragraph whose first token is at this position. No curly
brackets are added. The Alfa file above is an example of this, with
two such semicolons added.
To make layout resolution a stand-alone program, e.g.\ to serve as a
preprocessor, the programmer can modify the BNFC source file
{\tt ResolveLayoutAlfa.hs} as indicated in the file,
and either compile it or run it in the Hugs interpreter by
\begin{verbatim}
runhugs ResolveLayoutX.hs <X-source-file>
\end{verbatim}
We may add the generation of {\tt ResolveLayoutX.hs}
to a later version of BNFC.
{\bf Bug}. The generated layout resolver does not work correctly
if a layout word is the first token on a line.
\section{Profiles}
\label{profile}
This section explains a feature which is not intended to be used
in LBNF grammars written by hand, but in ones generated from
the grammar formalism GF (Grammatical Framework).
GF supports grammars of natural-languages
and also higher-order abstract syntax which is sometimes used for
formal languages to define their static semantics.
The reader not familiar with these matters can skip this section.
The relation between abstract and concrete syntax in LBNF is the
simplest possible one: the subtrees of an abstract syntax tree
are in one-to-one correspondence with the nonterminals in
the parsing grammar. The GF formalism generalizes this relation to one in which
permutations, omissions, and duplications can occur in the
transition from abstract and concrete syntax. The way back
then requires a rearrangement of the subtrees, which involves
unification in the case of omissions and duplications.
It is also possible to conceive some concrete-syntax constituents
as bound variables, as is the case in higher-order abstract syntax.
The recipe for doing this postprocessing can be compactly
expressed by a \textit{profile}, which has a list of positions
of each argument. For instance, the profiles in basic LBNF look
as follows:
\begin{verbatim}
While ([],[0])([],[1])([],[2]). Stm ::= "while" "(" Exp ")" Stm Stm ;
\end{verbatim}
That is, each abstract argument occurs exactly once in the concrete
expression, and in the same order. The syntax trees produced have the form
\begin{verbatim}
While Ext Stm Stm
\end{verbatim}
The first components in each list of pairs
are for variable bindings. An example is the
variable declaration rule
\begin{verbatim}
Decl ([],[0])([[1]],[2]). Stm ::= Typ Ident ";" Stm ;
\end{verbatim}
which creates the abstract syntax
\begin{verbatim}
Decl Typ (\Ident -> Stm)
\end{verbatim}
An (artificial) example of duplication would be
\begin{verbatim}
IsAlways ([],[0,1]). Sentence ::= "a" Noun "is" "always" "a" Noun ;
\end{verbatim}
which produces trees of the form
\begin{verbatim}
IsAlways Noun
\end{verbatim}
and would accept strings like \textit{a man is always a man},
\textit{a bike is always a bike}, but not \textit{a man is always a bike}.
\section{An optimization: left-recursive lists}
\label{leftrec}
The BNF representation of lists is right-recursive, following the
list conctructor in Haskell and most other languages.
Right-recursive lists, however, are an inefficient way of parsing
lists in an LALR parser, because they can blow up the stack size.
The smart programmer would implement a pair of rules
such as
\begin{verbatim}
[]. [Stm] ::= ;
(:). [Stm] ::= Stm ";" [Stm] ;
\end{verbatim}
not in the direct way,
but under a left-recursive transformation, as if we wrote,
\begin{verbatim}
[]. [Stm] ::= ;
(flip (:)). [Stm] ::= [Stm] Stm ";" ;
\end{verbatim}
Then the smart programmer would also be careful to {\tt reverse} the list
when it is used as an argument of another rule construction.
The BNF Converter automatically performs the
left-recursion transformation for
pairs of rules of the form
\begin{verbatim}
[]. [C] ::= ;
(:). [C] ::= C x [C] ;
\end{verbatim}
where {\tt C} is any category and {\tt x} is any sequence of
terminals (possibly empty). These rules can, of course, be generated from
the {\tt terminator} macro (Section~\ref{terminator}).
{\bf Notice}. The transformation is currently not performed if the
one-element list is the base case.
\newpage
\section*{Appendix: LBNF Specification}
This document was automatically generated by the {\em BNF-Converter}. It was generated together with the lexer, the parser, and the abstract syntax module, which guarantees that the document matches with the implementation of the language (provided no hand-hacking has taken place).
\section*{The lexical structure of BNF}
\subsection*{Identifiers}
Identifiers \nonterminal{Ident} are unquoted strings beginning with a letter,
followed by any combination of letters, digits, and the characters {\tt \_ '},
reserved words excluded.
\subsection*{Literals}
String literals \nonterminal{String}\ have the form
\terminal{"}$x$\terminal{"}, where $x$ is any sequence of any characters
except \terminal{"}\ unless preceded by \verb6\6.
Integer literals \nonterminal{Int}\ are nonempty sequences of digits.
Character literals \nonterminal{Char}\ have the form
\terminal{'}$c$\terminal{'}, where $c$ is any single character.
\subsection*{Reserved words and symbols}
The set of reserved words is the set of terminals appearing in the grammar. Those reserved words that consist of non-letter characters are called symbols, and they are treated in a different way from those that are similar to identifiers. The lexer follows rules familiar from languages like Haskell, C, and Java, including longest match and spacing conventions.
The reserved words used in BNF are the following: \\
\begin{tabular}{lll}
{\reserved{char}} &{\reserved{coercions}} &{\reserved{comment}} \\
{\reserved{digit}} &{\reserved{entrypoints}} &{\reserved{eps}} \\
{\reserved{internal}} &{\reserved{layout}} &{\reserved{letter}} \\
{\reserved{lower}} &{\reserved{nonempty}} &{\reserved{position}} \\
{\reserved{rules}} &{\reserved{separator}} &{\reserved{stop}} \\
{\reserved{terminator}} &{\reserved{token}} &{\reserved{toplevel}} \\
{\reserved{upper}} & & \\
\end{tabular}\\
The symbols used in BNF are the following: \\
\begin{tabular}{lll}
{\symb{;}} &{\symb{.}} &{\symb{::{$=$}}} \\
{\symb{[}} &{\symb{]}} &{\symb{\_}} \\
{\symb{(}} &{\symb{:}} &{\symb{)}} \\
{\symb{,}} &{\symb{{$|$}}} &{\symb{{$-$}}} \\
{\symb{*}} &{\symb{{$+$}}} &{\symb{?}} \\
{\symb{\{}} &{\symb{\}}} & \\
\end{tabular}\\
\subsection*{Comments}
Single-line comments begin with {\symb{{$-$}{$-$}}}. \\Multiple-line comments are enclosed with {\symb{\{{$-$}}} and {\symb{{$-$}\}}}.
\section*{The syntactic structure of BNF}
Non-terminals are enclosed between $\langle$ and $\rangle$.
The symbols {\arrow} (production), {\delimit} (union)
and {\emptyP} (empty rule) belong to the BNF notation.
All other symbols are terminals.\\
\begin{tabular}{lll}
{\nonterminal{Grammar}} & {\arrow} &{\nonterminal{ListDef}} \\
\end{tabular}\\
\begin{tabular}{lll}
{\nonterminal{ListDef}} & {\arrow} &{\emptyP} \\
& {\delimit} &{\nonterminal{Def}} {\terminal{;}} {\nonterminal{ListDef}} \\
\end{tabular}\\
\begin{tabular}{lll}
{\nonterminal{ListItem}} & {\arrow} &{\emptyP} \\
& {\delimit} &{\nonterminal{Item}} {\nonterminal{ListItem}} \\
\end{tabular}\\
\begin{tabular}{lll}
{\nonterminal{Def}} & {\arrow} &{\nonterminal{Label}} {\terminal{.}} {\nonterminal{Cat}} {\terminal{::{$=$}}} {\nonterminal{ListItem}} \\
& {\delimit} &{\terminal{comment}} {\nonterminal{String}} \\
& {\delimit} &{\terminal{comment}} {\nonterminal{String}} {\nonterminal{String}} \\
& {\delimit} &{\terminal{internal}} {\nonterminal{Label}} {\terminal{.}} {\nonterminal{Cat}} {\terminal{::{$=$}}} {\nonterminal{ListItem}} \\
& {\delimit} &{\terminal{token}} {\nonterminal{Ident}} {\nonterminal{Reg}} \\
& {\delimit} &{\terminal{position}} {\terminal{token}} {\nonterminal{Ident}} {\nonterminal{Reg}} \\
& {\delimit} &{\terminal{entrypoints}} {\nonterminal{ListIdent}} \\
& {\delimit} &{\terminal{separator}} {\nonterminal{MinimumSize}} {\nonterminal{Cat}} {\nonterminal{String}} \\
& {\delimit} &{\terminal{terminator}} {\nonterminal{MinimumSize}} {\nonterminal{Cat}} {\nonterminal{String}} \\
& {\delimit} &{\terminal{coercions}} {\nonterminal{Ident}} {\nonterminal{Integer}} \\
& {\delimit} &{\terminal{rules}} {\nonterminal{Ident}} {\terminal{::{$=$}}} {\nonterminal{ListRHS}} \\
& {\delimit} &{\terminal{layout}} {\nonterminal{ListString}} \\
& {\delimit} &{\terminal{layout}} {\terminal{stop}} {\nonterminal{ListString}} \\
& {\delimit} &{\terminal{layout}} {\terminal{toplevel}} \\
\end{tabular}\\
\begin{tabular}{lll}
{\nonterminal{Item}} & {\arrow} &{\nonterminal{String}} \\
& {\delimit} &{\nonterminal{Cat}} \\
\end{tabular}\\
\begin{tabular}{lll}
{\nonterminal{Cat}} & {\arrow} &{\terminal{[}} {\nonterminal{Cat}} {\terminal{]}} \\
& {\delimit} &{\nonterminal{Ident}} \\
\end{tabular}\\
\begin{tabular}{lll}
{\nonterminal{Label}} & {\arrow} &{\nonterminal{LabelId}} \\
& {\delimit} &{\nonterminal{LabelId}} {\nonterminal{ListProfItem}} \\
& {\delimit} &{\nonterminal{LabelId}} {\nonterminal{LabelId}} {\nonterminal{ListProfItem}} \\
\end{tabular}\\
\begin{tabular}{lll}