Awk Script Course
Awk Script Course
selection, validation:
"Print all lines longer than 80 characters"
# Select every record whose full text is longer than 80 characters.
length($0) > 80 { print }
transforming, rearranging:
"Replace the 2nd field by its logarithm"
# Overwrite field 2 with its natural logarithm, then emit the rebuilt record.
{
    $2 = log($2)
    print $0
}
report generation:
"Add up the numbers in the first field,
then print the sum and average"
# Accumulate field 1 of every record; at end of input print the total and
# the mean. With no input records the mean is reported as 0 instead of
# aborting on a division by zero.
    { sum += $1 }
END { print sum + 0, (NR > 0 ? sum / NR : 0) }
pattern { action }
pattern { action }
Operation:
for each file
for each input line
for each pattern
if pattern matches input line
do the action
Usage:
awk 'program' [ file1 file2 ... ]
awk -f progfile [ file1 file2 ... ]
1
AWK features:
# wc command: count lines, words and characters of the input.
# Each record contributes its length plus 1 for the newline that awk strips;
# NF is the number of whitespace-separated words on the record.
    { nc += length($0) + 1; nw += NF }
END { print NR, "lines", nw, "words", nc, "characters" }
2
Awk text formatter
#!/bin/sh
# f - format text into lines of at most 60 characters
#   usage: f [file ...]
# Words are packed greedily onto output lines; an empty input line ends the
# current paragraph and is reproduced as an empty output line.
awk '
/./  { for (i = 1; i <= NF; i++)
           addword($i) }
/^$/ { printline(); print "" }
END  { printline() }

# addword - append word w to the pending line, flushing the line first
# when w (plus its separating space) would push it past 60 characters
function addword(w) {
    # length(space) is 0 at the start of a line and 1 afterwards, so the
    # separator is counted and no output line exceeds 60 characters
    if (length(line) + length(space) + length(w) > 60)
        printline()
    line = line space w
    space = " "
}

# printline - emit the pending line if it is non-empty, then reset state
function printline() {
    if (length(line) > 0)
        print line
    line = space = ""
}
' "$@"
Arrays
Reverse a file:
Making an array:
n = split(string, array, separator)
splits "string" into array[1] ... array[n]
returns number of elements
optional "separator" can be any regular expression
3
Associative Arrays
Input:
pizza 200
beer 100
pizza 500
beer 50
Output:
pizza 700
beer 150
program:
# Total the second field per distinct first field, then list the totals
# in descending numeric order.
{ amount[$1] += $2 }
END {
    # "-k2nr" is the POSIX spelling of the obsolete "+1 -nr" key syntax:
    # sort on the second field, numerically, largest first
    sorter = "sort -k2nr"
    for (name in amount)
        print name, amount[name] | sorter
    close(sorter)    # flush the pipe and wait for sort to finish
}
zero const 0
sum const
4
Assembler and simulator/interpreter
# asm - assembler and interpreter for simple computer
#   usage: awk -f asm program-file data-files...
#
# The first command-line argument is the assembly source; any remaining
# arguments are data files read by the "get" instruction. Each memory word
# is packed as 1000 * opcode + address. Pass 1 records label addresses and
# writes "op<TAB>addr" lines to a temporary file; pass 2 resolves symbolic
# addresses and fills mem[]; the interpreter then runs from address 0 until
# a halt (or an unrecognized opcode) sets pc negative.

BEGIN {
    srcfile = ARGV[1]
    ARGV[1] = ""    # remaining files are data
    tempfile = "asm.temp"
    n = split("const get put ld st add sub jpos jz j halt", x)
    for (i = 1; i <= n; i++)    # create table of op codes
        op[x[i]] = i - 1

# ASSEMBLER PASS 1
    FS = "[ \t]+"
    # parenthesized so the comparison cannot be read as part of the file name
    while ((getline < srcfile) > 0) {
        sub(/#.*/, "")              # strip comments
        symtab[$1] = nextmem        # remember label location
        if ($2 != "") {             # save op, addr if present
            print $2 "\t" $3 > tempfile
            nextmem++
        }
    }
    close(srcfile)
    close(tempfile)                 # flush before re-reading it in pass 2

# ASSEMBLER PASS 2
    nextmem = 0
    while ((getline < tempfile) > 0) {
        if ($2 !~ /^[0-9]*$/)       # if symbolic addr,
            $2 = symtab[$2]         # replace by numeric value
        mem[nextmem++] = 1000 * op[$1] + $2    # pack into word
    }
    close(tempfile)

# INTERPRETER
    for (pc = 0; pc >= 0; ) {
        addr = mem[pc] % 1000
        code = int(mem[pc++] / 1000)
        if      (code == op["get"])  { getline acc }    # next line of data input
        else if (code == op["put"])  { print "\t" acc }
        else if (code == op["st"])   { mem[addr] = acc }
        else if (code == op["ld"])   { acc = mem[addr] }
        else if (code == op["add"])  { acc += mem[addr] }
        else if (code == op["sub"])  { acc -= mem[addr] }
        else if (code == op["jpos"]) { if (acc > 0) pc = addr }
        else if (code == op["jz"])   { if (acc == 0) pc = addr }
        else if (code == op["j"])    { pc = addr }
        else if (code == op["halt"]) { pc = -1 }
        else                         { pc = -1 }    # const or unknown code: stop
    }
}
Anatomy of a compiler
input
lexical
analysis
tokens
syntax symbol
analysis table
intermediate
form
code
generation
object
file linking
5
Anatomy of an interpreter
input
lexical
analysis
tokens
syntax symbol
analysis table
intermediate
form
input execution
output
data
# For each non-empty record, parse the fields as one arithmetic expression
# starting at field 1. f is the global index of the next unread field; if
# parsing stopped before the last field, the leftover token is an error.
NF > 0 {
    f = 1
    e = expr()
    if (f > NF)
        printf("\t%.8g\n", e)
    else
        printf("error at %s\n", $f)
}
# expr - evaluate  term | term [+-] term ...  beginning at field f.
# Consumes fields through the global index f and returns the value;
# total and sign are locals.
function expr(  total, sign) {
    total = term()
    while ($f == "+" || $f == "-") {
        sign = $f
        f++
        if (sign == "+")
            total = total + term()
        else
            total = total - term()
    }
    return total
}
# term - evaluate  factor | factor [*/] factor ...  beginning at field f.
# Consumes fields through the global index f and returns the value;
# product and oper are locals.
function term(  product, oper) {
    product = factor()
    while ($f == "*" || $f == "/") {
        oper = $f
        f++
        if (oper == "*")
            product = product * factor()
        else
            product = product / factor()
    }
    return product
}
# factor - evaluate  number | ( expr )  beginning at field f.
# Advances the global field index f past what it consumes and returns the
# value. On a malformed factor it prints a diagnostic naming the offending
# field and returns what it has (0 when nothing was parsed); e is a local.
function factor(  e) {
    if ($f ~ /^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)$/) {
        return $(f++)
    } else if ($f == "(") {
        f++
        e = expr()
        if ($f == ")")
            f++
        else    # name the offending token itself and leave it unconsumed
            printf("error: missing ) at %s\n", $f)
        return e
    } else {
        printf("error: expected number or ( at %s\n", $f)
        return 0
    }
}
6
YACC and LEX
YACC overview
7
YACC-based calculator
%{
/* calculator */
#include <stdio.h>	/* printf, fprintf, getchar, ungetc, scanf, EOF */
#include <ctype.h>	/* isdigit */
#define YYSTYPE double /* data type of yacc stack */
int lineno = 1;		/* current input line, used in error messages */
int yylex(void);	/* lexical analyzer, defined after the grammar */
int yyerror(char *s);	/* syntax error reporter, defined after the grammar */
%}
%token NUMBER
%left '+' '-' /* left associative, same precedence */
%left '*' '/' /* left assoc., higher precedence */
%%
/* A list is zero or more lines; each non-blank line is one expression whose
   value is printed. The empty alternative and list '\n' let empty input and
   blank lines through without a syntax error. */
list:	  /* nothing */
	| list '\n'
	| list expr '\n'	{ printf("\t%.8g\n", $2); }
	;
/* Operator precedence and associativity come from the %left declarations. */
expr:	  NUMBER		{ $$ = $1; }
	| expr '+' expr		{ $$ = $1 + $3; }
	| expr '-' expr		{ $$ = $1 - $3; }
	| expr '*' expr		{ $$ = $1 * $3; }
	| expr '/' expr		{ $$ = $1 / $3; }
	| '(' expr ')'		{ $$ = $2; }
	;
%%
/* end of grammar */
/* main - run the parser over standard input; the exit status is the value
   returned by yyparse() (0 when the whole input was accepted). */
int main(void) { /* calculator */
	return yyparse();
}
/* yylex - calculator lexical analysis.
   Skips blanks and tabs, then returns 0 at end of input, NUMBER (with the
   value stored in yylval) for a numeric literal, or the character itself
   for anything else. Newlines are returned as tokens and counted in lineno. */
int yylex(void) {
	int c;

	while ((c = getchar()) == ' ' || c == '\t')
		;
	if (c == EOF)
		return 0;
	if (c == '.' || isdigit(c)) { /* number */
		ungetc(c, stdin);
		if (scanf("%lf", &yylval) != 1) /* lexical value */
			return c; /* e.g. a lone '.': let the parser reject it */
		return NUMBER; /* lexical type */
	}
	if (c == '\n')
		lineno++;
	return c;
}
/* yyerror - called for yacc syntax error; prints the message s together
   with the current input line number on stderr. Always returns 0. */
int yyerror(char *s) {
	fprintf(stderr, "%s near line %d\n", s, lineno);
	return 0;
}
8
Grammar specified in YACC
statement:
IF ( expression ) statement
create node(IF, expr, stmt, 0)
IF ( expression ) statement ELSE statement
create node(IF, expr, stmt1, stmt2)
WHILE (expression ) statement
create node(WHILE, expr, stmt)
variable = expression
create node(ASSIGN, var, expr)
expression:
expression + expression
expression - expression
...
YACC creates a parser from this
when the parser runs, it creates a parse tree
9
Excerpts from a LEX analyzer
	/* Increment/decrement operators: the token code is stored in yylval.i
	   and handed to RET (defined outside this excerpt). */
"++" { yylval.i = INCR; RET(INCR); }
"--" { yylval.i = DECR; RET(DECR); }
	/* Numeric constant: digits with an optional '.' and fraction, or a '.'
	   followed by digits, each with an optional signed exponent. */
([0-9]+(\.?)[0-9]*|\.[0-9]+)([eE](\+|-)?[0-9]+)? {
/* enter the literal via setsymtab with its text, its string form and its
   atof() value, flagged CON|NUM; the result is kept in yylval.cp */
yylval.cp = setsymtab(yytext, tostring(yytext),
atof(yytext), CON|NUM, symtab);
RET(NUMBER); }
	/* Keywords: each one is passed to RET as its own token. */
while { RET(WHILE); }
for { RET(FOR); }
do { RET(DO); }
if { RET(IF); }
else { RET(ELSE); }
return {
/* "return" seen while infunc is false is reported through ERROR ... SYNTAX
   (not shown here); NOTE(review): RET(RETURN) appears to run afterwards
   unless that macro transfers control - confirm */
if (!infunc)
ERROR "return not in function" SYNTAX;
RET(RETURN);
}
. { RET(yylval.i = yytext[0]);
/* everything else */
}
Whole process
grammar lexical
rules
Lex (or
YACC
other) other
C code
y.tab.c lex.yy.c
parser analyzer
C compiler
a.out
10
AWK implementation
Testing
11
Using awk for testing RE code
^a.$ ~ ax
aa
!~ xa
aaa
axy
illustrates
little languages
programs that write programs
mechanization
Lessons
12