diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index bfddfe35..5c069676 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,6 +31,7 @@ jobs: # Testing with gcc clang in case they have different warnings for comp in gcc clang-14; do make pnut-sh CC=$comp + make pnut-awk CC=$comp make pnut-exe CC=$comp TARGET=Linux.i386 EXE_ONE_PASS=0 make pnut-exe CC=$comp TARGET=Linux.i386 EXE_ONE_PASS=1 make pnut-exe CC=$comp TARGET=Linux.i386 SAFE=1 NICE_UX=1 @@ -70,6 +71,42 @@ jobs: check_dir "*.c" tests # and the tests directory recursively check_dir "*.c" examples 1 # and examples + compile-in-safe-mode: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y coreutils time + + - name: Compile pnut-sh, pnut-exe and tests in safe mode + run: | + set -e + export SAFE=1 + + make pnut-sh.sh + make pnut-sh.sh MINIMAL=1 + + make pnut-awk.awk + make pnut-awk.awk MINIMAL=1 + + for target in Linux.x86_64 Linux.i386; do + for minimal in 0 1; do + for one_pass in 0 1; do + make pnut-exe TARGET=$target MINIMAL=$minimal EXE_ONE_PASS=$one_pass + done + done + done + + ./run-tests.sh sh --safe --compile-only + ./run-tests.sh x86_64_linux --safe --compile-only + ./run-tests.sh x86_64_linux --safe --one-pass-generator --compile-only + ./run-tests.sh i386_linux --safe --compile-only + ./run-tests.sh i386_linux --safe --one-pass-generator --compile-only + tests-exe: # Run tests for pnut-exe on all supported platforms and architectures strategy: matrix: @@ -183,7 +220,34 @@ jobs: make bootstrap-pnut-sh MINIMAL=1 make bootstrap-pnut-sh MINIMAL=0 - bootstrap-pnut-exe: + bootstrap-pnut-awk: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Install AWK and dependencies + run: | + set -e + sudo apt-get update + sudo apt-get install -y coreutils time gawk mawk + + - name: Bootstrap 
pnut-awk with AWK + run: | + set -e + for awk_impl in gawk mawk; do + export BOOTSTRAP_AWK="$awk_impl --posix" + # For mawk, only test with MINIMAL=1 because it fails to parse the + # larger file (hitting some internal limit of mawk?). + if [ $awk_impl = "mawk" ]; then + make bootstrap-pnut-awk MINIMAL=1 + else + make bootstrap-pnut-awk MINIMAL=1 + make bootstrap-pnut-awk MINIMAL=0 + fi + done + + bootstrap-pnut-exe-on-shell: strategy: matrix: shell: ["bash", "dash", "ksh", "mksh", "yash", "zsh", "osh"] @@ -239,9 +303,9 @@ jobs: # One pass generator is not supported on macOS return fi - make bootstrap-pnut-exe-no-shell EXE_ONE_PASS=$one_pass MINIMAL=$minimal - make bootstrap-pnut-exe-script EXE_ONE_PASS=$one_pass MINIMAL=$minimal make bootstrap-pnut-exe EXE_ONE_PASS=$one_pass MINIMAL=$minimal + make bootstrap-pnut-exe-from-pnut-shell EXE_ONE_PASS=$one_pass MINIMAL=$minimal + make bootstrap-pnut-exe-from-shell EXE_ONE_PASS=$one_pass MINIMAL=$minimal } # For fast shells (ksh, dash, bash), test both the one-pass and minimal options @@ -267,7 +331,7 @@ jobs: ;; esac - bootstrap-pnut-sh-with-pnut-exe: + bootstrap-pnut-exe-on-awk: strategy: matrix: target: ["Linux.i386", "Linux.x86_64", "Darwin.x86_64"] @@ -283,54 +347,82 @@ jobs: - name: Checkout code uses: actions/checkout@v2 - - name: Install dependencies + - name: Install AWK and dependencies run: | + set -e if [ ${{ matrix.host }} = "macos-latest" ]; then - brew install coreutils + brew install coreutils gawk mawk else sudo apt-get update - sudo apt-get install -y coreutils time + sudo apt-get install -y coreutils time build-essential gawk mawk fi - - name: Bootstrap pnut-sh with pnut-exe on ${{ matrix.target }} backend + - name: Bootstrap pnut-exe with ${{ matrix.target }} backend on AWK run: | set -e + export BOOTSTRAP_AWK=awk export TARGET=${{ matrix.target }} - make bootstrap-pnut-sh-with-pnut-exe MINIMAL=0 - make bootstrap-pnut-sh-with-pnut-exe MINIMAL=1 - compile-in-safe-mode: - runs-on: ubuntu-latest +
do_bootstrap() { + one_pass=$1 + minimal=$2 + if [ ${{ matrix.host }} = "macos-latest" ] && [ $one_pass -eq 1 ]; then + # One pass generator is not supported on macOS + return + fi + make bootstrap-pnut-exe EXE_ONE_PASS=$one_pass MINIMAL=$minimal + make bootstrap-pnut-exe-from-pnut-awk EXE_ONE_PASS=$one_pass MINIMAL=$minimal + make bootstrap-pnut-exe-from-awk EXE_ONE_PASS=$one_pass MINIMAL=$minimal + } + + for awk_impl in gawk mawk; do + export BOOTSTRAP_AWK="$awk_impl --posix" + for minimal in 0 1; do + if [ $awk_impl = "mawk" ] && [ $minimal -eq 0 ]; then + # For mawk, only test with MINIMAL=1 because it fails to parse + # the larger file (hitting some internal limit of mawk?). + continue + fi + for one_pass in 0 1; do + do_bootstrap $one_pass $minimal + done + done + done + + bootstrap-pnut-sh-awk-with-pnut-exe: + strategy: + matrix: + target: ["Linux.i386", "Linux.x86_64", "Darwin.x86_64"] + include: + - target: Linux.i386 + host: ubuntu-latest + - target: Linux.x86_64 + host: ubuntu-latest + - target: Darwin.x86_64 + host: macos-latest + runs-on: ${{ matrix.host }} steps: - name: Checkout code uses: actions/checkout@v2 - name: Install dependencies run: | - sudo apt-get update - sudo apt-get install -y coreutils time + if [ ${{ matrix.host }} = "macos-latest" ]; then + brew install coreutils + else + sudo apt-get update + sudo apt-get install -y coreutils time + fi - - name: Compile pnut-sh, pnut-exe and tests in safe mode + - name: Bootstrap pnut-sh with pnut-exe on ${{ matrix.target }} backend run: | set -e - export SAFE=1 - - make pnut-sh.sh - make pnut-sh.sh MINIMAL=1 - - for target in Linux.x86_64 Linux.i386; do - for minimal in 0 1; do - for one_pass in 0 1; do - make pnut-exe TARGET=$target MINIMAL=$minimal EXE_ONE_PASS=$one_pass - done - done - done + export TARGET=${{ matrix.target }} + make bootstrap-pnut-sh-with-pnut-exe MINIMAL=0 + make bootstrap-pnut-sh-with-pnut-exe MINIMAL=1 - ./run-tests.sh sh --safe --compile-only - ./run-tests.sh x86_64_linux
--safe --compile-only - ./run-tests.sh x86_64_linux --safe --one-pass-generator --compile-only - ./run-tests.sh i386_linux --safe --compile-only - ./run-tests.sh i386_linux --safe --one-pass-generator --compile-only + make bootstrap-pnut-awk-with-pnut-exe MINIMAL=0 + make bootstrap-pnut-awk-with-pnut-exe MINIMAL=1 bootstrap-bash-2_05a: runs-on: ubuntu-latest @@ -383,19 +475,21 @@ jobs: # Keeping the fast option on bash 2.05a because its very slow otherwise PNUT_OPTIONS='$COMPATIBILITY_OPTIONS' ./run-tests.sh sh --shell bash --fast + # Bootstrap pnut-exe with pnut-exe with bash 2.05a (fastest step) + make bootstrap-pnut-exe MINIMAL=1 + make bootstrap-pnut-exe MINIMAL=0 + # Bootstrap pnut-sh.sh with bash 2.05a make bootstrap-pnut-sh MINIMAL=1 make bootstrap-pnut-sh # Bootstrap pnut-exe.sh with bash 2.05a - make bootstrap-pnut-exe-script MINIMAL=1 - make bootstrap-pnut-exe-script + make bootstrap-pnut-exe-from-pnut-shell MINIMAL=1 + make bootstrap-pnut-exe-from-pnut-shell MINIMAL=0 # Bootstrap pnut-exe with bash 2.05a - make bootstrap-pnut-exe MINIMAL=1 - make bootstrap-pnut-exe MINIMAL=0 - make bootstrap-pnut-exe-no-shell MINIMAL=1 - make bootstrap-pnut-exe-no-shell MINIMAL=0 + make bootstrap-pnut-exe-from-shell MINIMAL=1 + make bootstrap-pnut-exe-from-shell MINIMAL=0 EOF pnut-variants-run: @@ -494,13 +588,17 @@ jobs: runs-on: ubuntu-latest needs: [ build-without-warnings , catch-bad-whitespace + , compile-in-safe-mode , tests-exe , tests-shell , bootstrap-pnut-sh - , bootstrap-pnut-exe - , bootstrap-pnut-sh-with-pnut-exe + , bootstrap-pnut-awk + , bootstrap-pnut-exe-on-shell + , bootstrap-pnut-exe-on-awk + , bootstrap-pnut-sh-awk-with-pnut-exe , bootstrap-bash-2_05a , pnut-variants-run + , compile-with-M2-Planet ] steps: diff --git a/.gitignore b/.gitignore index 348d7e7b..f898cc7a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,9 @@ .DS_Store */.DS_Store # Tests artifacts -tests/*.sh -tests/**/*.exe tests/**/*.sh +tests/**/*.awk +tests/**/*.exe tests/**/*.err 
tests/**/*.output tests/**/*-gcc diff --git a/Makefile b/Makefile index f4d5ab5c..bb0a7af8 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,17 @@ .PHONY: \ - pnut-sh pnut-sh.sh pnut-sh-bootstrapped.sh pnut-exe pnut-exe.sh pnut-exe-bootstrapped \ - install uninstall clean \ - test-sh test-i386-linux test-x86_64-linux test-x86_64-mac \ + pnut-sh pnut-sh.sh pnut-sh-bootstrapped.sh \ + pnut-awk pnut-awk.awk pnut-awk-bootstrapped.awk \ + pnut-exe pnut-exe.sh pnut-exe.awk pnut-exe-bootstrapped \ + install install-pnut-awk install-pnut-exe \ + uninstall clean \ + test-sh test-awk test-i386-linux test-x86_64-linux test-x86_64-mac \ pnut-artifact-x86 pnut-artifact-arm \ - bootstrap-pnut-sh bootstrap-pnut-exe-script bootstrap-pnut-exe \ - bootstrap-pnut-exe-no-shell bootstrap-pnut-sh-with-pnut-exe + bootstrap-pnut-sh bootstrap-pnut-exe-from-pnut-shell bootstrap-pnut-exe-from-shell \ + bootstrap-pnut-awk bootstrap-pnut-exe-from-pnut-awk bootstrap-pnut-exe-from-awk \ + bootstrap-pnut-exe bootstrap-pnut-sh-with-pnut-exe BUILD_DIR = build +export LC_ALL=C # PNUT_BUILD_OPT can be used to pass additional compilation flags to pnut ifeq ($(CFLAGS),) @@ -20,6 +25,7 @@ endif # Bootstrap targets with integrated options BOOTSTRAP_SHELL ?= /bin/sh +BOOTSTRAP_AWK ?= awk # Targets for pnut-exe: auto, Linux.i386, Linux.x86_64, Darwin.x86_64, Darwin.arm64 TARGET ?= auto @@ -55,7 +61,7 @@ endif # Bootstrap script options that can be passed via make variables # Examples: # make pnut-sh.sh MINIMAL=1 # Build minimal pnut-sh.sh -# make pnut-sh.sh MINIMAL=1 SH_ANNOTATE=1 # Build minimal annotated pnut-sh.sh +# make pnut-sh.sh MINIMAL=1 ANNOTATE_C_CODE=1 # Build minimal annotated pnut-sh.sh # make pnut-exe TARGET=Linux.x86_64 EXE_ONE_PASS=1 # Build exe for Linux x86_64 with one-pass generator # Include only features used to bootstrap pnut @@ -75,7 +81,7 @@ STATS ?= 0 # Generate faster shell code that's less readable SH_FAST ?= 0 # Annotate generated shell code with comments -SH_ANNOTATE ?= 0 
+ANNOTATE_C_CODE ?= 0 # Use compact runtime library in shell scripts SH_COMPACT_RT ?= 0 # Shell scripts count memory usage (for development/debugging) @@ -112,8 +118,8 @@ endif ifeq ($(SH_FAST),1) BOOTSTRAP_FLAGS += -DSH_SAVE_VARS_WITH_SET endif -ifeq ($(SH_ANNOTATE),1) - BOOTSTRAP_FLAGS += -DSH_INCLUDE_C_CODE +ifeq ($(ANNOTATE_C_CODE),1) + BOOTSTRAP_FLAGS += -DANNOTATE_WITH_C_CODE endif ifeq ($(SH_COMPACT_RT),1) BOOTSTRAP_FLAGS += -DRT_COMPACT @@ -130,6 +136,7 @@ ifeq ($(EXE_ONE_PASS),1) endif BUILD_OPT_SH = -Dtarget_sh $(PNUT_BUILD_OPT) $(BOOTSTRAP_FLAGS) +BUILD_OPT_AWK = -Dtarget_awk $(PNUT_BUILD_OPT) $(BOOTSTRAP_FLAGS) BUILD_OPT_EXE += $(BOOTSTRAP_FLAGS) build: @@ -147,6 +154,18 @@ pnut-sh-bootstrapped.sh: pnut-sh.sh @chmod +x $(BUILD_DIR)/pnut-sh-bootstrapped.sh diff $(BUILD_DIR)/pnut-sh.sh $(BUILD_DIR)/pnut-sh-bootstrapped.sh +pnut-awk: build pnut.c awk.c + $(CC) $(CFLAGS) $(BUILD_OPT_AWK) pnut.c -o $(BUILD_DIR)/pnut-awk + +pnut-awk.awk: pnut-awk + ./$(BUILD_DIR)/pnut-awk $(BUILD_OPT_AWK) pnut.c > $(BUILD_DIR)/pnut-awk.awk + @chmod +x $(BUILD_DIR)/pnut-awk.awk + +pnut-awk-bootstrapped.awk: pnut-awk.awk + $(BOOTSTRAP_AWK) -f $(BUILD_DIR)/pnut-awk.awk -- $(BUILD_OPT_AWK) pnut.c > $(BUILD_DIR)/pnut-awk-bootstrapped.awk + @chmod +x $(BUILD_DIR)/pnut-awk-bootstrapped.awk + diff $(BUILD_DIR)/pnut-awk.awk $(BUILD_DIR)/pnut-awk-bootstrapped.awk + pnut-exe: build pnut.c x86.c exe.c elf.c mach-o.c $(CC) $(CFLAGS) $(BUILD_OPT_EXE) pnut.c -o $(BUILD_DIR)/pnut-exe @@ -154,6 +173,10 @@ pnut-exe.sh: pnut-sh pnut.c x86.c exe.c elf.c mach-o.c ./$(BUILD_DIR)/pnut-sh $(BUILD_OPT_EXE) pnut.c > $(BUILD_DIR)/pnut-exe.sh @chmod +x $(BUILD_DIR)/pnut-exe.sh +pnut-exe.awk: pnut-awk pnut.c x86.c exe.c elf.c mach-o.c + ./$(BUILD_DIR)/pnut-awk $(BUILD_OPT_EXE) pnut.c > $(BUILD_DIR)/pnut-exe.awk + @chmod +x $(BUILD_DIR)/pnut-exe.awk + pnut-exe-bootstrapped: pnut-exe $(BUILD_DIR)/pnut-exe $(BUILD_OPT_EXE) pnut.c -o $(BUILD_DIR)/pnut-exe-bootstrapped @chmod +x 
$(BUILD_DIR)/pnut-exe-bootstrapped @@ -168,15 +191,21 @@ install: pnut-sh pnut-sh.sh cp $(BUILD_DIR)/pnut-sh $(DESTDIR)$(PREFIX)/bin/pnut cp $(BUILD_DIR)/pnut-sh.sh $(DESTDIR)$(PREFIX)/bin/pnut-sh.sh -install-pnut-exe: pnut-exe pnut-exe.sh +install-pnut-awk: pnut-awk pnut-awk.awk + cp $(BUILD_DIR)/pnut-awk $(DESTDIR)$(PREFIX)/bin/pnut-awk + cp $(BUILD_DIR)/pnut-awk.awk $(DESTDIR)$(PREFIX)/bin/pnut-awk.awk + +install-pnut-exe: pnut-exe pnut-exe.sh pnut-exe.awk cp $(BUILD_DIR)/pnut-exe $(DESTDIR)$(PREFIX)/bin/pnut-exe cp $(BUILD_DIR)/pnut-exe.sh $(DESTDIR)$(PREFIX)/bin/pnut-exe.sh + cp $(BUILD_DIR)/pnut-exe.awk $(DESTDIR)$(PREFIX)/bin/pnut-exe.awk uninstall: $(RM) $(DESTDIR)$(PREFIX)/bin/pnut $(RM) $(DESTDIR)$(PREFIX)/bin/pnut-sh.sh $(RM) $(DESTDIR)$(PREFIX)/bin/pnut-exe $(RM) $(DESTDIR)$(PREFIX)/bin/pnut-exe.sh + $(RM) $(DESTDIR)$(PREFIX)/bin/pnut-exe.awk clean: $(RM) -r $(BUILD_DIR) @@ -189,6 +218,9 @@ clean: test-sh: ./run-tests.sh "sh" +test-awk: + ./run-tests.sh "awk" + test-i386-linux: ./run-tests.sh "i386_linux" @@ -210,11 +242,16 @@ process to allow each part to be tested individually. The **bootstrap test** is used to verify that the step output is in a good enough state to recompile and reproduce itself bit-for-bit. 
-The bootstrap steps are: +The shell bootstrap steps are: 1) Bootstrap pnut-sh.sh from pnut-sh.sh: bootstrap-pnut-sh -2) Bootstrap pnut-exe.sh from pnut-sh.sh: bootstrap-pnut-exe-script -3) Bootstrap pnut-exe from pnut-exe.sh: bootstrap-pnut-exe -4) Bootstrap pnut-exe from pnut-exe: bootstrap-pnut-exe-no-shell +2) Bootstrap pnut-exe.sh from pnut-sh.sh: bootstrap-pnut-exe-from-pnut-shell +3) Bootstrap pnut-exe from pnut-exe.sh: bootstrap-pnut-exe-from-shell +4) Bootstrap pnut-exe from pnut-exe: bootstrap-pnut-exe + +The same can be done for AWK with the following steps: +1) Bootstrap pnut-awk.awk from pnut-awk.awk: bootstrap-pnut-awk +2) Bootstrap pnut-exe.awk from pnut-awk.awk: bootstrap-pnut-exe-from-pnut-awk +3) Bootstrap pnut-exe from pnut-exe.awk: bootstrap-pnut-exe-from-awk In principle, these steps depend on the output of the previous step. However, to speed up testing, the bootstrap compiler of each step is produced using the @@ -240,8 +277,18 @@ bootstrap-pnut-sh: pnut-sh.sh fi @echo "Success!" +# Bootstrap pnut-awk with pnut-awk.awk (obtained using $(CC)). +bootstrap-pnut-awk: pnut-awk.awk + @echo "Bootstrapping pnut-awk.awk from pnut-awk.awk..." + $(TIMEC) $(BOOTSTRAP_AWK) -f $(BUILD_DIR)/pnut-awk.awk -- $(BUILD_OPT_AWK) pnut.c > $(BUILD_DIR)/pnut-awk-bootstrapped.awk + @if ! diff $(BUILD_DIR)/pnut-awk.awk $(BUILD_DIR)/pnut-awk-bootstrapped.awk >/dev/null 2>&1; then \ + echo "FAILURE: Bootstrap scripts differ"; \ + exit 1; \ + fi + @echo "Success!" + # Bootstrap pnut-exe.sh with pnut-sh.sh (obtained using $(CC)). -bootstrap-pnut-exe-script: pnut-sh.sh pnut-exe.sh +bootstrap-pnut-exe-from-pnut-shell: pnut-sh.sh pnut-exe.sh @echo "Bootstrapping pnut-exe.sh from pnut-sh.sh..." $(TIMEC) $(BOOTSTRAP_SHELL) $(BUILD_DIR)/pnut-sh.sh $(BUILD_OPT_EXE) pnut.c > $(BUILD_DIR)/pnut-exe-bootstrapped.sh @if !
diff $(BUILD_DIR)/pnut-exe.sh $(BUILD_DIR)/pnut-exe-bootstrapped.sh >/dev/null 2>&1; then \ @@ -250,8 +297,18 @@ bootstrap-pnut-exe-script: pnut-sh.sh pnut-exe.sh fi @echo "Success!" -# Bootstrap pnut-exe from pnut-exe (by $(CC)). -bootstrap-pnut-exe: pnut-exe.sh pnut-exe-bootstrapped +# Bootstrap pnut-exe.awk with pnut-awk.awk (obtained using $(CC)). +bootstrap-pnut-exe-from-pnut-awk: pnut-awk.awk pnut-exe.awk + @echo "Bootstrapping pnut-exe.awk from pnut-awk.awk..." + $(TIMEC) $(BOOTSTRAP_AWK) -f $(BUILD_DIR)/pnut-awk.awk -- $(BUILD_OPT_EXE) pnut.c > $(BUILD_DIR)/pnut-exe-bootstrapped.awk + @if ! diff $(BUILD_DIR)/pnut-exe.awk $(BUILD_DIR)/pnut-exe-bootstrapped.awk >/dev/null 2>&1; then \ + echo "FAILURE: Bootstrap scripts differ"; \ + exit 1; \ + fi + @echo "Success!" + +# Bootstrap pnut-exe from pnut-exe.sh +bootstrap-pnut-exe-from-shell: pnut-exe.sh pnut-exe-bootstrapped @echo "Bootstrapping pnut-exe from pnut-exe.sh..." $(TIMEC) $(BOOTSTRAP_SHELL) $(BUILD_DIR)/pnut-exe.sh $(BUILD_OPT_EXE) pnut.c -o $(BUILD_DIR)/pnut-exe-bootstrapped-again @if ! diff $(BUILD_DIR)/pnut-exe-bootstrapped $(BUILD_DIR)/pnut-exe-bootstrapped-again >/dev/null 2>&1; then \ @@ -260,7 +317,17 @@ bootstrap-pnut-exe: pnut-exe.sh pnut-exe-bootstrapped fi @echo "Success!" -bootstrap-pnut-exe-no-shell: pnut-exe-bootstrapped +# Bootstrap pnut-exe from pnut-exe.awk +bootstrap-pnut-exe-from-awk: pnut-exe.awk pnut-exe-bootstrapped + @echo "Bootstrapping pnut-exe from pnut-exe.awk..." + $(TIMEC) $(BOOTSTRAP_AWK) -f $(BUILD_DIR)/pnut-exe.awk -- $(BUILD_OPT_EXE) pnut.c -o $(BUILD_DIR)/pnut-exe-bootstrapped-again + @if ! diff $(BUILD_DIR)/pnut-exe-bootstrapped $(BUILD_DIR)/pnut-exe-bootstrapped-again >/dev/null 2>&1; then \ + echo "FAILURE: Bootstrap executables differ"; \ + exit 1; \ + fi + @echo "Success!" + +bootstrap-pnut-exe: pnut-exe-bootstrapped @echo "Bootstrapping pnut-exe from pnut-exe..."
@$(RM) $(BUILD_DIR)/pnut-exe-bootstrapped-again # MacOS behaves differently if the file exists $(TIMEC) $(BUILD_DIR)/pnut-exe-bootstrapped $(BUILD_OPT_EXE) pnut.c -o $(BUILD_DIR)/pnut-exe-bootstrapped-again @@ -283,3 +350,17 @@ bootstrap-pnut-sh-with-pnut-exe: pnut-exe-bootstrapped pnut-sh.sh exit 1; \ fi @echo "Success!" + +# For completeness, bootstrap pnut-awk from pnut-exe, then recompile pnut-awk from +# the bootstrapped pnut-awk. +bootstrap-pnut-awk-with-pnut-exe: pnut-exe-bootstrapped pnut-awk.awk + @echo "Bootstrapping pnut-awk from pnut-exe..." + @$(RM) $(BUILD_DIR)/pnut-awk-from-pnut-exe # MacOS behaves differently if the file exists + $(TIMEC) $(BUILD_DIR)/pnut-exe $(BUILD_OPT_AWK) pnut.c -o $(BUILD_DIR)/pnut-awk-from-pnut-exe + @chmod +x $(BUILD_DIR)/pnut-awk-from-pnut-exe + $(BUILD_DIR)/pnut-awk-from-pnut-exe $(BUILD_OPT_AWK) pnut.c > $(BUILD_DIR)/pnut-awk-from-pnut-exe-again.awk + @if ! diff $(BUILD_DIR)/pnut-awk.awk $(BUILD_DIR)/pnut-awk-from-pnut-exe-again.awk >/dev/null 2>&1; then \ + echo "FAILURE: Bootstrap scripts differ"; \ + exit 1; \ + fi + @echo "Success!" diff --git a/README.md b/README.md index f2eeb7cf..cf3dd4d6 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ its shell version `pnut-exe.sh`: Compilation options can be used to change the generated shell script: -- `SH_ANNOTATE=1` includes the original C code in the generated shell +- `ANNOTATE_C_CODE=1` includes the original C code in the generated shell script. - `SH_COMPACT_RT=1` reduces the size of the runtime library at the cost of reduced I/O performance. @@ -63,7 +63,7 @@ Compilation options can be used to change the generated shell script: - `MINIMAL=1` support only the minimal set of C features required to bootstrap pnut, reducing the size of `pnut-sh.sh` and `pnut-exe.sh`. -They can be set using `make install SH_ANNOTATE=1 ...`. +They can be set using `make install ANNOTATE_C_CODE=1 ...`.
## How to Use @@ -161,8 +161,8 @@ the bootstrap script to include TCC and GCC is ongoing. ### Annotated Shell Scripts `pnut-sh` can include C code annotations in the generated shell scripts (with -the `SH_ANNOTATE=1` makefile option) to make them self-contained and easier to -audit. These annotations correspond to the original C source code, with the +the `ANNOTATE_C_CODE=1` makefile option) to make them self-contained and easier +to audit. These annotations correspond to the original C source code, with the lines inside inactive `#if`/`#ifdef` blocks removed, with each top-level shell declaration prefixed with its corresponding C code as comment. @@ -174,7 +174,7 @@ matches the embedded C code. This can be done with the following commands: ```shell # Generate pnut-sh.sh with annotations -> make pnut-sh.sh SH_ANNOTATE=1 +> make pnut-sh.sh ANNOTATE_C_CODE=1 # Extract C code > /bin/sh build/pnut-sh.sh -C build/pnut-sh.sh > build/pnut-sh.c # Recompile C code @@ -188,7 +188,7 @@ executable version of `pnut-exe`: ```shell # Generate pnut-exe.sh with annotations -> make pnut-exe.sh SH_ANNOTATE=1 +> make pnut-exe.sh ANNOTATE_C_CODE=1 # Extract C code > /bin/sh build/pnut-exe.sh -C build/pnut-exe.sh > build/pnut-exe.c # Recompile C diff --git a/analysis/measure-file-size.sh b/analysis/measure-file-size.sh index 4b0edf4b..7a083d65 100755 --- a/analysis/measure-file-size.sh +++ b/analysis/measure-file-size.sh @@ -37,19 +37,27 @@ expand_includes() { # $1 = output-name, $2 = options if echo "$2" | grep -q "\-DPNUT_BOOTSTRAP"; then ./$TEMP_DIR/pnut-sh-bootstrap pnut.c $2 > "$TEMP_DIR/$1.sh" ./$TEMP_DIR/pnut-sh-bootstrap "$TEMP_DIR/$1.c" $2 > "$TEMP_DIR/$1-preincluded.sh" + ./$TEMP_DIR/pnut-awk-bootstrap pnut.c $2 > "$TEMP_DIR/$1.awk" + ./$TEMP_DIR/pnut-awk-bootstrap "$TEMP_DIR/$1.c" $2 > "$TEMP_DIR/$1-preincluded.awk" else ./$TEMP_DIR/pnut-sh pnut.c $2 > "$TEMP_DIR/$1.sh" ./$TEMP_DIR/pnut-sh "$TEMP_DIR/$1.c" $2 > "$TEMP_DIR/$1-preincluded.sh" + ./$TEMP_DIR/pnut-awk pnut.c $2 > 
"$TEMP_DIR/$1.awk" + ./$TEMP_DIR/pnut-awk "$TEMP_DIR/$1.c" $2 > "$TEMP_DIR/$1-preincluded.awk" fi # Because we use the __FILE__ macro in pnut, the preincluded.sh file will have # a different path than the original file. We need to replace the path in the # preincluded file with the path of the original file. # Note: | is used as the delimiter because the path contains / - cat "$TEMP_DIR/$1-preincluded.sh" | sed "s|$TEMP_DIR/$1.c|pnut.c|" > "$TEMP_DIR/$1-preincluded-canonical.sh" + cat "$TEMP_DIR/$1-preincluded.sh" | sed "s|$TEMP_DIR/$1.c|pnut.c|" > "$TEMP_DIR/$1-preincluded-canonical.sh" + cat "$TEMP_DIR/$1-preincluded.awk" | sed "s|$TEMP_DIR/$1.c|pnut.c|" > "$TEMP_DIR/$1-preincluded-canonical.awk" diff -q "$TEMP_DIR/$1.sh" "$TEMP_DIR/$1-preincluded-canonical.sh" || \ { echo "Error: $1.sh and $1-preincluded-canonical.sh differ"; exit 1; } + + diff -q "$TEMP_DIR/$1.awk" "$TEMP_DIR/$1-preincluded-canonical.awk" || \ + { echo "Error: $1.awk and $1-preincluded-canonical.awk differ"; exit 1; } } included_files() { @@ -98,7 +106,7 @@ measure_size() { # $1 = output-name, $2 = options printf "By file (without comments or blank lines):\n" wc $cleaned_files printf "Expanded includes:\n" - wc "$TEMP_DIR/$1.c" "$TEMP_DIR/$1.sh" + wc "$TEMP_DIR/$1.c" "$TEMP_DIR/$1.sh" "$TEMP_DIR/$1.awk" printf "Ratio (Original): "; lines_ratio "$(wc -l < $TEMP_DIR/$1.sh)" "$(wc -l < $TEMP_DIR/$1.c)" printf "Ratio (Cleaned): "; lines_ratio "$(wc -l < $TEMP_DIR/$1.sh)" "$(wc -l $cleaned_files | tail -n 1 | awk '{print $1}')" @@ -111,11 +119,19 @@ gcc -o "$TEMP_DIR/pnut-includes" pnut.c -DDEBUG_EXPAND_INCLUDES gcc -o "$TEMP_DIR/pnut-sh" pnut.c -Dtarget_sh # Compile pnut-sh-bootstrap gcc -o "$TEMP_DIR/pnut-sh-bootstrap" pnut.c -Dtarget_sh -DPNUT_BOOTSTRAP +# Compile pnut-awk +gcc -o "$TEMP_DIR/pnut-awk" pnut.c -Dtarget_awk +# Compile pnut-awk-bootstrap +gcc -o "$TEMP_DIR/pnut-awk-bootstrap" pnut.c -Dtarget_awk -DPNUT_BOOTSTRAP # Measuring for pnut-sh measure_size "pnut-sh" "-Dtarget_sh" 
measure_size "pnut-minimal-sh" "-Dtarget_sh -DPNUT_BOOTSTRAP" +# Measuring for pnut-awk +measure_size "pnut-awk" "-Dtarget_awk" +measure_size "pnut-minimal-awk" "-Dtarget_awk -DPNUT_BOOTSTRAP" + # ...and for the other targets measure_size "pnut-i386_linux" "-Dtarget_i386_linux" measure_size "pnut-minimal-i386_linux" "-Dtarget_i386_linux -DPNUT_BOOTSTRAP" diff --git a/awk-runtime.c b/awk-runtime.c new file mode 100644 index 00000000..42ffad07 --- /dev/null +++ b/awk-runtime.c @@ -0,0 +1,413 @@ +// Produce the AWK runtime library + +#define DEFAULT_USE 0 + +// Bitwise operations + +bool runtime_use_and = DEFAULT_USE; +bool runtime_and_defined = false; +void runtime_and() { + if (runtime_and_defined++) return; + putstr("function and(a, b, r, m) {\n"); + putstr(" a = int(a); b = int(b)\n"); + putstr(" if (a < 0) a += 4294967296\n"); + putstr(" if (b < 0) b += 4294967296\n"); + putstr(" r = 0; m = 1\n"); + putstr(" while (a > 0 && b > 0) {\n"); + putstr(" if ((a % 2) == 1 && (b % 2) == 1) r += m\n"); + putstr(" a = int(a / 2); b = int(b / 2)\n"); + putstr(" m *= 2\n"); + putstr(" }\n"); + putstr(" if (r >= 2147483648) r -= 4294967296\n"); + putstr(" return r\n"); + putstr("}\n\n"); +} + +bool runtime_use_or = DEFAULT_USE; +bool runtime_or_defined = false; +void runtime_or() { + if (runtime_or_defined++) return; + putstr("function or(a, b, r, m) {\n"); + putstr(" a = int(a); b = int(b)\n"); + putstr(" if (a < 0) a += 4294967296\n"); + putstr(" if (b < 0) b += 4294967296\n"); + putstr(" r = 0; m = 1\n"); + putstr(" while (a > 0 || b > 0) {\n"); + putstr(" if ((a % 2) == 1 || (b % 2) == 1) r += m\n"); + putstr(" a = int(a / 2); b = int(b / 2)\n"); + putstr(" m *= 2\n"); + putstr(" }\n"); + putstr(" if (r >= 2147483648) r -= 4294967296\n"); + putstr(" return r\n"); + putstr("}\n\n"); +} + +bool runtime_use_xor = DEFAULT_USE; +bool runtime_xor_defined = false; +void runtime_xor() { + if (runtime_xor_defined++) return; + putstr("function xor(a, b, r, m) {\n"); + putstr(" a = 
int(a); b = int(b)\n"); + putstr(" if (a < 0) a += 4294967296\n"); + putstr(" if (b < 0) b += 4294967296\n"); + putstr(" r = 0; m = 1\n"); + putstr(" while (a > 0 || b > 0) {\n"); + putstr(" if ((a % 2) != (b % 2)) r += m\n"); + putstr(" a = int(a / 2); b = int(b / 2)\n"); + putstr(" m *= 2\n"); + putstr(" }\n"); + putstr(" if (r >= 2147483648) r -= 4294967296\n"); + putstr(" return r\n"); + putstr("}\n\n"); +} + +bool runtime_use_compl = DEFAULT_USE; +bool runtime_compl_defined = false; +void runtime_compl() { + if (runtime_compl_defined++) return; + putstr("function compl(a) {\n"); + putstr(" return -int(a) - 1\n"); + putstr("}\n\n"); +} + +bool runtime_use_lshift = DEFAULT_USE; +bool runtime_lshift_defined = false; +void runtime_lshift() { + if (runtime_lshift_defined++) return; + putstr("function lshift(a, b, r) {\n"); + putstr(" r = int(int(a) * (2 ^ int(b)))\n"); + putstr(" r = r % 4294967296\n"); + putstr(" if (r >= 2147483648) r -= 4294967296; else if (r < -2147483648) r += 4294967296\n"); // AWK % keeps the dividend's sign, so also wrap results below -2^31 + putstr(" return r\n"); + putstr("}\n\n"); +} + +bool runtime_use_rshift = DEFAULT_USE; +bool runtime_rshift_defined = false; +void runtime_rshift() { + if (runtime_rshift_defined++) return; + putstr("function rshift(a, b, r, i, m) {\n"); + putstr(" a = int(a); b = int(b)\n"); + putstr(" if (a >= 0) {\n"); + putstr(" r = int(a / (2 ^ b))\n"); + putstr(" } else {\n"); + putstr(" a += 4294967296\n"); + putstr(" r = int(a / (2 ^ b))\n"); + putstr(" # Sign extension\n"); + putstr(" m = 2147483648\n"); + putstr(" for (i = 0; i < b; i++) {\n"); + putstr(" r += m\n"); + putstr(" m = int(m / 2)\n"); + putstr(" }\n"); + putstr(" if (r >= 2147483648) r -= 4294967296\n"); + putstr(" }\n"); + putstr(" return r\n"); + putstr("}\n\n"); +} + +bool runtime_use_comma = DEFAULT_USE; +bool runtime_comma_defined = false; +void runtime_comma() { + if (runtime_comma_defined++) return; + putstr("function comma(v1, v2) {\n"); + putstr(" return v2\n"); + putstr("}\n\n"); +} + +// memory allocation + +bool runtime_use_malloc =
DEFAULT_USE; +bool runtime_malloc_defined = false; +void runtime_malloc() { + if (runtime_malloc_defined++) return; + putstr("function _malloc(size) {\n"); + putstr(" return (__ALLOC += size) - size;\n"); + putstr("}\n\n"); +} + +bool runtime_use_free = DEFAULT_USE; +bool runtime_free_defined = false; +void runtime_free() { + if (runtime_free_defined++) return; + putstr("function _free(ptr) {\n"); + putstr(" return 0\n"); + putstr("}\n\n"); +} + +// helpers + +bool runtime_use_defstr = DEFAULT_USE; +bool runtime_defstr_defined = false; +void runtime_defstr() { + if (runtime_defstr_defined++) return; + runtime_malloc(); + putstr("function unpack_string_to_buf(str, addr, len, chars, i, c, v) {\n"); + putstr(" len = split(str, chars, \"\")\n"); + putstr(" for (i = 1; i <= len; i++) {\n"); + putstr(" c = chars[i]\n"); + putstr(" if (c == \"\\\\\") {\n"); + putstr(" i++\n"); + putstr(" c = chars[i]\n"); + putstr(" if (c == \"0\") v = 0\n"); + putstr(" else if (c == \"n\") v = 10\n"); + putstr(" else if (c == \"r\") v = 13\n"); + putstr(" else if (c == \"t\") v = 9\n"); + putstr(" else if (c == \"v\") v = 11\n"); + putstr(" else if (c == \"f\") v = 12\n"); + putstr(" else if (c == \"a\") v = 7\n"); + putstr(" else if (c == \"b\") v = 8\n"); + putstr(" else if (c == \"\\\\\") v = 92\n"); + putstr(" else if (c == \"\\\"\") v = 34\n"); + putstr(" else if (c == \"'\") v = 39\n"); + putstr(" else if (c == \"$\") v = 36\n"); + putstr(" else if (c == \"`\") v = 96\n"); + putstr(" else v = ord[c]\n"); + putstr(" } else {\n"); + putstr(" v = ord[c]\n"); + putstr(" }\n"); + putstr(" _[addr++] = v\n"); + putstr(" }\n"); + putstr(" _[addr] = 0\n"); + putstr(" return addr\n"); + putstr("}\n"); + putstr("\n"); + putstr("function defstr(str, addr) {\n"); + putstr(" if (str in __str_cache) return __str_cache[str]\n"); + putstr(" addr = _malloc(length(str) + 1)\n"); + putstr(" unpack_string_to_buf(str, addr)\n"); + putstr(" __str_cache[str] = addr\n"); + putstr(" return addr\n"); + 
putstr("}\n\n"); +} + +// An implementation of puts, used to replace printf("%s", ...) calls. +bool runtime_use_put_pstr = DEFAULT_USE; +bool runtime_put_pstr_defined = false; +void runtime_put_pstr() { + if (runtime_put_pstr_defined++) return; + putstr("function _put_pstr(addr, c) {\n"); + putstr(" while ((c = _[addr]) != 0) {\n"); + putstr(" printf(\"%c\", c)\n"); + putstr(" addr++\n"); + putstr(" }\n"); + putstr("}\n"); + putstr("\n"); +} + +// Input / output + +bool runtime_use_open = DEFAULT_USE; +bool runtime_open_defined = false; +void runtime_open() { + if (runtime_open_defined++) return; + putstr("function get_pstr(addr, s, c) {\n"); + putstr(" s = \"\"\n"); + putstr(" while ((c = _[addr]) != 0) {\n"); + putstr(" s = s sprintf(\"%c\", c)\n"); + putstr(" addr++\n"); + putstr(" }\n"); + putstr(" return s\n"); + putstr("}\n"); + putstr("\n"); + putstr("function _open(path_ptr, flags, mode, path, fd) {\n"); + putstr(" path = get_pstr(path_ptr)\n"); + putstr(" fd = __next_fd++\n"); + putstr(" __rt_file[fd] = path\n"); + putstr(" if (flags == 1) { # O_WRONLY\n"); + putstr(" __rt_mode[fd] = \"w\"\n"); + putstr(" printf(\"\") > path # Truncate\n"); + putstr(" } else if (flags == 2) { # O_RDWR (not really supported, using append)\n"); + putstr(" __rt_mode[fd] = \"a\"\n"); + putstr(" } else {\n"); + putstr(" __rt_mode[fd] = \"r\"\n"); + putstr(" }\n"); + putstr(" return fd\n"); + putstr("}\n\n"); +} + +bool runtime_use_close = DEFAULT_USE; +bool runtime_close_defined = false; +void runtime_close() { + if (runtime_close_defined++) return; + putstr("function _close(fd) {\n"); + putstr(" if (fd > 2) close(__rt_file[fd])\n"); + putstr(" delete __rt_file[fd]\n"); + putstr(" delete __rt_mode[fd]\n"); + putstr(" delete __fgetc_idx[fd]\n"); + putstr(" delete __fgetc_len[fd]\n"); + putstr(" return 0\n"); + putstr("}\n\n"); +} + +bool runtime_use_write = DEFAULT_USE; +bool runtime_write_defined = false; +void runtime_write() { + if (runtime_write_defined++) return; + 
putstr("function _write(fd, buf_ptr, count, path, i) {\n"); + putstr(" path = __rt_file[fd]\n"); + putstr(" if (fd == 1) {\n"); + putstr(" for (i = 0; i < count; i++) printf(\"%c\", _[buf_ptr + i])\n"); + putstr(" } else if (fd == 2) {\n"); + putstr(" for (i = 0; i < count; i++) printf(\"%c\", _[buf_ptr + i]) > \"/dev/stderr\"\n"); + putstr(" } else {\n"); + putstr(" for (i = 0; i < count; i++) printf(\"%c\", _[buf_ptr + i]) > path\n"); + putstr(" }\n"); + putstr(" return count\n"); + putstr("}\n\n"); +} + +bool runtime_use_fgetc = DEFAULT_USE; +bool runtime_fgetc_defined = false; +void runtime_fgetc() { + if (runtime_fgetc_defined++) return; + putstr("function _fgetc(fd, path, status, line, i, len) {\n"); + putstr(" if (!(fd in __fgetc_idx) || __fgetc_idx[fd] > __fgetc_len[fd]) {\n"); + putstr(" path = __rt_file[fd]\n"); + putstr(" if (fd == 0) status = getline line\n"); + putstr(" else status = (getline line < path)\n"); + putstr(" if (status < 0) return -1 # Read error\n"); + putstr(" len = length(line)\n"); + putstr(" for (i = 1; i <= len; i++) __fgetc_buf[fd, i] = ord[substr(line, i, 1)]\n"); + putstr(" __fgetc_buf[fd, len + 1] = status >= 1 ? 
10 : -1 # Add newline if we read a line successfully, otherwise mark EOF\n"); + putstr(" __fgetc_len[fd] = len + 1\n"); + putstr(" __fgetc_idx[fd] = 1\n"); + putstr(" }\n"); + putstr(" return __fgetc_buf[fd, __fgetc_idx[fd]++]\n"); + putstr("}\n\n"); +} + +bool runtime_use_read = DEFAULT_USE; +bool runtime_read_defined = false; +void runtime_read() { + if (runtime_read_defined++) return; + runtime_use_fgetc = true; // Use _fgetc to read characters one by one + putstr("function _read(fd, buf_ptr, count, c, i) {\n"); + putstr(" for (i = 0; i < count; i++) {\n"); + putstr(" c = _fgetc(fd)\n"); + putstr(" if (c == -1) return i\n"); + putstr(" _[buf_ptr + i] = c\n"); + putstr(" }\n"); + putstr(" return count\n"); + putstr("}\n\n"); +} + +bool runtime_use_fopen = DEFAULT_USE; +bool runtime_fopen_defined = false; +void runtime_fopen() { + if (runtime_fopen_defined++) return; + runtime_open(); + putstr("function _fopen(path_ptr, mode_ptr, mode_str) {\n"); + putstr(" mode_str = get_pstr(mode_ptr)\n"); + putstr(" return _open(path_ptr, (mode_str == \"w\" ? 
1 : 0), 0)\n"); + putstr("}\n\n"); +} + +bool runtime_use_fclose = DEFAULT_USE; +bool runtime_fclose_defined = false; +void runtime_fclose() { + if (runtime_fclose_defined++) return; + runtime_close(); + putstr("function _fclose(fd) {\n"); + putstr(" return _close(fd)\n"); + putstr("}\n\n"); +} + +#ifdef AWK_INLINE_PUTCHAR + +bool runtime_use_putchar = DEFAULT_USE; +bool runtime_putchar_defined = false; +void runtime_putchar() { + if (runtime_putchar_defined++) return; + putstr("function _putchar(c) {\n"); + putstr(" printf(\"%c\", c)\n"); + putstr(" return 0\n"); + putstr("}\n\n"); +} + +#endif // AWK_INLINE_PUTCHAR + +// other stubs + +bool runtime_use_make_argv = DEFAULT_USE; +bool runtime_make_argv_defined = false; +void runtime_make_argv() { + if (runtime_make_argv_defined++) return; + runtime_malloc(); + runtime_defstr(); + putstr("function make_argv( i, argv_ptr) {\n"); + putstr(" argv_ptr = _malloc(ARGC)\n"); + putstr(" for (i = 0; i < ARGC; i++) {\n"); + putstr(" _[argv_ptr + i] = defstr(ARGV[i])\n"); + putstr(" }\n"); + putstr(" return argv_ptr\n"); + putstr("}\n\n"); +} + +bool runtime_use_exit = DEFAULT_USE; +bool runtime_exit_defined = false; +void runtime_exit() { + if (runtime_exit_defined++) return; + putstr("function _exit(status) {\n"); + putstr(" exit status\n"); + putstr("}\n\n"); +} + +#ifndef MINIMAL_RUNTIME + +bool runtime_use_getchar = DEFAULT_USE; +bool runtime_getchar_defined = false; +void runtime_getchar() { + if (runtime_getchar_defined++) return; + runtime_fgetc(); // _getchar calls _fgetc; emit it here because produce_runtime() tests runtime_use_fgetc before it calls us + putstr("function _getchar() {\n"); + putstr(" return _fgetc(0)\n"); + putstr("}\n\n"); +} + +#endif + +bool runtime_use_isatty = DEFAULT_USE; +bool runtime_isatty_defined = false; +void runtime_isatty() { + if (runtime_isatty_defined++) return; + putstr("function _isatty(fd) {\n"); + putstr(" return 0\n"); + putstr("}\n\n"); +} + +void produce_runtime() { + if (runtime_use_and) runtime_and(); + if (runtime_use_or) runtime_or(); + if (runtime_use_xor) 
runtime_xor(); + if (runtime_use_compl) runtime_compl(); + if (runtime_use_lshift) runtime_lshift(); + if (runtime_use_rshift) runtime_rshift(); + if (runtime_use_comma) runtime_comma(); + + if (runtime_use_malloc) runtime_malloc(); + if (runtime_use_free) runtime_free(); + if (runtime_use_defstr) runtime_defstr(); + if (runtime_use_put_pstr) runtime_put_pstr(); + if (runtime_use_open) runtime_open(); + if (runtime_use_close) runtime_close(); + if (runtime_use_read) runtime_read(); + if (runtime_use_write) runtime_write(); + if (runtime_use_fopen) runtime_fopen(); + if (runtime_use_fclose) runtime_fclose(); + if (runtime_use_fgetc) runtime_fgetc(); + if (runtime_use_make_argv) runtime_make_argv(); + +#ifdef AWK_INLINE_PUTCHAR + if (runtime_use_putchar) runtime_putchar(); +#endif +#ifdef AWK_INLINE_EXIT + if (runtime_use_exit) runtime_exit(); +#endif + +#ifndef MINIMAL_RUNTIME + if (runtime_use_getchar) runtime_getchar(); +#endif +#if !defined(MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) + if (runtime_use_isatty) runtime_isatty(); +#endif +} diff --git a/awk.c b/awk.c new file mode 100644 index 00000000..55bab365 --- /dev/null +++ b/awk.c @@ -0,0 +1,1374 @@ +// AWK codegen + +#include "awk-runtime.c" + +// Memory stats + +#ifdef PRINT_MEMORY_STATS +int max_text_alloc = 0; +int cumul_text_alloc = 0; +#endif + +// codegen + +// Rope-like text representation +#include "text.c" + +// Environment tracking +#include "env.c" + +// Environment tracking +#include "glo_decls.c" + +// Codegen context +bool main_defined = false; // If the main function is defined +bool init_block_open = false; // If we're inside an initialization block +int init_block_id = 0; // Identifier of the current initialization block + +// Place prototype of mutually recursive functions here + +typedef enum STMT_CTX { + // Default context + STMT_CTX_DEFAULT = 0, + // Indicates that the parent statement was a else statement so that if + // statement uses elif instead of if. 
+ STMT_CTX_ELSE_IF = 1, + // Indicates that we are in a switch statement where breaks mean the end of + // the conditional block. + STMT_CTX_SWITCH = 2, +} STMT_CTX; + +#define comp_rvalue(node) comp_rvalue_go((node), 0) +text comp_rvalue_go(ast node, int outer_op); +text comp_fun_call(ast node, ast params); +bool comp_body(ast node, STMT_CTX stmt_ctx); +bool comp_statement(ast node, STMT_CTX stmt_ctx); +void mark_mutable_variables_body(ast node); +void handle_enum_struct_union_type_decl(ast node); +ast handle_side_effects_go(ast node, bool executes_conditionally); + +// AWK-specific output functions +void print_awk_shebang() { + putstr("#!/usr/bin/awk -f\n"); +} + +void print_awk_comment(char *comment) { + putchar('#'); + putchar(' '); + print_text(wrap_str_lit(comment)); + putchar('\n'); +} + +void add_var_to_local_env(ast decl, enum BINDING kind) { + int ident_symbol = get_val_(IDENTIFIER, get_child__(DECL, IDENTIFIER, decl, 0)); + + // Make sure we're not shadowing an existing local variable + if (cgc_lookup_var(ident_symbol, cgc_locals)) { + dump_ident(ident_symbol); + fatal_error("Local variable shadowing is not supported."); + } + + // The var is not part of the environment, so we add it. 
+ cgc_add_local_var(kind, ident_symbol, get_child_(DECL, decl, 1)); +} + +text global_var(int ident_symbol) { + return string_concat(wrap_char('_'), wrap_str_pool(ident_symbol)); +} + +text local_var(int ident_symbol) { + return wrap_str_pool(ident_symbol); +} + +text env_var(ast ident) { + int binding; + int ident_symbol = get_val_(IDENTIFIER, ident); + if ((binding = cgc_lookup_var(ident_symbol, cgc_locals))) { + return local_var(ident_symbol); + } else { + return global_var(ident_symbol); + } +} + +#ifdef SUPPORT_STRUCT_UNION + +text struct_member_var(ast member_name_ident) { + return string_concat(wrap_str_lit("__"), wrap_str_pool(get_val_(IDENTIFIER, member_name_ident))); +} + +text struct_sizeof_var(ast struct_name_ident) { + return string_concat(wrap_str_lit("__sizeof__"), wrap_str_pool(get_val_(IDENTIFIER, struct_name_ident))); +} + +#endif + +text function_name(int ident_tok) { + return string_concat(wrap_char('_'), wrap_str_pool(ident_tok)); +} + +#ifdef AWK_SUPPORT_ADDRESS_OF + +// Unlike in the native backend, there are 2 ways to compile a lvalue. +// +// The first (comp_lvalue) returns the variable that represent the memory +// location, this is useful when we're assigning to the lvalue. +// The second (comp_lvalue_address) produces the address of the memory location. +// This is mostly used to implement &. +// +// This difference is important as local variables don't have a memory location +// so we can't take their address and so their lvalue is just their name. +text comp_lvalue_address(ast node) { + int op = get_op(node); + text sub1; + text sub2; + + if (op == IDENTIFIER) { + // This is currently not supported because we treat as globals the enums + // and other hardcoded constants which is not what we want. + // + // We need to integrate the bindings local used in the exe backend here so + // we can know more about variables other than "it's local" and "it's not + // local so it must be global". 
+ fatal_error("comp_rvalue_go: can't take the address of variable"); + return 0; + } else if (op == '[') { + sub1 = comp_rvalue(get_child_('[', node, 0)); + sub2 = comp_rvalue(get_child_('[', node, 1)); + return string_concat3(sub1, wrap_str_lit(" + "), sub2); + } else if (op == '*') { + return comp_rvalue(get_child_('*', node, 0)); + } +#ifdef SUPPORT_STRUCT_UNION + else if (op == ARROW) { + sub1 = comp_rvalue(get_child_(ARROW, node, 0)); + sub2 = struct_member_var(get_child_(ARROW, node, 1)); + return string_concat3(sub1, wrap_str_lit(" + "), sub2); + } +#endif + else if (op == CAST) { + return comp_lvalue_address(get_child_(CAST, node, 1)); + } else { + dump_node(node); + fatal_error("comp_lvalue_address: unknown lvalue"); + return 0; + } +} + +#endif + +text comp_lvalue(ast node) { + int op = get_op(node); + text sub1; + text sub2; + + if (op == IDENTIFIER) { + return env_var(node); + } else if (op == '[') { + sub1 = comp_rvalue(get_child_('[', node, 0)); + sub2 = comp_rvalue(get_child_('[', node, 1)); + return string_concat5(wrap_str_lit("_["), sub1, wrap_str_lit(" + "), sub2, wrap_str_lit("]")); + } else if (op == '*') { + sub1 = comp_rvalue(get_child_('*', node, 0)); + return string_concat3(wrap_str_lit("_[ "), sub1, wrap_str_lit(" ]")); + } +#ifdef SUPPORT_STRUCT_UNION + else if (op == ARROW) { + sub1 = comp_rvalue(get_child_(ARROW, node, 0)); + sub2 = struct_member_var(get_child_(ARROW, node, 1)); + return string_concat5(wrap_str_lit("_["), sub1, wrap_str_lit(" + "), sub2, wrap_str_lit("]")); + } +#endif + else if (op == CAST) { + return comp_lvalue(get_child_(CAST, node, 1)); + } else { + dump_node(node); + fatal_error("comp_lvalue: unknown lvalue"); + return 0; + } +} + +text op_to_str(int op) { + if (32 < op && op < 127) return string_concat3(wrap_char(' '), wrap_char(op), wrap_char(' ')); + else if (op == AMP_AMP) return wrap_str_lit(" && "); + else if (op == AMP_EQ) return wrap_str_lit(" &= "); + else if (op == BAR_BAR) return wrap_str_lit(" || "); + 
else if (op == BAR_EQ) return wrap_str_lit(" |= "); + else if (op == CARET_EQ) return wrap_str_lit(" ^= "); + else if (op == EQ_EQ) return wrap_str_lit(" == "); + else if (op == GT_EQ) return wrap_str_lit(" >= "); + else if (op == LSHIFT_EQ) return wrap_str_lit(" <<= "); + else if (op == LT_EQ) return wrap_str_lit(" <= "); + else if (op == MINUS_EQ) return wrap_str_lit(" -= "); + else if (op == EXCL_EQ) return wrap_str_lit(" != "); + else if (op == PERCENT_EQ) return wrap_str_lit(" %= "); + else if (op == PLUS_EQ) return wrap_str_lit(" += "); + else if (op == RSHIFT_EQ) return wrap_str_lit(" >>= "); + else if (op == SLASH_EQ) return wrap_str_lit(" /= "); + else if (op == STAR_EQ) return wrap_str_lit(" *= "); + else { + dump_op(op); + fatal_error("op_to_str: unexpected operator"); + return 0; + } +} + +// '&' || op == '|' || op == '^' || op == LSHIFT || op == RSHIFT +text function_op_to_str(int op) { + if (op == '&' || op == AMP_EQ) { + runtime_use_and = true; + return wrap_str_lit("and"); + } else if (op == '|' || op == BAR_EQ) { + runtime_use_or = true; + return wrap_str_lit("or"); + } else if (op == '^' || op == CARET_EQ) { + runtime_use_xor = true; + return wrap_str_lit("xor"); + } else if (op == '~') { + runtime_use_compl = true; + return wrap_str_lit("compl"); + } else if (op == LSHIFT || op == LSHIFT_EQ) { + runtime_use_lshift = true; + return wrap_str_lit("lshift"); + } else if (op == RSHIFT || op == RSHIFT_EQ) { + runtime_use_rshift = true; + return wrap_str_lit("rshift"); + } else if (op == ',') { + runtime_use_comma = true; + return wrap_str_lit("comma"); + } else { + dump_op(op); + fatal_error("function_op_to_str: unexpected operator"); + return 0; + } +} + +// Return true if the operator is associative. +// Associative operators can be chained without parentheses. 
+bool is_associative_operator(int op) { + return (op == '+') | (op == '*') | (op == '&') | (op == '|') | (op == '^') + | (op == EQ_EQ) | (op == AMP_AMP) | (op == BAR_BAR); +} + +text wrap_if_needed(text code, int outer_op, int inner_op) { + // Rough heuristic to determine if we need to wrap in parentheses. If we + // wanted to do this right, we'd track the left and right operators and + // use this information to determine if parentheses are needed. + if ( outer_op != 0 + && outer_op != '=' // Assignment has the lowest precedence so we never use parentheses + && (!is_associative_operator(inner_op) || inner_op != outer_op) // Adjacent associative operations don't need parentheses + ) { + return string_concat3(wrap_char('('), code, wrap_char(')')); + } else { + return code; + } +} + +text comp_assignment(ast lvalue, ast rvalue) { + text code_lvalue = comp_lvalue(lvalue); + text code_rvalue = comp_rvalue(rvalue); + return string_concat3(code_lvalue, wrap_str_lit(" = "), code_rvalue); +} + +text comp_rvalue_go(ast node, int outer_op) { + if (node == 0) return 0; + int op = get_op(node); + int nb_children = get_nb_children(node); + text sub1, sub2, sub3; + ast child0, child1, child2; + + if (nb_children >= 1) { child0 = get_child(node, 0); } + if (nb_children >= 2) { child1 = get_child(node, 1); } + if (nb_children >= 3) { child2 = get_child(node, 2); } + + if (nb_children == 0) { + if (op == INTEGER +#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE + || op == INTEGER_HEX || op == INTEGER_OCT +#endif + ) { + return wrap_integer(1, node); + } + else if (op == CHARACTER) { + // For characters, return ord["c"]: + return string_concat3(wrap_str_lit("ord[\""), escape_text(wrap_char(get_val_(CHARACTER, node)), false), wrap_str_lit("\"]")); + } else if (op == STRING) { + // For string, call defstr("...") to define the string and return its identifier + runtime_use_defstr = true; + return string_concat3(wrap_str_lit("defstr(\""), escape_text(wrap_str_pool(get_val_(STRING, node)), false), 
wrap_str_lit("\")")); + } else if (op == IDENTIFIER) { + return env_var(node); + } else { + dump_node(node); + fatal_error("comp_rvalue_go: unexpected operator"); + return 0; + } + } else if (nb_children == 1) { + if (op == '*') { + sub1 = comp_rvalue_go(child0, op); + return string_concat3(wrap_str_lit("_["), sub1, wrap_str_lit("]")); + } else if (op == '+') { + // +x is equivalent to x + return comp_rvalue_go(child0, outer_op); + } else if (op == '-' || op == '!') { + if (op == '-' && (get_op(child0) == INTEGER +#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE + || get_op(child0) == INTEGER_HEX || get_op(child0) == INTEGER_OCT // Test the operand's kind; op is '-' here +#endif + ) + ) { + return wrap_integer(-1, child0); + } else { + sub1 = comp_rvalue_go(child0, op); + return string_concat3(wrap_char(op), wrap_char(' '), sub1); // The space keeps "- -x" from lexing as AWK's "--x" decrement + } + } else if (op == '~') { + // Complement operator is a function in AWK + sub1 = comp_rvalue_go(child0, op); + return string_concat3(string_concat(function_op_to_str(op), wrap_char('(')), sub1, wrap_char(')')); + } else if (op == MINUS_MINUS_PRE) { + sub1 = comp_lvalue(child0); + return string_concat(wrap_str_lit("--"), sub1); + } else if (op == PLUS_PLUS_PRE) { + sub1 = comp_lvalue(child0); + return string_concat(wrap_str_lit("++"), sub1); + } else if (op == MINUS_MINUS_POST) { + sub1 = comp_lvalue(child0); + return string_concat(sub1, wrap_str_lit("--")); + } else if (op == PLUS_PLUS_POST) { + sub1 = comp_lvalue(child0); + return string_concat(sub1, wrap_str_lit("++")); + } +#ifdef SUPPORT_SIZEOF + else if (op == SIZEOF_KW) { + // child0 is either an abstract declaration or an expression + if (get_op(child0) == DECL) { + child0 = get_child_(DECL, child0, 1); // Get the type + switch (get_op(child0)) { + case INT_KW: + case SHORT_KW: + case LONG_KW: + case CHAR_KW: + case VOID_KW: + case ENUM_KW: + case '*': // If it's a pointer + return wrap_int(1); + +#ifdef SUPPORT_STRUCT_UNION + case STRUCT_KW: + return struct_sizeof_var(get_child__(STRUCT_KW, IDENTIFIER, child0, 1)); +#endif + default: + dump_node(child0); + 
dump_node(get_child(child0, 1)); + fatal_error("comp_rvalue_go: sizeof is not supported for this type or expression"); + return 0; + } + } else { + dump_node(child0); + fatal_error("comp_rvalue_go: sizeof is not supported for this type or expression"); + return 0; + } + } +#endif // SUPPORT_SIZEOF +#ifdef AWK_SUPPORT_ADDRESS_OF + else if (op == '&') { + return comp_lvalue_address(child0); + } +#endif + else { + dump_node(node); + fatal_error("comp_rvalue_go: unexpected operator"); + return 0; + } + } else if (nb_children == 2) { + if (op == '+' || op == '-' || op == '*' || op == '/' || op == '%') { // TODO: op == ',' + sub1 = comp_rvalue_go(child0, op); + sub2 = comp_rvalue_go(child1, op); + sub2 = string_concat3(sub1, op_to_str(op), sub2); + if (op == '/') { + // Division in AWK is floating point, so we need to cast to int + sub2 = string_concat3(wrap_str_lit("int("), sub2, wrap_char(')')); + } + return wrap_if_needed(sub2, outer_op, op); + } else if (op == '&' || op == '|' || op == '^' || op == LSHIFT || op == RSHIFT || op == ',') { + // These operators are functions in AWK + sub1 = comp_rvalue_go(child0, op); + sub2 = comp_rvalue_go(child1, op); + return wrap_if_needed( + string_concat( + function_op_to_str(op), + string_concat5(wrap_char('('), sub1, wrap_str_lit(", "), sub2, wrap_char(')'))), outer_op, op); + } else if (op == '=' || op == MINUS_EQ || op == PERCENT_EQ || op == PLUS_EQ || op == SLASH_EQ || op == STAR_EQ) { + sub1 = comp_lvalue(child0); + sub2 = comp_rvalue_go(child1, op); + return wrap_if_needed(string_concat3(sub1, op_to_str(op), sub2), outer_op, op); + } else if (op == AMP_EQ || op == BAR_EQ || op == CARET_EQ || op == LSHIFT_EQ || op == RSHIFT_EQ) { + sub1 = comp_lvalue(child0); + sub2 = comp_rvalue_go(child1, op); + return wrap_if_needed( + string_concat( + string_concat3(sub1, wrap_str_lit(" = "), function_op_to_str(op)), + string_concat5(wrap_char('('), sub1, wrap_str_lit(", "), sub2, wrap_char(')'))), + outer_op, '='); + } else if (op == 
'[') { // array indexing + sub1 = comp_rvalue_go(child0, '+'); + sub2 = comp_rvalue_go(child1, '+'); + return string_concat5(wrap_str_lit("_["), sub1, wrap_str_lit(" + "), sub2, wrap_str_lit("]")); + } +#ifdef SUPPORT_STRUCT_UNION + else if (op == ARROW) { // member access is implemented like array access + sub1 = comp_rvalue_go(child0, op); + sub2 = struct_member_var(child1); + return string_concat5(wrap_str_lit("_[ "), sub1, wrap_str_lit(" + "), sub2, wrap_str_lit(" ]")); + } +#endif + else if (op == EQ_EQ || op == EXCL_EQ || op == LT_EQ || op == GT_EQ || op == '<' || op == '>') { + sub1 = comp_rvalue_go(child0, op); + sub2 = comp_rvalue_go(child1, op); + return wrap_if_needed(string_concat3(sub1, op_to_str(op), sub2), outer_op, op); + } else if (op == CAST) { // Casts are no-op + return comp_rvalue_go(child1, op); + } else if (op == AMP_AMP || op == BAR_BAR) { + sub1 = comp_rvalue_go(child0, op); + sub2 = comp_rvalue_go(child1, op); + return wrap_if_needed(string_concat3(sub1, op_to_str(op), sub2), outer_op, op); + } else if (op == '(') { + return comp_fun_call(child0, child1); + } else { + dump_node(node); + fatal_error("comp_rvalue_go: unknown rvalue"); + return 0; + } + } else if (nb_children == 3) { + if (op == '?') { + sub1 = comp_rvalue_go(child0, op); + sub2 = comp_rvalue_go(child1, op); + sub3 = comp_rvalue_go(child2, op); + return wrap_if_needed(string_concat5(sub1, op_to_str(op), sub2, wrap_str_lit(" : "), sub3), outer_op, op); + } else { + dump_node(node); + fatal_error("comp_rvalue_go: unknown rvalue"); + return 0; + } + } else { + dump_node(node); + fatal_error("comp_rvalue_go: unknown rvalue"); + return 0; + } +} +#if defined(AWK_INLINE_PUTCHAR) || defined(AWK_INLINE_PRINTF) +text comp_putchar_inline(ast param) { + text res; + char c; + + if (get_op(param) == CHARACTER) { + c = get_val_(CHARACTER, param); + if ((c >= 32 && c <= 126) || c == '\n') { // Printable ASCII characters + newline + return string_concat3(wrap_str_lit("printf(\""), 
escape_text(wrap_char(c), true), wrap_str_lit("\")")); + } + } + + res = comp_rvalue(param); + return string_concat3(wrap_str_lit("printf(\"%c\", "), res, wrap_char(')')); +} +#endif + +#ifdef AWK_INLINE_PRINTF +// format_str is from the string pool so immutable +text printf_call(char *format_str, char *format_str_end, text params_text, bool escape) { + if (format_str == format_str_end) { + return 0; + } else { + return string_concat3(wrap_str_lit("printf("), + concatenate_strings_with( + string_concat3(wrap_char('"'), escape_text(wrap_str_imm(format_str, format_str_end), escape), wrap_char('"')), + params_text, + wrap_str_lit(", ")), + wrap_str_lit(")")); + } +} + +enum PRINTF_STATE { + PRINTF_STATE_FLAGS, + PRINTF_STATE_WIDTH, + PRINTF_STATE_PRECISION, + PRINTF_STATE_SPECIFIER +}; + +// _printf pulls a lot of dependencies from the runtime. In most cases the +// format string is known at compile time, and we can avoid calling printf by +// using the shell's printf instead. This function generates a sequence of shell +// printf and put_pstr equivalent to the given printf call. +void handle_printf_call(char *format_str, ast params) { + ast param = 0; // Next parameter, if any + char *format_start = format_str; + char *specifier_start; + // compiled parameters to be passed to printf + text params_text = 0, width_text = 0, precision_text = 0; + + bool mod = false; + bool has_width = false; + bool has_precision = false; + + enum PRINTF_STATE state = PRINTF_STATE_FLAGS; + + while (*format_str != '\0') { + // Param is consumed, get the next one + if (param == 0 && params != 0) { + param = car(params); + params = tail(params); + } + + if (mod) { + switch (*format_str) { + case ' ': case '#': case '+': case '-': case '0': // Flags + // Flags correspond to 0x20,0x23,0x2b,0x2d,0x30 which are spread over + // 16 bits meaning we can easily convert char -> bit if we wanted to. 
+ if (state != PRINTF_STATE_FLAGS) fatal_error("printf: flags must come before width and precision"); + break; + + // Width or precision literal + case '1': case '2': case '3': + case '4': case '5': case '6': + case '7': case '8': case '9': + if (state != PRINTF_STATE_FLAGS && state != PRINTF_STATE_PRECISION) fatal_error("printf: width or precision already specified"); + while ('0' <= *format_str && *format_str <= '9') format_str += 1; // Skip the rest of the number + has_width = state == PRINTF_STATE_FLAGS ? true : has_width; + has_precision = state == PRINTF_STATE_PRECISION ? true : has_precision; + state += 1; // Move to the next state (PRINTF_STATE_FLAGS => PRINTF_STATE_WIDTH, PRINTF_STATE_PRECISION => PRINTF_STATE_SPECIFIER) + format_str -= 1; // Reprocess non-numeric character + break; + + // Precision + case '.': + if (state >= PRINTF_STATE_PRECISION) fatal_error("printf: precision already specified"); + state = PRINTF_STATE_PRECISION; + break; + + case '*': + if (param == 0) fatal_error("printf: not enough parameters"); + if (state == PRINTF_STATE_FLAGS) { + width_text = comp_rvalue(param); + has_width = true; + } else if (state == PRINTF_STATE_PRECISION) { + precision_text = comp_rvalue(param); + has_precision = true; + } else { + fatal_error("printf: width or precision already specified"); + } + param = 0; + break; + + case '%': + if (state != PRINTF_STATE_FLAGS) fatal_error("printf: cannot use flags, width or precision with %%"); + mod = false; + break; + + // The following options are the same between the shell's printf and C's printf + case 'l': case 'd': case 'i': case 'o': case 'u': case 'x': case 'X': case 'c': + if (*format_str == 'l') { + while (*format_str == 'l') format_str += 1; // Skip the 'l' for long + if (*format_str != 'd' && *format_str != 'i' && *format_str != 'o' && *format_str != 'u' && *format_str != 'x' && *format_str != 'X') { + dump_string("format_str = ", specifier_start); + fatal_error("printf: unsupported format specifier"); + } 
+ } + + if (param == 0) fatal_error("printf: not enough parameters"); + params_text = concatenate_strings_with(params_text, width_text, wrap_str_lit(", ")); // Add width param if needed + params_text = concatenate_strings_with(params_text, precision_text, wrap_str_lit(", ")); // Add precision param if needed + params_text = concatenate_strings_with(params_text, comp_rvalue(param), wrap_str_lit(", ")); // Add the parameter + param = 0; // Consume param + mod = false; + break; + + // We can't pass a string to printf directly, it needs to be unpacked first. + case 's': + if (param == 0) fatal_error("printf: not enough parameters"); + runtime_use_put_pstr = true; + // If the format specifier has width or precision, we would have to pack the string and then call printf. + // Otherwise, we can call _put_pstr directly. + if (has_width || has_precision) { + fatal_error("printf: width and precision for strings not supported"); + } else { + // Generate printf call with what we have so far + append_glo_decl(printf_call(format_start, specifier_start, params_text, false)); params_text = 0; // Those params now belong to the call just emitted; start the next segment empty + // New format string starts after the % + format_start = format_str + 1; + // Compile printf("...%s...", str) to _put_pstr(str) + append_glo_decl(string_concat3(wrap_str_lit("_put_pstr("), comp_rvalue(param), wrap_char(')'))); + } + param = 0; // Consume param + mod = false; + break; + + default: + dump_string("format_str = ", specifier_start); + fatal_error("printf: unsupported format specifier"); + } + } else if (*format_str == '%') { + mod = true; + specifier_start = format_str; + // Reset the state machine + width_text = precision_text = has_width = has_precision = 0; + state = PRINTF_STATE_FLAGS; + } + + // Keep accumulating the format string + format_str += 1; + } + + // Dump the remaining format string + append_glo_decl(printf_call(format_start, format_str, params_text, false)); +} +#endif + +text comp_fun_call(ast name, ast params) { + if (get_op(name) != IDENTIFIER) { + dump_node(name); 
+ fatal_error("comp_rvalue_go: function name must be an identifier"); + } + int name_id = get_val_(IDENTIFIER, name); + text code_params = 0; + ast param; + +#ifdef AWK_INLINE_PRINTF + if (((name_id == PUTS_ID || name_id == PUTSTR_ID || name_id == PRINTF_ID) + && (param = list_singleton(params)) != 0 + && get_op(param) == STRING)) { // puts("..."), putstr("..."), printf("...") + return printf_call(symbol_buf(get_val_(STRING, param)), 0, 0, true); + } else if (name_id == PRINTF_ID && params != 0 && get_op(car(params)) == STRING) { // printf("...", ...) + handle_printf_call(symbol_buf(get_val_(STRING, car(params))), tail(params)); + return 0; + } +#ifdef AWK_INLINE_PUTCHAR + else if (name_id == PUTCHAR_ID && (param = list_singleton(params)) != 0) { // putchar with 1 param + return comp_putchar_inline(param); + } +#endif +#ifdef AWK_INLINE_EXIT + else if (name_id == EXIT_ID && (param = list_singleton(params)) != 0) { // exit with 1 param + return string_concat3(wrap_str_lit("exit("), comp_rvalue(param), wrap_str_lit(")")); + } +#endif +#endif + + if (name_id == MALLOC_ID) { runtime_use_malloc = true; } + else if (name_id == FREE_ID) { runtime_use_free = true; } + else if (name_id == FOPEN_ID) { runtime_use_fopen = true; } + else if (name_id == FCLOSE_ID) { runtime_use_fclose = true; } + else if (name_id == FGETC_ID) { runtime_use_fgetc = true; } + else if (name_id == READ_ID) { runtime_use_read = true; } + else if (name_id == WRITE_ID) { runtime_use_write = true; } + else if (name_id == OPEN_ID) { runtime_use_open = true; } + else if (name_id == CLOSE_ID) { runtime_use_close = true; } +#ifndef AWK_INLINE_PUTCHAR + else if (name_id == PUTCHAR_ID) { runtime_use_putchar = true; } +#endif +#ifndef AWK_INLINE_EXIT + else if (name_id == EXIT_ID) { runtime_use_exit = true; } +#endif +#ifndef MINIMAL_RUNTIME + else if (name_id == GETCHAR_ID) { runtime_use_getchar = true; } +#endif +#if !defined(MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) + else if (name_id == ISATTY_ID) 
{ runtime_use_isatty = true; } +#endif + + while (params != 0) { + code_params = concatenate_strings_with(code_params, comp_rvalue(car(params)), wrap_str_lit(", ")); + params = tail(params); + } + + return string_concat4( function_name(get_val_(IDENTIFIER, name)) + , wrap_char('(') + , code_params + , wrap_char(')')); +} + +bool comp_body(ast node, STMT_CTX stmt_ctx) { + int start_cgc_locals = cgc_locals; + + while (node != 0) { + if (comp_statement(get_child_('{', node, 0), stmt_ctx)) break; // Statement always returns => block is terminated + node = get_child_('{', node, 1); + } + + cgc_locals = start_cgc_locals; + return node != 0; // If node is not null, it means the block was terminated early +} + + +// Assemble switch pattern from case and default statements. +// Case and default statements are like labelled statements, meaning that they +// wrap the next statement. This function unwraps the next statements until a +// non-case statement is found. +// Because the non-case statement must be compiled as well, it is returned via +// the last_stmt global variable. +ast last_stmt; +text make_switch_pattern(ast statement, text scrutinee_text) { + text str = 0; + + while (1) { // statement will never be null + switch (get_op(statement)) { + case DEFAULT_KW: + str = wrap_int(1); // Default case always matches + statement = get_child_(DEFAULT_KW, statement, 0); + break; + + case CASE_KW: + // This is much more permissive than what a C compiler would allow, + // but Shell allows matching on arbitrary expression in case + // patterns so it's fine. If we wanted to do this right, we'd check + // that the pattern is a numeric literal or an enum identifier. + str = concatenate_strings_with(str, + string_concat3(comp_rvalue(get_child_(CASE_KW, statement, 0)), wrap_str_lit(" == "), scrutinee_text), + wrap_str_lit(" || ")); + statement = get_child_(CASE_KW, statement, 1); + break; + + default: + if (str == 0) fatal_error("Expected case in switch. 
Fallthrough is not supported."); + last_stmt = statement; + return str; + } + } +} + +// Warning: because pnut-awk doesn't support temporary variables, it uses a +// hardcoded global variable __scrutinee to hold the scrutinee value when +// needed. I _think_ this is fine for nested and reentrant switch statements +// since the scrutinee value is only needed to dispatch to the correct case, +// and can be overwritten in the body of the case statement (where nested switches +// and reentrancy would happen). +bool comp_switch(ast node) { + ast statement; + int start_cgc_locals = cgc_locals; + bool first_case = true; // Whether we're compiling the first case/default statement or subsequent ones + + text scrutinee_text = comp_rvalue(get_child_(SWITCH_KW, node, 0)); + switch (get_op(get_child_(SWITCH_KW, node, 0))) { + case IDENTIFIER: + case INTEGER: + // For "atomic" scrutinees, use them directly in the comparisons + break; + default: + // Otherwise, evaluate the scrutinee into a temporary variable + append_glo_decl(string_concat(wrap_str_lit("__scrutinee = "), scrutinee_text)); + scrutinee_text = wrap_str_lit("__scrutinee"); + } + + cgc_add_enclosing_switch(false); + + nest_level += 1; + + node = get_child_(SWITCH_KW, node, 1); + + if(get_op(node) == CASE_KW) { + // This is for the edge case where the entire 'statement' part of < switch ( expression ) statement > + // is a single < case constant-expression : statement > + // therefore we wrap the case statement with a block statement to simplify down to the typical syntax + node = new_ast2('{', node, 0); + } + + if (node == 0 || get_op(node) != '{') fatal_error("comp_statement: switch without body"); + while (get_op(node) == '{') { + statement = get_child_('{', node, 0); + node = get_child_('{', node, 1); + + append_glo_decl(string_concat3( + wrap_str_lit(first_case ? 
"if (" : "} else if ("), + make_switch_pattern(statement, scrutinee_text), + wrap_str_lit(") {") + )); + first_case = false; + statement = last_stmt; // last_stmt is set by make_switch_pattern + + nest_level += 1; + + // We keep compiling statements until we encounter a statement that returns or breaks. + // Case and default nodes contain the first statement of the block so we process that one first. + if (!comp_statement(statement, STMT_CTX_SWITCH)) { + while (get_op(node) == '{') { + statement = get_child_('{', node, 0); + node = get_child_('{', node, 1); + if (comp_statement(statement, STMT_CTX_SWITCH)) break; + } + } + + nest_level -= 1; + } + + nest_level -= 1; + append_glo_decl(wrap_str_lit("}")); // End of emulated case statement + + cgc_locals = start_cgc_locals; + + return false; +} + +bool comp_if(ast node, STMT_CTX stmt_ctx) { + int start_glo_decl_idx; + bool termination_lhs = false; + bool termination_rhs = false; + int start_cgc_locals = cgc_locals; + + bool else_if = stmt_ctx & STMT_CTX_ELSE_IF; + stmt_ctx = stmt_ctx & ~STMT_CTX_ELSE_IF; // Clear STMT_CTX_ELSE_IF bit to not pass it to the next if statement + + append_glo_decl(string_concat3( + wrap_str_lit(else_if ? 
"} else if (" : "if ("), + comp_rvalue(get_child_(IF_KW, node, 0)), + wrap_str_lit(") {") + )); + + nest_level += 1; + start_glo_decl_idx = glo_decl_ix; + termination_lhs = comp_statement(get_child_(IF_KW, node, 1), stmt_ctx); + nest_level -= 1; + + if (get_child_(IF_KW, node, 2) != 0) { + // Compile sequence of if else if using elif + if (get_op(get_child_(IF_KW, node, 2)) == IF_KW) { + termination_rhs = comp_if(get_child_(IF_KW, node, 2), stmt_ctx | STMT_CTX_ELSE_IF); // STMT_CTX_ELSE_IF => next if stmt will use elif + } else { + append_glo_decl(wrap_str_lit("} else {")); + nest_level += 1; + start_glo_decl_idx = glo_decl_ix; + termination_rhs = comp_statement(get_child_(IF_KW, node, 2), stmt_ctx & ~STMT_CTX_ELSE_IF); // Clear STMT_CTX_ELSE_IF bit + if (!any_active_glo_decls(start_glo_decl_idx)) append_glo_decl(wrap_char(':')); + nest_level -= 1; + } + } + if (!else_if) append_glo_decl(wrap_str_lit("}")); + + if (stmt_ctx & STMT_CTX_SWITCH && termination_lhs ^ termination_rhs) { + fatal_error("Early break out of a switch case is unsupported"); + } + + cgc_locals = start_cgc_locals; + + return termination_lhs && termination_rhs; +} + +bool comp_break() { + int binding = cgc_lookup_enclosing_loop_or_switch(cgc_locals); + if (binding == 0) fatal_error("comp_statement: break not in loop or switch"); + if (binding_kind(binding) == BINDING_LOOP) { + append_glo_decl(wrap_str_lit("break")); + } + return true; +} + +bool comp_continue() { + int binding = cgc_lookup_enclosing_loop(cgc_locals); + if (binding == 0) fatal_error("comp_statement: continue not in loop"); + // We could remove the continue when in tail position, but it's not worth doing + append_glo_decl(wrap_str_lit("continue")); + return false; +} + +bool comp_return(ast return_value) { + if (return_value != 0) { + append_glo_decl(string_concat(wrap_str_lit("return "), comp_rvalue(return_value))); + } else { + append_glo_decl(wrap_str_lit("return")); + } + return true; +} + +// Since global and internal 
variables are prefixed with _, we restrict the name +// of variables to not start with _. +// Also, the AWK backend doesn't support variables with aggregate types. +void assert_var_decl_is_safe(ast variable, bool local) { // Helper function for assert_idents_are_safe + ast ident_symbol = get_val_(IDENTIFIER, get_child__(DECL, IDENTIFIER, variable, 0)); + char* name = symbol_buf(ident_symbol); + ast type = get_child_(DECL, variable, 1); + if (name[0] == '_') { // Underscore is used to prefix global and internal variables + dump_string("Variable name: ", name); + fatal_error("variable name is invalid. It can't start with '_'."); + } + + // FIXME: AWK has special variables that can't be used as regular variables. + + if (local) { + // Local variables don't correspond to memory locations, and can't store more than 1 number/pointer. + if (get_op(type) == '[' +#ifdef SUPPORT_STRUCT_UNION + || get_op(type) == STRUCT_KW +#endif + ) { + dump_string("Variable name: ", name); + fatal_error("local array/struct value type is not supported for AWK backend. Use a reference type instead."); + } + } else { + // Arrays of structs and struct value types are not supported for now. + // When we have type information on the local and global variables, we'll + // be able to generate the correct code for these cases. + if ( (get_op(type) == '[' && get_op(get_child_('[', type, 0)) == '[') // Array of arrays +#ifdef SUPPORT_STRUCT_UNION + || (get_op(type) == '[' && get_op(get_child_('[', type, 0)) == STRUCT_KW) // Array of structs + || get_op(type) == STRUCT_KW // Struct value type +#endif + ) { + dump_string("Variable name: ", name); + fatal_error("global array of struct and struct value type are not supported in AWK backend. Use a reference type instead."); + } + } +} + +void comp_var_decls(ast node) { + ast var_decl; + +#ifdef SUPPORT_TYPE_SPECIFIERS + switch (get_child_(DECLS, node, 1)) { + // AUTO_KW and REGISTER_KW can simply be ignored. 
+ case EXTERN_KW: + case STATIC_KW: + fatal_error("Extern and static storage class specifier not supported on local variables"); + break; + } +#endif + node = get_child_opt_(DECLS, LIST, node, 0); + while (node != 0) { + // Add to local env and cummulative env, then initialize + var_decl = car_(DECL, node); + assert_var_decl_is_safe(var_decl, true); + add_var_to_local_env(var_decl, BINDING_VAR_LOCAL); + if (get_child_(DECL, var_decl, 2) != 0) { // Initializer + append_glo_decl(comp_assignment(get_child__(DECL, IDENTIFIER, var_decl, 0), get_child_(DECL, var_decl, 2))); + } + node = tail(node); // Next variable + } +} + +// Returns whether the statement always returns/breaks. +// This is used to delimit the end of conditional blocks of switch statements. +bool comp_statement(ast node, STMT_CTX stmt_ctx) { + int op; + text str; + int start_cgc_locals = cgc_locals; + + if (node == 0) return false; // Empty statement never returns + + op = get_op(node); + + if (op == IF_KW) { + return comp_if(node, stmt_ctx); + } else if (op == WHILE_KW) { + cgc_add_enclosing_loop(); + append_glo_decl(string_concat3(wrap_str_lit("while ("), + comp_rvalue(get_child_(WHILE_KW, node, 0)), + wrap_str_lit(") {"))); + nest_level += 1; + comp_statement(get_child_(WHILE_KW, node, 1), stmt_ctx); + nest_level -= 1; + append_glo_decl(wrap_str_lit("}")); + cgc_locals = start_cgc_locals; + return false; +#ifdef SUPPORT_DO_WHILE + } else if (op == DO_KW) { + cgc_add_enclosing_loop(); + append_glo_decl(wrap_str_lit("do {")); + nest_level += 1; + comp_statement(get_child_(DO_KW, node, 0), stmt_ctx); + nest_level -= 1; + append_glo_decl(string_concat3(wrap_str_lit("} while ("), + comp_rvalue(get_child_(DO_KW, node, 1)), + wrap_str_lit(");"))); + cgc_locals = start_cgc_locals; + return false; +#endif + } else if (op == FOR_KW) { + cgc_add_enclosing_loop(); + str = comp_rvalue(get_child_(FOR_KW, node, 0)); + str = string_concat(str, wrap_str_lit("; ")); + str = string_concat(str, 
comp_rvalue(get_child_(FOR_KW, node, 1))); + str = string_concat(str, wrap_str_lit("; ")); + str = string_concat(str, comp_rvalue(get_child_(FOR_KW, node, 2))); + append_glo_decl(string_concat3(wrap_str_lit("for ("), + str, + wrap_str_lit(") {"))); + nest_level += 1; + comp_statement(get_child_(FOR_KW, node, 3), stmt_ctx); + nest_level -= 1; + append_glo_decl(wrap_str_lit("}")); + cgc_locals = start_cgc_locals; + return false; + } else if (op == SWITCH_KW) { + return comp_switch(node); + } else if (op == BREAK_KW) { + return comp_break(); // Break out of switch statement + } else if (op == CONTINUE_KW) { + return comp_continue(); // Continue to next iteration of loop + } else if (op == RETURN_KW) { + return comp_return(get_child_(RETURN_KW, node, 0)); + } else if (op == '(') { // Function call + append_glo_decl(comp_fun_call(get_child_('(', node, 0), get_child_('(', node, 1))); + return false; + } else if (op == '{') { // Compound statement + return comp_body(node, stmt_ctx); +#ifdef SUPPORT_GOTO + } else if (op == ':') { // Labelled statement + // Labelled statement are not very useful as gotos are not supported in the + // AWK backend, but we still emit a label comment for readability. + append_glo_decl(string_concat3(wrap_str_lit("#_ "), wrap_str_pool(get_val_(IDENTIFIER, get_child_(':', node, 0))), wrap_char(':'))); + return comp_statement(get_child_(':', node, 1), stmt_ctx); + } else if (op == GOTO_KW) { + fatal_error("goto statements not supported"); + return false; +#endif + } else if (get_op(node) == CASE_KW || get_op(node) == DEFAULT_KW) { + fatal_error("case/default must be at the beginning of a switch conditional block"); + return false; + } else if (op == DECLS) { + comp_var_decls(node); + return false; + } else { + append_glo_decl(comp_rvalue(node)); + return false; + } +} + +text comp_local_variables() { + // From cgc_locals, generate code to declare local variables as extra function + // parameters. 
+ text params_text = 0; + int env = cgc_locals_fun; + while (env != 0) { + params_text = concatenate_strings_with( local_var(binding_ident(env)) + , params_text + , wrap_str_lit(", ")); + env = binding_next(env); + } + + return params_text; +} + +void handle_function_params(ast lst) { + while (lst != 0) { + ast decl = car_(DECL, lst); + assert_var_decl_is_safe(decl, true); + add_var_to_local_env(decl, BINDING_PARAM_LOCAL); + lst = tail(lst); + } +} + +void comp_glo_fun_decl(ast node) { + ast fun_decl = get_child__(FUN_DECL, DECL, node, 0); + ast body = get_child_opt_(FUN_DECL, '{', node, 1); + ast name_symbol = get_val_(IDENTIFIER, get_child__(DECL, IDENTIFIER, fun_decl, 0)); + ast fun_type = get_child__(DECL, '(', fun_decl, 1); + ast params = get_child_opt_('(', LIST, fun_type, 1); + int local_vars_decl_fixup; + + if (body == -1) return; // ignore forward declarations + + handle_function_params(params); + + // If the function is main + if (name_symbol == MAIN_ID) { + main_defined = true; + // If main has parameters, we'll prepare the argc/argv values in the epilogue. 
+ if (params != 0) runtime_use_make_argv = true; + } + + local_vars_decl_fixup = append_glo_decl_fixup(); // Fixup is done after compiling body + + nest_level += 1; + comp_body(body, STMT_CTX_DEFAULT); + nest_level -= 1; + append_glo_decl(wrap_str_lit("}\n")); + + // Fixup local variable declarations + + text fun_decl_text = string_concat5( + wrap_str_lit("function "), + function_name(name_symbol), + wrap_str_lit("("), + comp_local_variables(), + wrap_str_lit(") {") + ); + fixup_glo_decl(local_vars_decl_fixup, fun_decl_text); +} + + +void comp_glo_var_decl(ast node) { + ast name = get_child__(DECL, IDENTIFIER, node, 0); + ast type = get_child_(DECL, node, 1); + ast init = get_child_(DECL, node, 2); + int arr_len; + + if (get_op(type) == '(') return; // Ignore function declarations + + // TODO: Add enum/struct/union to env if it's not already there + // handle_enum_struct_union_type_decl(type); + + assert_var_decl_is_safe(node, false); + + if (get_op(type) == '[') { // Array declaration + arr_len = get_child_('[', type, 1); + + if (arr_len == 0) { + fatal_error("Array declaration without size or initializer list"); + } else if (init != 0) { + fatal_error("Array declaration with initializer list not supported"); + } + + runtime_use_malloc = true; + + append_glo_decl( + string_concat4( + global_var(get_val_(IDENTIFIER, name)), + wrap_str_lit(" = _malloc("), + wrap_int(arr_len), + wrap_str_lit(")") + )); + } else { + if (init == 0) init = new_ast0(INTEGER, 0); + append_glo_decl(comp_assignment(name, init)); + } +} + +void comp_assignment_constant(text constant_name, ast rhs) { + append_glo_decl(string_concat3(constant_name, wrap_char('='), comp_rvalue(rhs))); +} + +// Enums are just like global variables, but they are readonly. +// Since anything that's not a local variable is considered global, this makes +// it easy to implement enums. 
+void comp_enum_cases(ast ident, ast cases) { + ast cas; + if (ident != 0) { + append_glo_decl(string_concat3(wrap_str_lit("#_ "), wrap_str_pool(get_val_(IDENTIFIER, ident)), wrap_str_lit(" enum declaration"))); + } else { + append_glo_decl(wrap_str_lit("#_ Enum declaration")); + } + + while (cases != 0) { + cas = car_('=', cases); + comp_assignment_constant(global_var(get_val_(IDENTIFIER, get_child__('=', IDENTIFIER, cas, 0))), get_child_('=', cas, 1)); + cases = tail(cases); + } +} + +#ifdef SUPPORT_STRUCT_UNION + +// Struct member access is implemented like array indexing. Each member is mapped +// to a readonly variable containing the offset of the member and accessing to +// s->a is equivalent to *(s + a). +// +// For example, for the struct: +// +// struct Point { +// int x; +// int y; +// } +// +// Point *p = malloc(sizeof(Point)); +// p->y = 42; +// +// The following code is generated: +// +// readonly __x=0 +// readonly __y=1 +// readonly __sizeof__Point=2 +// +// _malloc p $((__sizeof__Point)) +// : $(( _$((p + __x)) = 42 )) +// +// This approach doesn't work when the same member name is used in different +// structs, but it makes for readable code and is simple to implement. +// Because the member offset variables are declared as readonly, name conflicts +// will result in a runtime error when the shell program initializes. +void comp_struct(ast ident, ast members) { + ast decl; + int offset = new_ast0(INTEGER, 0); + int field_type; + if (ident != 0) { + append_glo_decl(string_concat3(wrap_str_lit("#_ "), wrap_str_pool(get_val_(IDENTIFIER, ident)), wrap_str_lit(" struct member declarations"))); + } else { + append_glo_decl(wrap_str_lit("#_ Struct member declarations")); + } + while (members != 0) { + decl = car_(DECL, members); + members = tail(members); + field_type = get_child_(DECL, decl, 1); + // Arrays and struct value types are not supported for now. 
+ // When we have type information on the local and global variables, we'll + // be able to generate the correct code for these cases. + if (get_op(field_type) == '[' || get_op(field_type) == STRUCT_KW) { + fatal_error("Nested structures not supported by shell backend. Use a reference type instead."); + } + + comp_assignment_constant(struct_member_var(get_child_opt_(DECL, IDENTIFIER, decl, 0)), offset); + set_val(offset, get_val_(INTEGER, offset) - 1); + } + + if (ident != 0) { + comp_assignment_constant(struct_sizeof_var(ident), offset); + } + + append_glo_decl(0); // newline +} + +#endif + +void handle_enum_struct_union_type_decl(ast type) { + if (get_op(type) == ENUM_KW) { + comp_enum_cases(get_child_opt_(ENUM_KW, IDENTIFIER, type, 1), get_child_(ENUM_KW, type, 2)); + } +#ifdef SUPPORT_STRUCT_UNION + else if (get_op(type) == STRUCT_KW) { + comp_struct(get_child_opt_(STRUCT_KW, IDENTIFIER, type, 1), get_child_(STRUCT_KW, type, 2)); + } else if (get_op(type) == UNION_KW) { + fatal_error("handle_enum_struct_union_type_decl: union not supported"); + } +#endif + + // If not an enum, struct, or union, do nothing +} + +// For now, we don't do anything with the declarations in a typedef. +// The only thing we need to do is to call handle_enum_struct_union_type_decl +// on the type specifier. +void handle_typedef(ast node) { + ast decls = get_child__(TYPEDEF_KW, LIST, node, 0); + ast decl = car_(DECL, decls); + ast type = get_child_(DECL, decl, 1); + + handle_enum_struct_union_type_decl(get_type_specifier(type)); +} + +// This function compiles 1 top level declaration at the time. 
+// The supported top level declarations are: +// - global variable declarations +// - global variable assignments +// - function declarations +// - enum declarations +// - struct declarations +void comp_glo_decl(ast node) { + ast declarations; + int op = get_op(node); + + // Open init block if not already opened + if (op != FUN_DECL) { + // In AWK, assignments outside functions don't do anything, only assignments + // in functions have effect. Therefore, we wrap global variable declarations + // in setup functions, that are called at the start of main, with each + // setup function calling the previous one before initializing its own + // variables, so that variables are initialized in the correct order. + if (!init_block_open) { + init_block_open = true; + init_block_id += 1; + append_glo_decl(string_concat3( + wrap_str_lit("function setup_"), + wrap_int(init_block_id), + wrap_str_lit("() {") + )); + if (init_block_id > 1) { + append_glo_decl(string_concat3( + wrap_str_lit(" setup_"), + wrap_int(init_block_id - 1), + wrap_str_lit("()") + )); + } + nest_level += 1; + } + } else { + // Close init block if opened + if (init_block_open) { + init_block_open = false; + nest_level -= 1; + append_glo_decl(wrap_str_lit("}\n")); + } + } + + if (op == DECLS) { // Variable declarations + // AUTO_KW and REGISTER_KW can simply be ignored. STATIC_KW is the default + // storage class for global variables since pnut-sh only supports 1 + // translation unit. 
+#ifdef SUPPORT_TYPE_SPECIFIERS + if (get_child_(DECLS, node, 1) == EXTERN_KW) fatal_error("Extern storage class specifier not supported"); +#endif + declarations = get_child__(DECLS, LIST, node, 0); + while (declarations != 0) { // Multiple variable declarations + comp_glo_var_decl(car_(DECL, declarations)); + declarations = tail(declarations); + } + } else if (op == FUN_DECL) { + comp_glo_fun_decl(node); + } else if (op == TYPEDEF_KW) { + handle_typedef(node); + } else if (op == ENUM_KW +#ifdef SUPPORT_STRUCT_UNION + || op == STRUCT_KW || op == UNION_KW +#endif + ) { + handle_enum_struct_union_type_decl(node); + } else { + dump_node(node); + fatal_error("comp_glo_decl: unexpected declaration"); + } +} + + +// Required codegen interface functions +void codegen_begin() { + print_awk_shebang(); + putchar('\n'); +} + +void codegen_glo_decl(ast decl) { +#ifndef ONE_PASS_GENERATOR_NO_EARLY_OUTPUT + // Reset text and glo decls buffers + glo_decl_ix = 0; + text_alloc = 1; +#endif + + // Reset local environment + cgc_locals = cgc_locals_fun = 0; + cgc_fs = 1; // 1 to account for the return location parameter + + comp_glo_decl(decl); +#ifndef ONE_PASS_GENERATOR_NO_EARLY_OUTPUT + print_glo_decls(); +#endif + +#ifdef PRINT_MEMORY_STATS + // Statistics + max_text_alloc = max_text_alloc > text_alloc ? 
max_text_alloc : text_alloc; + cumul_text_alloc += text_alloc; +#endif +} + +void codegen_end() { +#ifdef ONE_PASS_GENERATOR_NO_EARLY_OUTPUT + print_glo_decls(); +#endif + + // Output main runtime + produce_runtime(); + putstr("BEGIN {\n"); + putstr(" if (ENVIRON[\"LC_ALL\"] != \"C\") {\n"); + putstr(" printf(\"Script must be executed with LC_ALL=C\\n\")\n"); + putstr(" exit 1\n"); + putstr(" }\n"); + putstr(" __ALLOC=1 # Allocation pointer\n"); + if (runtime_use_open || runtime_use_close || runtime_use_write || runtime_use_fgetc || runtime_use_read || runtime_use_fopen || runtime_use_fclose) { + putstr(" __next_fd=3 # Next available file descriptor\n"); + putstr(" __rt_file[0]=\"/dev/stdin\"; __rt_file[1]=\"/dev/stdout\"; __rt_file[2]=\"/dev/stderr\"\n"); + } + putstr(" for (i = 0; i < 256; i++) ord[sprintf(\"%c\", i)] = i # Initialize characters table\n"); + if (init_block_id > 0) { + putstr(" setup_"); putint(init_block_id); putstr("()\n"); + } + if (runtime_use_make_argv) { + putstr(" __argc = ARGC; __argv = make_argv(); ARGC = 1\n"); + putstr(" _main(__argc, __argv)\n"); + } else { + putstr(" _main()\n"); + } + putstr(" exit 0\n"); + putstr("}\n"); +} diff --git a/benchmark-bootstrap-with-options.sh b/benchmark-bootstrap-with-options.sh index ae2c9b0e..61c5607b 100644 --- a/benchmark-bootstrap-with-options.sh +++ b/benchmark-bootstrap-with-options.sh @@ -15,6 +15,6 @@ with_options() { with_options with_options "-DSH_SAVE_VARS_WITH_SET" -with_options "-DSH_INCLUDE_C_CODE" +with_options "-DANNOTATE_WITH_C_CODE" with_options "-DSH_INLINE_CHAR_LITERAL" with_options "-DSH_OPTIMIZE_LONG_LINES" diff --git a/env.c b/env.c index 94ae12f9..b655dbee 100644 --- a/env.c +++ b/env.c @@ -83,7 +83,7 @@ int cgc_add_local(const enum BINDING binding_type, const int ident, const ast ty return binding; } -#ifdef target_sh +#if defined(target_sh) || defined(target_awk) void cgc_add_local_var(const enum BINDING binding_type, const int ident, const ast type) { cgc_fs += 1; cgc_locals 
= cgc_add_local(binding_type, ident, type, cgc_locals); diff --git a/exe.c b/exe.c index ffed374c..b2b7a13f 100644 --- a/exe.c +++ b/exe.c @@ -609,7 +609,7 @@ void def_label(int lbl) { int addr = heap[lbl + 1]; int label_addr = code_alloc; - int next; + int next_addr; #ifdef SAFE_MODE if (heap[lbl] != GENERIC_LABEL) fatal_error("def_label expects generic label"); @@ -634,10 +634,10 @@ void def_label(int lbl) { } else { heap[lbl + 1] = - (code_address_base + code_alloc); // define label's address while (addr != 0) { - next = code[addr - 1]; // get pointer to next patch address before we overwrite it + next_addr = code[addr - 1]; // get pointer to next patch address before we overwrite it code_alloc = addr - 4; // place code pointer to where use_label was called emit_i32_le(label_addr - addr); // replace placeholder with relative address - addr = next; + addr = next_addr; } code_alloc = label_addr; } @@ -682,7 +682,7 @@ void def_goto_label(int lbl) { int addr = heap[lbl + 1]; int label_addr = code_alloc; - int next; + int next_addr; int goto_fs; int start_code_alloc; @@ -696,7 +696,7 @@ void def_goto_label(int lbl) { heap[lbl + 1] = -label_addr; // define label's address heap[lbl + 2] = cgc_fs; // define label's frame size while (addr != 0) { - next = code[addr-1]; // get pointer to next patch address + next_addr = code[addr-1]; // get pointer to next patch address goto_fs = code[addr-2]; // get frame size at goto instruction code_alloc = code[addr-3]; // reset code pointer to start of jump_to_goto_label instruction grow_stack(cgc_fs - goto_fs); // adjust stack @@ -705,7 +705,7 @@ void def_goto_label(int lbl) { addr = label_addr - code_alloc; // compute relative address code_alloc = start_code_alloc; jump_rel(addr); - addr = next; + addr = next_addr; } code_alloc = label_addr; } diff --git a/glo_decls.c b/glo_decls.c new file mode 100644 index 00000000..c8d57555 --- /dev/null +++ b/glo_decls.c @@ -0,0 +1,98 @@ +#define GLO_DECL_SIZE 100000 +#define 
GLO_DECL_ENTRY_SIZE 3 +text glo_decls[GLO_DECL_SIZE]; // Generated code +int glo_decl_ix = 0; // Index of last generated line of code +int nest_level = 0; // Current level of indentation + +void append_glo_decl(text decl) { + if (glo_decl_ix + GLO_DECL_ENTRY_SIZE >= GLO_DECL_SIZE) fatal_error("glo_decls overflow"); + glo_decls[glo_decl_ix] = nest_level; + glo_decls[glo_decl_ix + 1] = 1; // If it's active or not. Used by undo_glo_decls and replay_glo_decls + glo_decls[glo_decl_ix + 2] = decl; + glo_decl_ix += GLO_DECL_ENTRY_SIZE; +} + +// Fixups are represented as negative nest levels (-1, -2, ...). The actual +// nest level is obtained by negating the value and subtracting 1. +int append_glo_decl_fixup() { + if (glo_decl_ix + GLO_DECL_ENTRY_SIZE >= GLO_DECL_SIZE) fatal_error("glo_decls overflow"); + glo_decls[glo_decl_ix] = - (nest_level + 1); + glo_decls[glo_decl_ix + 1] = 1; // If it's active or not. Used by undo_glo_decls and replay_glo_decls + glo_decls[glo_decl_ix + 2] = 0; + glo_decl_ix += GLO_DECL_ENTRY_SIZE; + return glo_decl_ix - GLO_DECL_ENTRY_SIZE; +} + +void fixup_glo_decl(int fixup_ix, text decl) { + if (glo_decls[fixup_ix] >= 0) fatal_error("fixup_glo_decl: invalid fixup"); + + glo_decls[fixup_ix] = -glo_decls[fixup_ix] - 1; // Make nest level positive + glo_decls[fixup_ix + 2] = decl; +} + +// Remove the n last declarations by decrementing the active field. +// A non-positive active value means that the declaration is active, +// A 0 value means that the declaration was unset once. +// A negative value means that the declaration was unset multiple times. +// Because undone declarations are generally replayed, declarations with negative +// values are ignored when replayed since they have already been replayed before. +// This is useful to compile some code at a different time than it is used. 
+void undo_glo_decls(int start) { + while (start < glo_decl_ix) { + glo_decls[start + 1] -= 1; // To support nested undone declarations + start += GLO_DECL_ENTRY_SIZE; + } +} + +// Check if there are any active and non-empty declarations since the start index. +// This is used to determine if a ':' statement must be added to the current block. +bool any_active_glo_decls(int start) { + while (start < glo_decl_ix) { + if (glo_decls[start + 1] && glo_decls[start + 2] != 0) return true; + start += GLO_DECL_ENTRY_SIZE; + } + return false; +} + +// Replay the declarations betwee start and end. Replayed declarations must first +// be undone with undo_glo_decls. +void replay_glo_decls(int start, int end) { + while (start < end) { + if (glo_decls[start + 1] == 0) { // Skip inactive declarations that are at the current level + append_glo_decl(glo_decls[start + 2]); + } + start += GLO_DECL_ENTRY_SIZE; + } +} + +text replay_glo_decls_inline(int start, int end) { + text res = 0; + while (start < end) { + if (glo_decls[start + 1] == 0) { // Skip inactive declarations + res = concatenate_strings_with(res, glo_decls[start + 2], wrap_str_lit("; ")); + } + start += GLO_DECL_ENTRY_SIZE; + } + if (res != 0) { res = string_concat(res, wrap_str_lit("; ")); } + + return res; +} + +void print_glo_decls() { + int i = 0; + int level; + while (i < glo_decl_ix) { + if (glo_decls[i + 1] == 1) { // Skip inactive declarations + if (glo_decls[i + 2] != 0) { + level = glo_decls[i]; + while (level > 0) { + putchar(' '); putchar(' '); + level -= 1; + } + print_text(glo_decls[i + 2]); + putchar('\n'); + } + } + i += GLO_DECL_ENTRY_SIZE; + } +} diff --git a/pnut-lib.c b/pnut-lib.c index 03cb3803..ac2b8905 100644 --- a/pnut-lib.c +++ b/pnut-lib.c @@ -7,7 +7,7 @@ void compile(bool annotate, char* file) { int i; ast decl; -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE code_annotations_quiet_mode = !annotate; #endif @@ -24,7 +24,7 @@ void compile(bool annotate, char* file) { while (tok != EOF) { 
decl = parse_declaration(false); -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE if (!code_annotations_quiet_mode) { output_declaration_c_code(); } diff --git a/pnut.c b/pnut.c index b2827dd1..47da7cd5 100644 --- a/pnut.c +++ b/pnut.c @@ -8,11 +8,11 @@ // =========================== configuration options =========================== // -// Pnut has many compilation options to change the features supported by pnut-sh -// and pnut-exe, and the code generation. +// Pnut has many compilation options to change the features supported by +// pnut-sh, pnut-awk and pnut-exe, and the code generation. // The reason for so many options is that pnut is really 2 different compilers -// (pnut-sh and pnut-exe) sharing the same compiler frontend, with each compiler -// having to accomplish 2 opposite goals: +// (pnut-sh/pnut-awk and pnut-exe) sharing the same compiler frontend, with each +// compiler having to accomplish 2 opposite goals: // // 1. Be as small and simple as possible to bootstrap a C-like language from a // POSIX shell implementation. Importantly, this applies to both the @@ -33,17 +33,17 @@ // However, the preprocessor directives can make the code slightly harder to // read and maintain. To make reviewing the code easier, we recommend reviewing // the annotated pnut-sh.sh and pnut-exe.sh script generated by pnut-sh (with -// the SH_INCLUDE_C_CODE option enabled). +// the ANNOTATE_WITH_C_CODE option enabled). // // Since most options are orthogonal, the number of possible combinations is // very large. To avoid combinatorial explosion of testing and maintenance, // we define a few "profiles" that cover the common use cases of pnut: // // - PNUT_BOOTSTRAP: -// Profile used to generate pnut-sh.sh and to bootstrap pnut-exe from a -// POSIX shell. This profile minimizes the code size and complexity of the -// compilers as much as possible, supporting only the features strictly -// needed to bootstrap pnut. 
+// Profile used to generate pnut-sh.sh/pnut-awk.awk and to bootstrap pnut-exe +// from a POSIX shell/AWK. This profile minimizes the code size and +// complexity of the compilers as much as possible, supporting only the +// features strictly needed to bootstrap pnut. // // - (Default): // General purpose profile used to build pnut-exe for real world usage. This @@ -70,7 +70,7 @@ #ifdef PNUT_BOOTSTRAP #define ALLOW_RECURSIVE_MACROS - #define SH_MINIMAL_RUNTIME + #define MINIMAL_RUNTIME #define SH_INCLUDE_ALL_ALPHANUM_CHARACTERS // Remove support for complex printf specifiers (flags, width, precision). // This results in smaller code for the compiler. @@ -138,7 +138,7 @@ // Disabled options: // Include the C code as comment along with the generated shell code - // #define SH_INCLUDE_C_CODE + // #define ANNOTATE_WITH_C_CODE // Replace character literal with character code instead of character variables. // Results in smaller and faster code, but with magic numbers in the output. @@ -178,6 +178,34 @@ #undef RT_USE_LOOKUP_TABLE #endif +#elif defined(target_awk) + + #ifdef PNUT_BOOTSTRAP + #define ALLOW_RECURSIVE_MACROS + #define MINIMAL_RUNTIME + #else + // Enable all C features for general pnut usage + #define SUPPORT_ALL_C_FEATURES + // Pnut-awk specific features + #define AWK_SUPPORT_ADDRESS_OF + #endif + + // Shell code generation options + #ifndef AWK_INLINE_PRINTF_NOT + // Inline printf calls with literal string for smaller and faster code + #define AWK_INLINE_PRINTF + #endif + #ifndef AWK_INLINE_PUTCHAR_NOT + // Inline putchar calls for smaller and faster code + #define AWK_INLINE_PUTCHAR + #endif + // Inline exit calls for smaller code + #define AWK_INLINE_EXIT + + // Disabled options: + // Include the C code as comment along with the generated shell code + // #define ANNOTATE_WITH_C_CODE + #elif defined(target_i386_linux) || defined (target_x86_64_linux) || defined (target_x86_64_mac) // Parse numeric literals with their suffix (U, L, UL, etc). 
@@ -228,7 +256,7 @@ // #define USE_STACK_FOR_GLOBALS // Pnut-exe doesn't support generating annotated code. - #undef SH_INCLUDE_C_CODE + #undef ANNOTATE_WITH_C_CODE #else // Frontend-only variants of pnut (e.g. for running reader, tokenizer or parser) @@ -401,7 +429,7 @@ void putoct_unsigned(int n) { #ifdef NICE_ERR_MSG -#if defined(SH_MINIMAL_RUNTIME) || defined(NO_COLOR) +#if defined(MINIMAL_RUNTIME) || defined(NO_COLOR) // No isatty support in minimal runtime #define change_color(color) @@ -907,7 +935,7 @@ ast list3(const int child0, const int child1, const int child2) { return new_ast #endif #define tail(x) cdr_(LIST, x) -#ifdef target_sh +#if defined(target_sh) || defined(target_awk) // Returns the only element of a singleton list, if it is a singleton list. // Otherwise, returns 0. ast list_singleton(const ast list) { @@ -1110,7 +1138,7 @@ void dump_ident(int symbol) { void dump_node(ast node) { putstr("op="); putint(get_op(node)); - putstr(" with #children ="); + putstr(" with #children="); putint(get_nb_children(node)); putchar('\n'); } @@ -1134,7 +1162,7 @@ int if_macro_stack[IFDEF_DEPTH_MAX]; // Stack of if macro states int if_macro_stack_ix = 0; bool if_macro_mask = true; // Indicates if the current if/elif block is being executed bool if_macro_executed = false; // If any of the previous if/elif conditions were true -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE bool if_macro_keep_directive_block_code = false; // Whether to keep the code of the current directive #define IFDEF_STACK_ENTRY_SIZE 3 #else @@ -1171,7 +1199,7 @@ void push_if_macro_mask(bool new_mask) { // Save current mask on the stack because it's about to be overwritten if_macro_stack[if_macro_stack_ix] = if_macro_mask; if_macro_stack[if_macro_stack_ix + 1] = if_macro_executed; -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE // Once if_macro_keep_directive_block_code is set, it is kept for all nested blocks. 
if_macro_stack[if_macro_stack_ix + 2] = if_macro_keep_directive_block_code; #endif @@ -1191,13 +1219,13 @@ void pop_if_macro_mask() { if_macro_stack_ix -= IFDEF_STACK_ENTRY_SIZE; if_macro_mask = if_macro_stack[if_macro_stack_ix]; if_macro_executed = if_macro_stack[if_macro_stack_ix + 1]; -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE if_macro_keep_directive_block_code = if_macro_stack[if_macro_stack_ix + 2]; #endif } // Includes the preprocessed C code along with the generated shell code -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE #define C_CODE_BUF_LEN 200000 char code_char_buf[C_CODE_BUF_LEN]; @@ -1338,7 +1366,7 @@ void output_defined_cli_macros() { } } -#endif // SH_INCLUDE_C_CODE +#endif // ANNOTATE_WITH_C_CODE #ifdef SUPPORT_LINE_CONTINUATION // get_ch_ is reponsible for reading the next character from the input file, @@ -1395,7 +1423,7 @@ void get_ch() { column_number += 1; } #endif -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE // Save C code chars so they can be displayed with the shell code code_char_buf[code_char_buf_ix] = ch; code_char_buf_ix += 1; @@ -1696,7 +1724,7 @@ int NOT_SUPPORTED_ID; // We want to recognize certain identifers without having to do expensive string comparisons int MAIN_ID; -#ifdef target_sh +#if defined(target_sh) || defined(target_awk) int PUTCHAR_ID; int GETCHAR_ID; int EXIT_ID; @@ -1712,7 +1740,7 @@ int READ_ID; int WRITE_ID; int OPEN_ID; int CLOSE_ID; -#if !defined(SH_MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) +#if !defined(MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) int ISATTY_ID; #endif @@ -1743,9 +1771,11 @@ int PWD_ID; int FILE__ID; int LINE__ID; #endif -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE int PNUT_CC_ID; -int PNUT_SH_ID; +#if defined(target_sh) || defined(target_awk) +int PNUT_TARGET_ID; +#endif #endif void get_tok(); @@ -2035,7 +2065,7 @@ bool handle_include() { void handle_preprocessor_directive() { int temp; while (1) { -#ifdef SH_INCLUDE_C_CODE +#ifdef 
ANNOTATE_WITH_C_CODE int hash_code_buf_ix = code_char_buf_ix; int dir_tok, dir_val; // Forces the inclusion of the directive code in the C code buffer. @@ -2047,7 +2077,7 @@ void handle_preprocessor_directive() { get_tok_macro(); // Get the # token get_tok_macro(); // Get the directive -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE dir_tok = tok; dir_val = val; #endif @@ -2056,12 +2086,12 @@ void handle_preprocessor_directive() { temp = val; get_tok_macro(); // Get the macro name push_if_macro_mask(TERNARY(temp == IFDEF_ID, tok == MACRO, tok != MACRO)); -#ifdef SH_INCLUDE_C_CODE - // In SH_INCLUDE_C_CODE mode, we want to hide the conditional preprocessor +#ifdef ANNOTATE_WITH_C_CODE + // In ANNOTATE_WITH_C_CODE mode, we want to hide the conditional preprocessor // directives from the C code buffer, except for those using PNUT_SH // so that when we extract the C code from pnut-exe.sh and bootstrap // pnut-exe from it, the C code contains the necessary directives. - if_macro_keep_directive_block_code |= (val == PNUT_SH_ID || val == PNUT_CC_ID); + if_macro_keep_directive_block_code |= (val == PNUT_TARGET_ID || val == PNUT_CC_ID); #endif get_tok_macro(); // Skip the macro name } else if (tok == IF_KW) { @@ -2089,7 +2119,7 @@ void handle_preprocessor_directive() { } else if (if_macro_mask) { if (tok == IDENTIFIER && val == INCLUDE_ID) { get_tok_macro(); // Get the STRING token -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE keep_directive_code = #endif handle_include(); @@ -2143,7 +2173,7 @@ void handle_preprocessor_directive() { syntax_error("preprocessor expected end of line"); } -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE if (!if_macro_keep_directive_block_code && !keep_directive_code && (( dir_tok == IDENTIFIER && (dir_val == IFDEF_ID || dir_val == IFNDEF_ID || dir_val == ELIF_ID || dir_val == ENDIF_ID || dir_val == INCLUDE_ID)) || dir_tok == IF_KW || dir_tok == ELSE_KW)) { @@ -2172,7 +2202,7 @@ void handle_preprocessor_directive() { if 
(!if_macro_mask) { while (!skip_inactive_line()); if (ch == EOF) return; -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE // Remove the inactive line from the code buffer // The code buffer now contains a bunch of lines that were skipped, and // the start of the next preprocessor directive (up to the '#'). Remove @@ -2298,7 +2328,7 @@ void init_ident_table() { MAIN_ID = init_ident(IDENTIFIER, "main"); -#ifdef target_sh +#if defined(target_sh) || defined(target_awk) PUTCHAR_ID = init_ident(IDENTIFIER, "putchar"); GETCHAR_ID = init_ident(IDENTIFIER, "getchar"); EXIT_ID = init_ident(IDENTIFIER, "exit"); @@ -2314,7 +2344,7 @@ void init_ident_table() { WRITE_ID = init_ident(IDENTIFIER, "write"); OPEN_ID = init_ident(IDENTIFIER, "open"); CLOSE_ID = init_ident(IDENTIFIER, "close"); -#if !defined(SH_MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) +#if !defined(MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) ISATTY_ID = init_ident(IDENTIFIER, "isatty"); #endif @@ -2377,7 +2407,7 @@ int set_builtin_empty_macro(int macro_id) { #endif void init_pnut_macros() { -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE PNUT_CC_ID = #endif init_builtin_int_macro("PNUT_CC", 1); @@ -2390,11 +2420,13 @@ void init_pnut_macros() { LINE__ID = init_builtin_int_macro("__LINE__", 0); #endif -#if defined(target_sh) -#ifdef SH_INCLUDE_C_CODE - PNUT_SH_ID = +#ifdef ANNOTATE_WITH_C_CODE + PNUT_TARGET_ID = #endif +#if defined(target_sh) init_builtin_int_macro("PNUT_SH", 1); +#elif defined(target_awk) + init_builtin_int_macro("PNUT_AWK", 1); #elif defined(target_i386_linux) init_builtin_int_macro("PNUT_EXE", 1); init_builtin_int_macro("PNUT_EXE_32", 1); @@ -2724,7 +2756,7 @@ void paste_tokens(int left_tok, int left_val) { void get_tok() { -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE // Compute the index of the previous token. 
Because get_tok can be called // recursively by handle_preprocessor_directive, it must be stored in a local // variable so that the first get_tok call in the recursion chain can restore @@ -3169,7 +3201,7 @@ void get_tok() { } } -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE last_tok_code_buf_ix = prev_last_tok_char_buf_ix - 1; #endif @@ -4650,6 +4682,10 @@ ast parse_compound_statement() { #include "sh.c" #endif +#ifdef target_awk +#include "awk.c" +#endif + #ifdef target_i386_linux #include "x86.c" #endif @@ -4715,7 +4751,7 @@ void handle_macro_D(char *opt) { set_builtin_int_macro(macro_symbol, 1); } -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE // Add to the list of macros to define/undefine in the generated shell script cli_macros = cons(macro_symbol, cli_macros); #endif @@ -4723,7 +4759,7 @@ void handle_macro_D(char *opt) { void handle_macro_U(char *opt) { init_ident(IDENTIFIER, opt); -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE // Add to the list of macros to define/undefine in the generated shell script cli_macros = cons(intern_str(opt), cli_macros); #endif @@ -4749,7 +4785,7 @@ void output_rest_of_line(FILE *fp) { if (c == '\n') putchar('\n'); } -void extract_c_code_from_sh_file(char *filename) { +void extract_c_code_from_annotated_file(char *filename) { int c; FILE *sh_fp = fopen(filename, "r"); @@ -4856,7 +4892,7 @@ int main(int argc, char **argv) { case 'D': // pnut-sh only needs -D and no other options init_builtin_int_macro(argv[i] + 2, 1); // +2 to skip -D -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE // Also add to the list of macros to define in the generated shell script cli_macros = cons(intern_str(argv[i] + 2), cli_macros); #endif @@ -4869,13 +4905,13 @@ int main(int argc, char **argv) { if (argv[i][2] == 0) { // rest of option is in argv[i + 1] if (argv[i + 1] == 0) fatal_error("missing input file name for -C option"); i += 1; - extract_c_code_from_sh_file(argv[i]); + 
extract_c_code_from_annotated_file(argv[i]); } else { - extract_c_code_from_sh_file(argv[i] + 2); + extract_c_code_from_annotated_file(argv[i] + 2); } return 0; // Done after extracting C code, no further processing needed #endif -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE case 'q': // disable code annotations code_annotations_quiet_mode = true; break; @@ -4939,14 +4975,14 @@ int main(int argc, char **argv) { } #else codegen_begin(); -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE // Add #define and #undef directives for macros defined on the command line output_defined_cli_macros(); #endif get_tok(); while (tok != EOF) { decl = parse_declaration(false); -#ifdef SH_INCLUDE_C_CODE +#ifdef ANNOTATE_WITH_C_CODE if (!code_annotations_quiet_mode) { output_declaration_c_code(); } diff --git a/run-tests.sh b/run-tests.sh index 6cf45322..39cab476 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -64,6 +64,13 @@ case "$backend" in test_folders="tests/_all tests/_sh tests/_bug" [ "$fast" -eq 1 ] && pnut_target_flag="$pnut_target_flag -DSH_SAVE_VARS_WITH_SET" ;; + awk) + ext="awk" + pnut_target_flag="-Dtarget_awk" + executor="awk --posix -f" + # Reusing _sh tests for awk as well since they support the same subset of features. + test_folders="tests/_all tests/_sh tests/_bug" + ;; i386_linux|x86_64_linux|x86_64_mac) ext="exe" pnut_target_flag="-Dtarget_$backend" diff --git a/sh-runtime.c b/sh-runtime.c index ceef4a86..f2ac72a6 100644 --- a/sh-runtime.c +++ b/sh-runtime.c @@ -572,7 +572,7 @@ void runtime_putchar() { } #endif // SH_INLINE_PUTCHAR -#ifndef SH_MINIMAL_RUNTIME +#ifndef MINIMAL_RUNTIME bool runtime_use_getchar = DEFAULT_USE; bool runtime_getchar_defined = false; @@ -628,7 +628,7 @@ void runtime_getchar() { putstr("\n"); } -#endif // SH_MINIMAL_RUNTIME +#endif // MINIMAL_RUNTIME // An implementation of puts, used to replace printf("%s", ...) calls. 
bool runtime_use_put_pstr = DEFAULT_USE; @@ -646,7 +646,7 @@ void runtime_put_pstr() { putstr("\n"); } -#ifndef SH_MINIMAL_RUNTIME +#ifndef MINIMAL_RUNTIME // POSIX shell printf documentation: https://web.archive.org/web/20240829022722/https://pubs.opengroup.org/onlinepubs/9699919799/utilities/printf.html // C printf documentation: ISO/IEC 9899:1999 - 7.19.6 Formatted input/output functions (page 273) @@ -783,9 +783,9 @@ void runtime_unpack_string() { putstr("\n"); } -#endif // !SH_MINIMAL_RUNTIME +#endif // !MINIMAL_RUNTIME -#if !defined(SH_MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) +#if !defined(MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) bool runtime_use_isatty = DEFAULT_USE; bool runtime_isatty_defined = false; @@ -796,7 +796,7 @@ void runtime_isatty() { putstr("}\n\n"); } -#endif // !SH_MINIMAL_RUNTIME || SUPPORT_STDIN_INPUT +#endif // !MINIMAL_RUNTIME || SUPPORT_STDIN_INPUT bool runtime_use_open = DEFAULT_USE; bool runtime_open_defined = false; @@ -1042,12 +1042,12 @@ void produce_runtime() { if (runtime_use_exit) runtime_exit(); #endif -#ifndef SH_MINIMAL_RUNTIME +#ifndef MINIMAL_RUNTIME if (runtime_use_getchar) runtime_getchar(); if (runtime_use_printf) runtime_printf(); if (runtime_use_unpack_string) runtime_unpack_string(); #endif -#if !defined(SH_MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) +#if !defined(MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) if (runtime_use_isatty) runtime_isatty(); #endif } diff --git a/sh.c b/sh.c index 9c4ce318..705d262b 100644 --- a/sh.c +++ b/sh.c @@ -29,22 +29,8 @@ int cumul_text_alloc = 0; // codegen -#define text int -#define TEXT_POOL_SIZE 1000000 -intptr_t text_pool[TEXT_POOL_SIZE]; -int text_alloc = 1; // Start at 1 because 0 is the empty text - -// Text pool nodes -enum TEXT_NODES { - TEXT_TREE, // Concatenation of texts - TEXT_INTEGER, // Integer to be printed in decimal -#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE - TEXT_INTEGER_HEX, // Integer to be printed in hexadecimal - TEXT_INTEGER_OCT, // Integer 
to be printed in octal -#endif - TEXT_STRING, // Pointer to immutable string - TEXT_ESCAPED // Escaped string, used for printf -}; +// Rope-like text representation +#include "text.c" // Place prototype of mutually recursive functions here @@ -71,284 +57,7 @@ void mark_mutable_variables_body(ast node); void handle_enum_struct_union_type_decl(ast node); ast handle_side_effects_go(ast node, bool executes_conditionally); -// Because concatenating strings is very expensive and a common operation, we -// use a tree structure to represent the concatenated strings. That way, the -// concatenation can be done in O(1). -// At the end of the codegen process, the tree will be flattened into a single -// string. - -// A few macros to help us change the representation of text objects -#define TEXT_FROM_INT(i) i -#define TEXT_FROM_CHAR(i) i -#define TEXT_FROM_PTR(p) ((intptr_t) (p)) -#define TEXT_TO_INT(p) ((int) (p)) -#define TEXT_TO_CHAR(p) ((char) (p)) - -#define wrap_char(c) (-c) - -text wrap_int(const int i) { - if (text_alloc + 2 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_INTEGER); - text_pool[text_alloc + 1] = TEXT_FROM_INT(i); - return (text_alloc += 2) - 2; -} - -#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE - -text wrap_int_hex(const int i) { - if (text_alloc + 2 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_INTEGER_HEX); - text_pool[text_alloc + 1] = TEXT_FROM_INT(i); - return (text_alloc += 2) - 2; -} - -text wrap_int_oct(const int i) { - if (text_alloc + 2 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_INTEGER_OCT); - text_pool[text_alloc + 1] = TEXT_FROM_INT(i); - return (text_alloc += 2) - 2; -} - -text wrap_integer(const int multiply, const int obj) { - switch (get_op(obj)) { - case INTEGER: - return wrap_int(multiply * -get_val_(INTEGER, obj)); - case INTEGER_HEX: - return 
wrap_int_hex(multiply * -get_val_(INTEGER_HEX, obj)); - case INTEGER_OCT: - return wrap_int_oct(multiply * -get_val_(INTEGER_OCT, obj)); - default: - fatal_error("wrap_integer: unknown integer type"); - return 0; - } -} -#else -#define wrap_integer(multiply, obj) wrap_int(multiply * -get_val_(INTEGER, obj)) -#endif - -text escape_text(const text t, const bool for_printf) { - if (text_alloc + 3 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_ESCAPED); - text_pool[text_alloc + 1] = TEXT_FROM_INT(t); - text_pool[text_alloc + 2] = TEXT_FROM_INT(for_printf); - return (text_alloc += 3) - 3; -} - -text string_concat(const text t1, const text t2) { - if (text_alloc + 4 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_TREE); - text_pool[text_alloc + 1] = TEXT_FROM_INT(2); - text_pool[text_alloc + 2] = TEXT_FROM_INT(t1); - text_pool[text_alloc + 3] = TEXT_FROM_INT(t2); - return (text_alloc += 4) - 4; -} - -text string_concat3(const text t1, const text t2, const text t3) { - if (text_alloc + 5 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_TREE); - text_pool[text_alloc + 1] = TEXT_FROM_INT(3); - text_pool[text_alloc + 2] = TEXT_FROM_INT(t1); - text_pool[text_alloc + 3] = TEXT_FROM_INT(t2); - text_pool[text_alloc + 4] = TEXT_FROM_INT(t3); - return (text_alloc += 5) - 5; -} - -text string_concat4(const text t1, const text t2, const text t3, const text t4) { - if (text_alloc + 6 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_TREE); - text_pool[text_alloc + 1] = TEXT_FROM_INT(4); - text_pool[text_alloc + 2] = TEXT_FROM_INT(t1); - text_pool[text_alloc + 3] = TEXT_FROM_INT(t2); - text_pool[text_alloc + 4] = TEXT_FROM_INT(t3); - text_pool[text_alloc + 5] = TEXT_FROM_INT(t4); - return (text_alloc += 6) - 6; -} - -text string_concat5(const text t1, const 
text t2, const text t3, const text t4, const text t5) { - if (text_alloc + 7 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_TREE); - text_pool[text_alloc + 1] = TEXT_FROM_INT(5); - text_pool[text_alloc + 2] = TEXT_FROM_INT(t1); - text_pool[text_alloc + 3] = TEXT_FROM_INT(t2); - text_pool[text_alloc + 4] = TEXT_FROM_INT(t3); - text_pool[text_alloc + 5] = TEXT_FROM_INT(t4); - text_pool[text_alloc + 6] = TEXT_FROM_INT(t5); - return (text_alloc += 7) - 7; -} - -// Dead code but keeping it around in case we need to wrap mutable strings -// text wrap_str(char * const s) { -// int i = 0; -// int result = text_alloc; -// -// text_pool[result] = TEXT_FROM_INT(TEXT_TREE); -// text_alloc += 2; -// while (s[i] != 0) { -// text_pool[text_alloc] = wrap_char(s[i]); -// text_alloc += 1; -// i += 1; -// } -// -// text_pool[result + 1] = TEXT_FROM_INT(i); -// -// return result; -// } - -// Like wrap_str, but assumes that the string is immutable and doesn't need to be copied -text wrap_str_imm(char * const s, char * const end) { - if (text_alloc + 3 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); - text_pool[text_alloc] = TEXT_FROM_INT(TEXT_STRING); - text_pool[text_alloc + 1] = TEXT_FROM_PTR(s); - text_pool[text_alloc + 2] = TEXT_FROM_PTR(end); // end of string address. 
0 for null-terminated strings - return (text_alloc += 3) - 3; -} - -text wrap_str_lit(char * const s) { - return wrap_str_imm(s, 0); -} - -text wrap_str_pool(const int ident_symbol) { - return wrap_str_imm(symbol_buf(ident_symbol), 0); -} - -text concatenate_strings_with(const text t1, const text t2, const text sep) { - if (t1 == 0) return t2; - if (t2 == 0) return t1; - return string_concat3(t1, sep, t2); -} - -void print_escaped_char(char c, int for_printf) { - // C escape sequences - if (c == '\0') { putchar('\\'); putchar('0'); } - else if (c == '\a') { putchar('\\'); putchar('a'); } - else if (c == '\b') { putchar('\\'); putchar('b'); } - else if (c == '\f') { putchar('\\'); putchar('f'); } - else if (c == '\n') { putchar('\\'); putchar('n'); } - else if (c == '\r') { putchar('\\'); putchar('r'); } - else if (c == '\t') { putchar('\\'); putchar('t'); } - else if (c == '\v') { putchar('\\'); putchar('v'); } - // backslashes are escaped twice, first by the shell and then by def_str - else if (c == '\\') { putchar('\\'); putchar('\\'); putchar('\\'); putchar('\\'); } - // Shell special characters: $, `, ", ', ?, and newline - // Note that ' and ? are not escaped properly by dash, but that's ok because - // we use double quotes and ' and ? can be left as is. 
- else if (c == '$') { putchar('\\'); putchar('$'); } - else if (c == '`') { putchar('\\'); putchar('`'); } - else if (c == '"') { putchar('\\'); putchar('"'); } - // else if (c == '\'') { putchar('\\'); putchar('\''); } - // else if (c == '?') { putchar('\\'); putchar('?'); } - // when we're escaping a string for shell's printf, % must be escaped - else if (c == '%' && for_printf) { putchar('%'); putchar('%'); } - else putchar(c); -} - -void print_escaped_string(char *string_start, char *string_end, int for_printf) { - if (string_end) { - while (string_start < string_end) { - print_escaped_char(*string_start, for_printf); - string_start += 1; - } - } else { - while (*string_start != 0) { - print_escaped_char(*string_start, for_printf); - string_start += 1; - } - } -} - -void print_escaped_text(text t, bool for_printf) { - int i; - - if (t == 0) return; - - if (t < 0) { // it's a character - print_escaped_char(-t, for_printf); - } else if (text_pool[t] == TEXT_FROM_INT(TEXT_TREE)) { - i = 0; - while (TEXT_FROM_INT(i) < text_pool[t + 1]) { - if (text_pool[t + i + 2] < 0) { - print_escaped_char(-TEXT_TO_CHAR(text_pool[t + i + 2]), for_printf); - } else { - print_escaped_text(TEXT_TO_INT(text_pool[t + i + 2]), for_printf); - } - i += 1; - } - } else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER)) { - putint(TEXT_TO_INT(text_pool[t + 1])); - } -#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE - else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER_HEX)) { - putchar('0'); putchar('x'); - puthex_unsigned(TEXT_TO_INT(text_pool[t + 1])); - } else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER_OCT)) { - putchar('0'); // Note: This is not supported by zsh by default - putoct_unsigned(TEXT_TO_INT(text_pool[t + 1])); - } -#endif - else if (text_pool[t] == TEXT_FROM_INT(TEXT_STRING)) { - print_escaped_string((char*) text_pool[t + 1], (char*) text_pool[t + 2], for_printf); - } else if (text_pool[t] == TEXT_FROM_INT(TEXT_ESCAPED)) { - fatal_error("Cannot escape a string that is already 
escaped"); - } else { - fatal_error("print_escaped_text: unexpected string tree node"); - } -} - -void print_text(text t) { - int i; - char *s; - - if (t == 0) return; - - if (t < 0) { // it's a character - putchar(-t); - } else if (text_pool[t] == TEXT_FROM_INT(TEXT_TREE)) { - i = 0; - while (TEXT_FROM_INT(i) < text_pool[t + 1]) { - if (text_pool[t + i + 2] < 0) { - putchar(-TEXT_TO_CHAR(text_pool[t + i + 2])); - } else { - print_text(TEXT_TO_INT(text_pool[t + i + 2])); - } - i += 1; - } - } else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER)) { - putint(TEXT_TO_INT(text_pool[t + 1])); - } -#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE - else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER_HEX)) { - putchar('0'); putchar('x'); - puthex_unsigned(TEXT_TO_INT(text_pool[t + 1])); - } else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER_OCT)) { - putchar('0'); // Note: This is not supported by zsh by default - putoct_unsigned(TEXT_TO_INT(text_pool[t + 1])); - } -#endif - else if (text_pool[t] == TEXT_FROM_INT(TEXT_STRING)) { - if (TEXT_TO_INT(text_pool[t + 2]) == 0) { // null-terminated string - putstr((char*) text_pool[t + 1]); - } else { // string ends at the address in text_pool[t + 2] - s = (char*) text_pool[t + 1]; // start - while (s < (char*) text_pool[t + 2] || *s != 0) { - putchar(*s); - s += 1; - } - } - } else if (text_pool[t] == TEXT_FROM_INT(TEXT_ESCAPED)) { - print_escaped_text(TEXT_TO_INT(text_pool[t + 1]), TEXT_TO_INT(text_pool[t + 2])); - } else { - fatal_error("print_text: unexpected string tree node"); - } -} - // Codegen context - -#define GLO_DECL_SIZE 100000 -#define GLO_DECL_ENTRY_SIZE 3 -text glo_decls[GLO_DECL_SIZE]; // Generated code -int glo_decl_ix = 0; // Index of last generated line of code -int nest_level = 0; // Current level of indentation int in_tail_position = false; // Is the current statement in tail position? 
int gensym_ix = 0; // Counter for fresh_ident int fun_gensym_ix = 0; // Maximum value of gensym_ix for the current function @@ -408,98 +117,8 @@ void init_comp_context() { } } -void append_glo_decl(text decl) { - if (glo_decl_ix + GLO_DECL_ENTRY_SIZE >= GLO_DECL_SIZE) fatal_error("glo_decls overflow"); - glo_decls[glo_decl_ix] = nest_level; - glo_decls[glo_decl_ix + 1] = 1; // If it's active or not. Used by undo_glo_decls and replay_glo_decls - glo_decls[glo_decl_ix + 2] = decl; - glo_decl_ix += GLO_DECL_ENTRY_SIZE; -} - -// Fixups are represented as negative nest levels (-1, -2, ...). The actual -// nest level is obtained by negating the value and subtracting 1. -int append_glo_decl_fixup() { - if (glo_decl_ix + GLO_DECL_ENTRY_SIZE >= GLO_DECL_SIZE) fatal_error("glo_decls overflow"); - glo_decls[glo_decl_ix] = - (nest_level + 1); - glo_decls[glo_decl_ix + 1] = 1; // If it's active or not. Used by undo_glo_decls and replay_glo_decls - glo_decls[glo_decl_ix + 2] = 0; - glo_decl_ix += GLO_DECL_ENTRY_SIZE; - return glo_decl_ix - GLO_DECL_ENTRY_SIZE; -} - -void fixup_glo_decl(int fixup_ix, text decl) { - if (glo_decls[fixup_ix] >= 0) fatal_error("fixup_glo_decl: invalid fixup"); - - glo_decls[fixup_ix] = -glo_decls[fixup_ix] - 1; // Make nest level positive - glo_decls[fixup_ix + 2] = decl; -} - -// Remove the n last declarations by decrementing the active field. -// A non-positive active value means that the declaration is active, -// A 0 value means that the declaration was unset once. -// A negative value means that the declaration was unset multiple times. -// Because undone declarations are generally replayed, declarations with negative -// values are ignored when replayed since they have already been replayed before. -// This is useful to compile some code at a different time than it is used. 
-void undo_glo_decls(int start) { - while (start < glo_decl_ix) { - glo_decls[start + 1] -= 1; // To support nested undone declarations - start += GLO_DECL_ENTRY_SIZE; - } -} - -// Check if there are any active and non-empty declarations since the start index. -// This is used to determine if a ':' statement must be added to the current block. -bool any_active_glo_decls(int start) { - while (start < glo_decl_ix) { - if (glo_decls[start + 1] && glo_decls[start + 2] != 0) return true; - start += GLO_DECL_ENTRY_SIZE; - } - return false; -} - -// Replay the declarations betwee start and end. Replayed declarations must first -// be undone with undo_glo_decls. -void replay_glo_decls(int start, int end) { - while (start < end) { - if (glo_decls[start + 1] == 0) { // Skip inactive declarations that are at the current level - append_glo_decl(glo_decls[start + 2]); - } - start += GLO_DECL_ENTRY_SIZE; - } -} - -text replay_glo_decls_inline(int start, int end) { - text res = 0; - while (start < end) { - if (glo_decls[start + 1] == 0) { // Skip inactive declarations - res = concatenate_strings_with(res, glo_decls[start + 2], wrap_str_lit("; ")); - } - start += GLO_DECL_ENTRY_SIZE; - } - if (res != 0) { res = string_concat(res, wrap_str_lit("; ")); } - - return res; -} - -void print_glo_decls() { - int i = 0; - int level; - while (i < glo_decl_ix) { - if (glo_decls[i + 1] == 1) { // Skip inactive declarations - if (glo_decls[i + 2] != 0) { - level = glo_decls[i]; - while (level > 0) { - putchar(' '); putchar(' '); - level -= 1; - } - print_text(glo_decls[i + 2]); - putchar('\n'); - } - } - i += GLO_DECL_ENTRY_SIZE; - } -} +// Line formatter for global declarations +#include "glo_decls.c" // Environment tracking #include "env.c" @@ -1404,7 +1023,7 @@ text comp_rvalue_go(ast node, int context, ast test_side_effects, int outer_op) return wrap_if_needed(true, context, test_side_effects, string_concat(sub1, wrap_str_lit(" += 1")), outer_op, op); } else if (op == MINUS_MINUS_POST) { 
sub1 = comp_lvalue(child0); - return wrap_if_needed(false, context, test_side_effects,string_concat4(wrap_char('('), sub1, wrap_str_lit(" -= 1)"), wrap_str_lit(" + 1")), outer_op, '+'); + return wrap_if_needed(false, context, test_side_effects, string_concat4(wrap_char('('), sub1, wrap_str_lit(" -= 1)"), wrap_str_lit(" + 1")), outer_op, '+'); } else if (op == PLUS_PLUS_POST) { sub1 = comp_lvalue(child0); return wrap_if_needed(false, context, test_side_effects, string_concat4(wrap_char('('), sub1, wrap_str_lit(" += 1)"), wrap_str_lit(" - 1")), outer_op, '-'); @@ -1853,9 +1472,9 @@ void handle_printf_call(char *format_str, ast params) { // We can't a string to printf directly, it needs to be unpacked first. case 's': if (param == 0) fatal_error("printf: not enough parameters"); - runtime_use_put_pstr = true; // If the format specifier has width or precision, we have to pack the string and call then printf. // Otherwise, we can call _put_pstr directly and avoid the subshell. + runtime_use_put_pstr = true; #ifndef SH_MINIMAL_PRINTF if (has_width || has_precision) { params_text = concatenate_strings_with(params_text, width_text, wrap_char(' ')); // Add width param if needed @@ -1944,11 +1563,11 @@ text comp_fun_call_code(ast node, ast assign_to) { #ifndef SH_INLINE_EXIT else if (name_id == EXIT_ID) { runtime_use_exit = true; } #endif -#ifndef SH_MINIMAL_RUNTIME +#ifndef MINIMAL_RUNTIME else if (name_id == GETCHAR_ID) { runtime_use_getchar = true; } else if (name_id == PRINTF_ID) { runtime_use_printf = true; } #endif -#if !defined(SH_MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) +#if !defined(MINIMAL_RUNTIME) || defined(SUPPORT_STDIN_INPUT) else if (name_id == ISATTY_ID) { runtime_use_isatty = true; } #endif @@ -2668,9 +2287,7 @@ void comp_glo_decl(ast node) { int op = get_op(node); fun_gensym_ix = 0; - if (op == '=') { // Assignments - comp_assignment(get_child_('=', node, 0), get_child_('=', node, 1)); - } else if (op == DECLS) { // Variable declarations + if (op == 
DECLS) { // Variable declarations // AUTO_KW and REGISTER_KW can simply be ignored. STATIC_KW is the default // storage class for global variables since pnut-sh only supports 1 // translation unit. diff --git a/tests/_sh/characters.c b/tests/_sh/characters.c new file mode 100644 index 00000000..0e549024 --- /dev/null +++ b/tests/_sh/characters.c @@ -0,0 +1,19 @@ +// Make sure all character literals are supported, and the ones that need to be +// escaped are escaped properly. + +#include + +int main() { + putchar('a'); + putchar('1'); + putchar(' '); + putchar('\n'); + putchar('\t'); + putchar('\\'); + putchar('\''); + putchar('\"'); + putchar('$'); + putchar('`'); + putchar('?'); + putchar('\0'); +} \ No newline at end of file diff --git a/tests/_sh/characters.golden b/tests/_sh/characters.golden new file mode 100644 index 00000000..c93f5422 Binary files /dev/null and b/tests/_sh/characters.golden differ diff --git a/text.c b/text.c new file mode 100644 index 00000000..61c933bf --- /dev/null +++ b/text.c @@ -0,0 +1,288 @@ +#define text int +#define TEXT_POOL_SIZE 1000000 + +intptr_t text_pool[TEXT_POOL_SIZE]; +int text_alloc = 1; // Start at 1 because 0 is the empty text + +// Text pool nodes +enum TEXT_NODES { + TEXT_TREE, // Concatenation of texts + TEXT_INTEGER, // Integer to be printed in decimal +#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE + TEXT_INTEGER_HEX, // Integer to be printed in hexadecimal + TEXT_INTEGER_OCT, // Integer to be printed in octal +#endif + TEXT_STRING, // Pointer to immutable string + TEXT_ESCAPED // Escaped string, used for printf +}; + +// Because concatenating strings is very expensive and a common operation, we +// use a tree structure to represent the concatenated strings. That way, the +// concatenation can be done in O(1). +// At the end of the codegen process, the tree will be flattened into a single +// string. 
+ +// A few macros to help us change the representation of text objects +#define TEXT_FROM_INT(i) i +#define TEXT_FROM_CHAR(i) i +#define TEXT_FROM_PTR(p) ((intptr_t) (p)) +#define TEXT_TO_INT(p) ((int) (p)) +#define TEXT_TO_CHAR(p) ((char) (p)) + +#define wrap_char(c) (-c) + +text wrap_int(const int i) { + if (text_alloc + 2 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_INTEGER); + text_pool[text_alloc + 1] = TEXT_FROM_INT(i); + return (text_alloc += 2) - 2; +} + +#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE + +text wrap_int_hex(const int i) { + if (text_alloc + 2 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_INTEGER_HEX); + text_pool[text_alloc + 1] = TEXT_FROM_INT(i); + return (text_alloc += 2) - 2; +} + +text wrap_int_oct(const int i) { + if (text_alloc + 2 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_INTEGER_OCT); + text_pool[text_alloc + 1] = TEXT_FROM_INT(i); + return (text_alloc += 2) - 2; +} + +text wrap_integer(const int multiply, const int obj) { + switch (get_op(obj)) { + case INTEGER: + return wrap_int(multiply * -get_val_(INTEGER, obj)); + case INTEGER_HEX: + return wrap_int_hex(multiply * -get_val_(INTEGER_HEX, obj)); + case INTEGER_OCT: + return wrap_int_oct(multiply * -get_val_(INTEGER_OCT, obj)); + default: + fatal_error("wrap_integer: unknown integer type"); + return 0; + } +} +#else +#define wrap_integer(multiply, obj) wrap_int(multiply * -get_val_(INTEGER, obj)) +#endif + +text escape_text(const text t, const bool for_printf) { + if (text_alloc + 3 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_ESCAPED); + text_pool[text_alloc + 1] = TEXT_FROM_INT(t); + text_pool[text_alloc + 2] = TEXT_FROM_INT(for_printf); + return (text_alloc += 3) - 3; +} + +text string_concat(const text t1, const text t2) { + if (text_alloc + 
4 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_TREE); + text_pool[text_alloc + 1] = TEXT_FROM_INT(2); + text_pool[text_alloc + 2] = TEXT_FROM_INT(t1); + text_pool[text_alloc + 3] = TEXT_FROM_INT(t2); + return (text_alloc += 4) - 4; +} + +text string_concat3(const text t1, const text t2, const text t3) { + if (text_alloc + 5 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_TREE); + text_pool[text_alloc + 1] = TEXT_FROM_INT(3); + text_pool[text_alloc + 2] = TEXT_FROM_INT(t1); + text_pool[text_alloc + 3] = TEXT_FROM_INT(t2); + text_pool[text_alloc + 4] = TEXT_FROM_INT(t3); + return (text_alloc += 5) - 5; +} + +text string_concat4(const text t1, const text t2, const text t3, const text t4) { + if (text_alloc + 6 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_TREE); + text_pool[text_alloc + 1] = TEXT_FROM_INT(4); + text_pool[text_alloc + 2] = TEXT_FROM_INT(t1); + text_pool[text_alloc + 3] = TEXT_FROM_INT(t2); + text_pool[text_alloc + 4] = TEXT_FROM_INT(t3); + text_pool[text_alloc + 5] = TEXT_FROM_INT(t4); + return (text_alloc += 6) - 6; +} + +text string_concat5(const text t1, const text t2, const text t3, const text t4, const text t5) { + if (text_alloc + 7 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_TREE); + text_pool[text_alloc + 1] = TEXT_FROM_INT(5); + text_pool[text_alloc + 2] = TEXT_FROM_INT(t1); + text_pool[text_alloc + 3] = TEXT_FROM_INT(t2); + text_pool[text_alloc + 4] = TEXT_FROM_INT(t3); + text_pool[text_alloc + 5] = TEXT_FROM_INT(t4); + text_pool[text_alloc + 6] = TEXT_FROM_INT(t5); + return (text_alloc += 7) - 7; +} + +// Dead code but keeping it around in case we need to wrap mutable strings +// text wrap_str(char * const s) { +// int i = 0; +// int result = text_alloc; +// +// text_pool[result] = 
TEXT_FROM_INT(TEXT_TREE); +// text_alloc += 2; +// while (s[i] != 0) { +// text_pool[text_alloc] = wrap_char(s[i]); +// text_alloc += 1; +// i += 1; +// } +// +// text_pool[result + 1] = TEXT_FROM_INT(i); +// +// return result; +// } + +// Like wrap_str, but assumes that the string is immutable and doesn't need to be copied +text wrap_str_imm(char * const s, char * const end) { + if (text_alloc + 3 >= TEXT_POOL_SIZE) fatal_error("string tree pool overflow"); + text_pool[text_alloc] = TEXT_FROM_INT(TEXT_STRING); + text_pool[text_alloc + 1] = TEXT_FROM_PTR(s); + text_pool[text_alloc + 2] = TEXT_FROM_PTR(end); // end of string address. 0 for null-terminated strings + return (text_alloc += 3) - 3; +} + +text wrap_str_lit(char * const s) { + return wrap_str_imm(s, 0); +} + +text wrap_str_pool(const int ident_symbol) { + return wrap_str_imm(symbol_buf(ident_symbol), symbol_buf_end(ident_symbol)); +} + +text concatenate_strings_with(const text t1, const text t2, const text sep) { + if (t1 == 0) return t2; + if (t2 == 0) return t1; + return string_concat3(t1, sep, t2); +} + +void print_escaped_char(char c, int for_printf) { + // C escape sequences + if (c == '\0') { putchar('\\'); putchar('0'); } + else if (c == '\a') { putchar('\\'); putchar('a'); } + else if (c == '\b') { putchar('\\'); putchar('b'); } + else if (c == '\f') { putchar('\\'); putchar('f'); } + else if (c == '\n') { putchar('\\'); putchar('n'); } + else if (c == '\r') { putchar('\\'); putchar('r'); } + else if (c == '\t') { putchar('\\'); putchar('t'); } + else if (c == '\v') { putchar('\\'); putchar('v'); } + // Shell special characters: $, `, ", ', ?, and newline +#ifdef target_sh + else if (c == '$') { putchar('\\'); putchar('$'); } + else if (c == '`') { putchar('\\'); putchar('`'); } + // backslashes are escaped twice, first by the shell and then by def_str + else if (c == '\\') { putchar('\\'); putchar('\\'); putchar('\\'); putchar('\\'); } +#else + else if (c == '\\') { putchar('\\'); putchar('\\'); 
} +#endif + else if (c == '"') { putchar('\\'); putchar('"'); } + // when we're escaping a string for shell's printf, % must be escaped + else if (c == '%' && for_printf) { putchar('%'); putchar('%'); } + else putchar(c); +} + +void print_escaped_string(char *string_start, char *string_end, int for_printf) { + if (string_end) { + while (string_start < string_end) { + print_escaped_char(*string_start, for_printf); + string_start += 1; + } + } else { + while (*string_start != 0) { + print_escaped_char(*string_start, for_printf); + string_start += 1; + } + } +} + +void print_escaped_text(text t, bool for_printf) { + int i; + + if (t == 0) return; + + if (t < 0) { // it's a character + print_escaped_char(-t, for_printf); + } else if (text_pool[t] == TEXT_FROM_INT(TEXT_TREE)) { + i = 0; + while (TEXT_FROM_INT(i) < text_pool[t + 1]) { + if (text_pool[t + i + 2] < 0) { + print_escaped_char(-TEXT_TO_CHAR(text_pool[t + i + 2]), for_printf); + } else { + print_escaped_text(TEXT_TO_INT(text_pool[t + i + 2]), for_printf); + } + i += 1; + } + } else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER)) { + putint(TEXT_TO_INT(text_pool[t + 1])); + } +#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE + else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER_HEX)) { + putchar('0'); putchar('x'); + puthex_unsigned(TEXT_TO_INT(text_pool[t + 1])); + } else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER_OCT)) { + putchar('0'); // Note: This is not supported by zsh by default + putoct_unsigned(TEXT_TO_INT(text_pool[t + 1])); + } +#endif + else if (text_pool[t] == TEXT_FROM_INT(TEXT_STRING)) { + print_escaped_string((char*) text_pool[t + 1], (char*) text_pool[t + 2], for_printf); + } else if (text_pool[t] == TEXT_FROM_INT(TEXT_ESCAPED)) { + fatal_error("Cannot escape a string that is already escaped"); + } else { + fatal_error("print_escaped_text: unexpected string tree node"); + } +} + +void print_text(text t) { + int i; + char *s; + + if (t == 0) return; + + if (t < 0) { // it's a character + 
putchar(-t); + } else if (text_pool[t] == TEXT_FROM_INT(TEXT_TREE)) { + i = 0; + while (TEXT_FROM_INT(i) < text_pool[t + 1]) { + if (text_pool[t + i + 2] < 0) { + putchar(-TEXT_TO_CHAR(text_pool[t + i + 2])); + } else { + print_text(TEXT_TO_INT(text_pool[t + i + 2])); + } + i += 1; + } + } else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER)) { + putint(TEXT_TO_INT(text_pool[t + 1])); + } +#ifdef PARSE_NUMERIC_LITERAL_WITH_BASE + else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER_HEX)) { + putchar('0'); putchar('x'); + puthex_unsigned(TEXT_TO_INT(text_pool[t + 1])); + } else if (text_pool[t] == TEXT_FROM_INT(TEXT_INTEGER_OCT)) { + putchar('0'); // Note: This is not supported by zsh by default + putoct_unsigned(TEXT_TO_INT(text_pool[t + 1])); + } +#endif + else if (text_pool[t] == TEXT_FROM_INT(TEXT_STRING)) { + if (TEXT_TO_INT(text_pool[t + 2]) == 0) { // null-terminated string + putstr((char*) text_pool[t + 1]); + } else { // string ends at the address in text_pool[t + 2] + s = (char*) text_pool[t + 1]; // start + while (s < (char*) text_pool[t + 2] || *s != 0) { + putchar(*s); + s += 1; + } + } + } else if (text_pool[t] == TEXT_FROM_INT(TEXT_ESCAPED)) { + print_escaped_text(TEXT_TO_INT(text_pool[t + 1]), TEXT_TO_INT(text_pool[t + 2])); + } else { + fatal_error("print_text: unexpected string tree node"); + } +}