liamHowatt
diff --git a/‎.gitignore
Lines changed: 5 additions & 1 deletion b/‎.gitignore
Lines changed: 5 additions & 1 deletion
diff --git a/‎.gitmodules
Lines changed: 1 addition & 1 deletion b/‎.gitmodules
Lines changed: 1 addition & 1 deletion
diff --git a/‎Makefile
Lines changed: 15 additions & 6 deletions b/‎Makefile
Lines changed: 15 additions & 6 deletions
diff --git a/‎README.md
Lines changed: 107 additions & 2 deletions b/‎README.md
Lines changed: 107 additions & 2 deletions
diff --git a/‎benchmarks/pixel_compress/.gitignore
Lines changed: 7 additions & 0 deletions b/‎benchmarks/pixel_compress/.gitignore
Lines changed: 7 additions & 0 deletions
diff --git a/‎benchmarks/pixel_compress/README.md
Lines changed: 38 additions & 0 deletions b/‎benchmarks/pixel_compress/README.md
Lines changed: 38 additions & 0 deletions
diff --git a/‎benchmarks/pixel_compress/c.c
Lines changed: 73 additions & 0 deletions b/‎benchmarks/pixel_compress/c.c
Lines changed: 73 additions & 0 deletions
diff --git a/‎benchmarks/pixel_compress/fs.fs
Lines changed: 86 additions & 0 deletions b/‎benchmarks/pixel_compress/fs.fs
Lines changed: 86 additions & 0 deletions
diff --git a/‎benchmarks/pixel_compress/rgb565data.bin
Lines changed: 6 additions & 0 deletions b/‎benchmarks/pixel_compress/rgb565data.bin
Lines changed: 6 additions & 0 deletions
diff --git a/‎compile-and-run.sh
Lines changed: 4 additions & 4 deletions b/‎compile-and-run.sh
Lines changed: 4 additions & 4 deletions
@@ -1,3 +1,7 @@
 compile-exe
 execute-exe
-*.bin
+/*.bin
+x86-32_backend_generated.c
+x86-32_backend_generated.h
+x86-32_engine_asm.o
+__pycache__
@@ -1,3 +1,3 @@
 [submodule "forth_programs/lvgl/lvgl"]
 	path = forth_programs/lvgl/lvgl
-	url = https://github.com/lvgl/lvgl.git
+	url = https://github.com/liamHowatt/lvgl.git
@@ -1,13 +1,22 @@
 all: compile-exe execute-exe
 
-compile-exe: mcp_forth.h compile.c compile-exe.c num.c vm_backend.c
-	gcc -Wall -fsanitize=address -g compile.c compile-exe.c num.c vm_backend.c -o compile-exe
+x86-32_backend_generated.c: x86_32_backend_generator.py nasm.py
+	python3 x86_32_backend_generator.py
 
-execute-exe: mcp_forth.h execute.c execute-exe.c num.c vm_engine.c
-	gcc -m32 -Wall -fsanitize=address -g execute.c execute-exe.c num.c vm_engine.c -o execute-exe
+compile-exe: mcp_forth.h mcp_forth.c compile.c compile-exe.c vm_backend.c x86-32_backend.c x86-32_backend_generated.c x86-32_backend_generated.h
+	gcc -m32 -Wall -fsanitize=address -g mcp_forth.c compile.c compile-exe.c vm_backend.c x86-32_backend.c x86-32_backend_generated.c -o compile-exe
+
+x86-32_engine_asm.o: x86-32_engine_asm.s
+	nasm -felf32 -o x86-32_engine_asm.o x86-32_engine_asm.s
+
+execute-exe: mcp_forth.h mcp_forth.c execute-exe.c runtime_io.c runtime_time.c runtime_string.c runtime_process.c runtime_file.c vm_engine.c x86-32_engine.c x86-32_engine_asm.o
+	gcc -m32 -Wall -fsanitize=address -g mcp_forth.c execute-exe.c runtime_io.c runtime_time.c runtime_string.c runtime_process.c runtime_file.c vm_engine.c x86-32_engine.c x86-32_engine_asm.o -o execute-exe
 
 test-simple: all
-	find forth_programs/simple -maxdepth 1 -type f | xargs -I{} ./compile-and-run.sh {}
+	find forth_programs/simple -maxdepth 1 -type f | xargs -I{} ./compile-and-run.sh vm {}
+
+test-simple-x86: all
+	find forth_programs/simple -maxdepth 1 -type f | xargs -I{} ./compile-and-run.sh x86 {}
 
 clean:
-	rm -f compile-exe execute-exe *.bin
+	rm -f compile-exe execute-exe x86-32_backend_generated.c x86-32_backend_generated.h x86-32_engine_asm.o
@@ -10,6 +10,111 @@ make test-simple
 How to run `life.fs` and the hash_xors
 
 ```sh
-./compile-and-run.sh forth_programs/life/life.fs < forth_programs/life/starting_board.txt
-./compile-and-run.sh forth_programs/hash_xor/hash_xor.fs < forth_programs/hash_xor/hash_input.txt
+./compile-and-run.sh vm forth_programs/life/life.fs < forth_programs/life/starting_board.txt
+./compile-and-run.sh vm forth_programs/hash_xor/hash_xor.fs < forth_programs/hash_xor/hash_input.txt
 ```
+
+## Performance Measurements
+
+| Test                             | Gforth  | mcp-forth vm -O3 | mcp-forth x86 -O3 | C equivalent -O3 |
+| -------------------------------- | ------- | ---------------- | ----------------- | ---------------- |
+| SPI pixel data compression       | 12.754s | 1m48.735s        | 12.978s           | 0.432s           |
+
+See the benchmarks directory for the test source files.
+
+## Why
+
+I need portable driver code.
+
+An imaginary display module has a Forth program in its ROM which is retrieved by the host
+and executed to copy updated areas to the display. This display is not trivially controlled.
+It receives a compressed stream of data over SPI. The Forth program has a function with
+a host-expected signature like `( x1 y1 x2 y2 src -- )` which implements the compression
+algorithm and uses host-provided facilities to achieve the SPI transmission. It needs
+to be fast so mcp-forth must be capable of producing somewhat optimal machine code. Forth is
+the chosen language instead of C because I need the compiler to use a minimal amount of
+memory to do the compilation. The mcp-forth compiler is intended to run on hosts which are
+MCUs with RAM around the range of 100 kB to 10 MB. A secondary requirement is the host cross-compiling
+Forth programs to run on peripheral MCUs with as little as 8 kB of RAM.
+
+## Supported Architectures
+
+The bytecode VM is one of the available compiler backends. It exists for ease of development
+and also has some freestanding merit, such as being portable to platforms for which there isn't a
+native backend yet. It will likely always boast the smallest binary size across architectures
+due to using a stream of variable-length numbers as its encoding for opcodes and operands.
+It is the default choice for testing new Forth code due to having over/underflow/run
+checks where the other backends may forgo safety in favor of speed. In summary, it is the default
+choice unless speed is a requirement.
+
+Currently supported architectures:
+
+- Interpreted bytecode VM (explained above)
+- x86-32
+
+Planned:
+
+- ARM Thumb (Cortex M0+)
+- Xtensa (LX6, i.e. ESP32)
+
+## 32 Bits
+
+For simplicity, mcp-forth only supports 32 bit. The C code that implements the runtimes assumes
+that both `int` and `void *` are 32 bits wide. The Forth cell size in mcp-forth is always 4 bytes. Pointers
+can be transparently handled as integers. The compiler can work on 64 bit machines so cross-compiling
+Forth programs on a 64 bit host is possible but running the output is only possible if
+the host supports some kind of 32 bit mode where pointers are 32 bits wide. This means that
+Apple M1, M2, M3, and M4 processors cannot execute the output of the mcp-forth compiler even with the
+VM runtime because they have no support for 32 bit programs. An emulator such as Qemu is required.
+
+## Non-standard Quirks
+
+- `C' <word>` works like `' <word>` except it creates a C function pointer from the word so that
+  Forth words can be used as C callback functions. The number of parameters and optional
+  return value is derived from the `( -- )` signature and an error is raised at compile time
+  if the signature is missing. The `( -- )` signature has no other semantic meaning besides this.
+- Currently, defined words must only be used after they're defined or else a compile time
+  error is raised.
+- Any word that was not found at compile time is a runtime dependency and must be provided by
+  the runtime.
+- Gforth's "compile time only words" can be used outside of functions in mcp-forth.
+- `UNLOOP` is not required (and will be a no-op if added in the future)
+
+## Minutiae
+
+### Iterative "Fragment Solving"
+
+Fragments aka snippets of machine code have variable sizes depending on their operands. If a jump
+instruction jumps somewhere nearby, it may only use 1 byte to encode the offset, otherwise 4 bytes.
+Literal values are similar. An immediate literal may be loaded into a register differently
+depending on its size. Some architectures require multiple instructions to load larger immediate
+literal values.
+
+Given that the jump distance may not be known at the time of a jump fragment's creation, the
+collection of all fragments at the end of compilation must be solved in an iterative way to
+achieve optimal packing.
+
+Question: will iterative solving ever cause the compiler to hang in an infinite loop that it can't solve?
+
+### Optimizing Compiler Memory Usage
+
+The compiler allocates a few arrays which it repeatedly appends elements to during compilation
+and resizes them when their capacities are exceeded. There are no small allocations since the overhead
+of N allocations of a small struct may be greater than an allocated contiguous array of N small structs.
+
+Strings referring to source code tokens are not allocated arrays of bytes.
+They are pointers into the source code and a length.
+
+Since struct references are always being invalidated due to array resizing, struct references
+are stored as array indices instead of pointers.
+
+Question: is it a good or bad idea to reduce memory usage by:
+
+- Storing strings as only a pointer with no length. The strings are whitespace-terminated since they
+  point inside the source code. There is a special case for a string that is the last token in the
+  source with no following whitespace.
+- Extending the previous point, should 16 bit offsets into the source be used instead of 32 bit pointers?
+- For indices into arrays of structs, should 16 bit indices be used instead of 32 bit ints?
+- For a case where less-than-32-bit integer types are used to store offsets, can all the members of
+  an array of offsets be dynamically promoted as needed? The first offset that exceeds 65535 would
+  cause the array to be converted from an array of 16 bit offsets to an array of 32 bit offsets.
@@ -0,0 +1,7 @@
+c
+fs.vm.bin
+fs.x86.bin
+res_c
+res_gforth
+res_m4_vm
+res_m4_x86
@@ -0,0 +1,38 @@
+The results of this benchmark are in the root README.md
+
+This test compresses a pixel buffer and outputs
+the result to stdout. The test is run 1000 times
+redundantly, only outputting once.
+
+The C code is based on this: https://bitbucket.org/4DPi/gen4-hats/src/236d9c064b06640f6a33be649ddd0aae147e0239/4d-hats.c#lines-781:830
+
+```sh
+gcc -Wall -O3 c.c -o c
+time ./c < ~/Downloads/rgb565data.bin > res_c
+```
+
+The Forth version reads the input file internally
+from a hardcoded path.
+
+```sh
+time gforth fs.fs -e bye > res_gforth
+```
+
+In the Makefile entry for `execute-exe`, replace
+`-fsanitize=address -g` with `-O3`.
+
+```sh
+../../compile-exe vm fs.fs fs.vm.bin
+time ../../execute-exe vm fs.vm.bin > res_m4_vm
+
+../../compile-exe x86 fs.fs fs.x86.bin
+time ../../execute-exe x86 fs.x86.bin > res_m4_x86
+```
+
+Assert that all the outputs are the same.
+
+```sh
+cmp res_c res_gforth
+cmp res_c res_m4_vm
+cmp res_c res_m4_x86
+```
@@ -0,0 +1,73 @@
+#include <stdint.h>
+#include <stdio.h>
+
+static struct {uint8_t buf[1024 * 1024];} lcdpi_spiblock_dma;
+
+static void lcdpi_compress(int *ptr, unsigned short *pixel_buffer, int pixel_buffer_len, unsigned short *codes, unsigned short speedup) /* Run-length compress pixel_buffer into lcdpi_spiblock_dma.buf; *ptr counts the 16-bit words written and is advanced once per word. */
+{ /* speedup is used as a divisor and as an index into codes, so it must be non-zero; pixel_buffer[0] is read unconditionally. */
+	int j;
+	int x;
+	unsigned short value, last_value, mask;
+	int repeated;
+
+	mask = (codes[0] & 0x8000) ^ 0x8000; /* bit 15 set iff bit 15 of codes[0] is clear */
+	value = (((pixel_buffer[0] & 0xffc0) >> 1) + (pixel_buffer[0] & 0x001f)) | mask; /* bits 15..6 move down by one, bits 4..0 stay, input bit 5 is dropped, then mask is OR'ed in */
+	lcdpi_spiblock_dma.buf[((*ptr) * 2) + 1] = (value >> 8); /* high byte first; word number *ptr lands at byte offset *ptr * 2 + 1 */
+	lcdpi_spiblock_dma.buf[((*ptr) * 2) + 2] = (value) & 0xff;
+	(*ptr)++;
+	last_value = value;
+	repeated = 0; /* number of following pixels that matched last_value since it was written */
+	for (x = 1; x < pixel_buffer_len; x++) {
+		value = (((pixel_buffer[x] & 0xffc0) >> 1) + (pixel_buffer[x] & 0x001f)) | mask;
+		if (value != last_value) { /* run ended: flush the pending repeat count, then write the new value */
+			for (j = 0; j < (repeated / speedup); j++) { /* one codes[speedup] word per full group of speedup repeats */
+				lcdpi_spiblock_dma.buf[((*ptr) * 2) + 1] = codes[speedup] >> 8;
+				lcdpi_spiblock_dma.buf[((*ptr) * 2) + 2] = codes[speedup] &0xff;
+				(*ptr)++;
+			}
+
+			if(repeated % speedup) { /* leftover repeats are written as codes[remainder] */
+				lcdpi_spiblock_dma.buf[((*ptr) * 2) + 1] = codes[repeated % speedup] >> 8;
+				lcdpi_spiblock_dma.buf[((*ptr) * 2) + 2] = codes[repeated % speedup] &0xff;
+				(*ptr)++;
+			}
+
+			lcdpi_spiblock_dma.buf[((*ptr) * 2) + 1] = (value >> 8);
+			lcdpi_spiblock_dma.buf[((*ptr) * 2) + 2] = (value) & 0xff;
+			(*ptr)++;
+			last_value = value;
+			repeated = 0;
+		} else {
+			repeated++;
+		}
+	}
+	for (j = 0; j < repeated / speedup; j++) { /* flush the repeats still pending after the last pixel, same encoding as inside the loop */
+		lcdpi_spiblock_dma.buf[((*ptr) * 2) + 1] = codes[speedup] >> 8;
+		lcdpi_spiblock_dma.buf[((*ptr) * 2) + 2] = codes[speedup] &0xff;
+		(*ptr)++;
+	}
+
+	if(repeated % speedup) {
+		lcdpi_spiblock_dma.buf[((*ptr) * 2) + 1] = codes[repeated % speedup] >> 8;
+		lcdpi_spiblock_dma.buf[((*ptr) * 2) + 2] = codes[repeated % speedup] &0xff;
+		(*ptr)++;
+	}
+}
+
+int main()
+{
+    /* Benchmark driver: read raw pixel bytes from stdin, run lcdpi_compress over
+       them 1000 times, and write the output buffer of the final run to stdout. */
+    int ptr = 0;
+    static uint8_t pb[1024 * 1024] __attribute__ ((aligned (2)));
+    /* lcdpi_compress writes at most one 16-bit word per input pixel, starting at
+       byte offset 1 of an output buffer that is the same size as pb. Accepting at
+       most sizeof(pb) - 1 input bytes keeps both buffers in bounds. */
+    const int pb_max = (int) sizeof(pb) - 1;
+    int pbl = 0;
+    int c;
+    while((c = getchar()) != EOF) {
+        if(pbl >= pb_max) {
+            fprintf(stderr, "input too large (limit is %d bytes)\n", pb_max);
+            return 1;
+        }
+        pb[pbl++] = c;
+    }
+    lcdpi_spiblock_dma.buf[0] = 0x42;
+    unsigned short codes[] = {0xaaab, 0x80ff, 0x8cff, 0x8ccf, 0x98c9, 0xaabf, 0xaaaf, 0xaaab};
+    /* Every iteration restarts at ptr = 0, so only the last run's output survives. */
+    for (int i = 0 ; i < 1000 ; i++) {
+        ptr = 0;
+        lcdpi_compress(&ptr, (unsigned short *) pb, pbl / 2, codes, 7);
+    }
+    /* One leading byte (0x42) followed by ptr 16-bit words. */
+    fwrite(lcdpi_spiblock_dma.buf, 1, ptr * 2 + 1, stdout);
+}
@@ -0,0 +1,86 @@
+: compute_value ( hw -- hw )
+	dup 0xffc0 and 2/
+	swap 0x001f and
+	or
+;
+
+: pre_inc2 ( addr -- addr+2 addr+2 )
+	2 + dup
+;
+
+: store ( dst val -- )
+	2dup swap 1+ c!
+	8 rshift swap c!
+;
+
+variable codes
+	align here codes !
+	0xaaab , 0x80ff , 0x8cff , 0x8ccf , 0x98c9 , 0xaabf , 0xaaaf ,
+variable last_value
+variable repeated
+variable start
+
+: helper ( dst -- dst )
+	repeated @ 7 / dup 0<> if 0 do
+		pre_inc2 0xaaab store
+	loop else drop then
+
+	repeated @ 7 mod dup 0<> if
+		>r pre_inc2 r> cells codes @ + @ store
+	else drop then
+;
+
+: lcdpi_compress ( src dst src_len -- out_len )
+	>r
+	dup start !
+	1+
+	swap pre_inc2 w@ compute_value last_value ! swap
+	pre_inc2 last_value @ store
+	0 repeated !
+	r> 1 do
+		swap pre_inc2 w@ compute_value >r swap r>
+		dup last_value @ <> if
+			last_value !
+
+			helper
+
+			pre_inc2 last_value @ store
+
+			0 repeated !
+		else
+			drop
+			1 repeated +!
+		then
+	loop
+
+	helper
+
+	swap drop
+	start @ -
+;
+
+: check_error ( wior -- )
+	0<> if ." error" cr bye then
+;
+
+variable byte_read
+: read_file ( -- )
+	s" /home/liam/Downloads/rgb565data.bin" r/o open-file check_error
+	begin byte_read 1 2 pick read-file check_error 0> while byte_read @ c, repeat
+	close-file check_error
+;
+
+: main ( -- )
+	here 1024 dup * allot
+	66 over c!
+	dup align here read_file here over - 2/ >r 2 - swap 2 - r>
+	-1
+	1000 0 do
+		drop
+		2 pick 2 pick 2 pick lcdpi_compress
+	loop
+	4 pick swap
+	type
+;
+
+main
@@ -1,8 +1,8 @@
 #!/bin/bash
 
-BASENAME=$(basename $1 .fs)
+BASENAME=$(basename $2 .fs)
 BINNAME="${BASENAME}.bin"
-echo $1
-./compile-exe vm $1 $BINNAME
-./execute-exe vm $BINNAME
+echo $2
+./compile-exe $1 $2 $BINNAME
+./execute-exe $1 $BINNAME
 echo