summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile6
-rw-r--r--PKGBUILD2
-rw-r--r--changelog.md11
-rw-r--r--include/luma/print.h4
-rw-r--r--include/luma/utf8dec.h3
-rw-r--r--include/luma/utf8enc.h3
-rw-r--r--main2
-rw-r--r--src/luma/print.c26
-rw-r--r--src/luma/utf8dec.c91
-rw-r--r--src/luma/utf8enc.c67
-rw-r--r--src/main.c18
11 files changed, 166 insertions, 67 deletions
diff --git a/Makefile b/Makefile
index 901e743..ea89197 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,10 @@
CC = clang
CFLAGS = -std=c2x -Wall -Wextra -Wpedantic -I include -march=native -mtune=native -O3
+ifneq ($(debug),1)
+CFLAGS += -DNDEBUG
+else
+CFLAGS += -g
+endif
LDFLAGS =
SRCS = \
src/luma/print.c \
@@ -14,6 +19,7 @@ HDRS = \
OBJS = $(SRCS:.c=.o)
luma: $(OBJS)
$(CC) $(LDFLAGS) $^ -o $@
+$(OBJS): $(HDRS)
.PHONY: run
run: luma
./luma
diff --git a/PKGBUILD b/PKGBUILD
index 6092983..7bdcc4e 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -1,6 +1,6 @@
# Maintainer: Gabriel Jensen
pkgname=luma
-pkgver=20.0.0
+pkgver=21.0.0
pkgrel=1
pkgdesc="luma programming language - runtime environment"
arch=("any")
diff --git a/changelog.md b/changelog.md
index c5c9875..a8ea9bc 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,3 +1,10 @@
+# 19
+
+* Improve Makefile.
+* Improve UTF-8 encoder.
+* Complete UTF-8 decoder.
+* Create basic print function.
+
# 18
* Complete UTF-8 encoder.
@@ -6,8 +13,8 @@
# 17
* Reformat changelog to Markdown.
-* Completely rework codebase (multiple times, in 3 languages). Finally decide on C.
-* Split project into three projects: *libluma* (API), *luma* (interpreter), and *luma-docs* (documentation).
+* Completely rework codebase (multiple times, in C, C++, Objective-C and Rust). Finally decide on C.
+* Split project into three separate projects: *libluma* (API), *luma* (interpreter), and *luma-docs* (documentation).
* Merge with *libluma*.
* Create language sample.
* Use STDC functions instead of POSIX where possible.
diff --git a/include/luma/print.h b/include/luma/print.h
index 0105940..227823c 100644
--- a/include/luma/print.h
+++ b/include/luma/print.h
@@ -1,5 +1,5 @@
# if !defined(LUMA_HDR_PRINT)
# define LUMA_HDR_PRINT
-# include <uchar.h>
-extern void luma_print(char * str,...);
+# include <stdint.h>
+extern void luma_print(uint32_t * str,...);
# endif
diff --git a/include/luma/utf8dec.h b/include/luma/utf8dec.h
index 56ef640..6d6fbe9 100644
--- a/include/luma/utf8dec.h
+++ b/include/luma/utf8dec.h
@@ -1,5 +1,6 @@
# if !defined(LUMA_HDR_UTF8DEC)
# define LUMA_HDR_UTF8DEC
+# include <stddef.h>
# include <stdint.h>
-extern uint32_t * luma_utf8enc(char const * str);
+extern uint32_t * luma_utf8dec(uint8_t const * str,size_t * outszptr);
# endif
diff --git a/include/luma/utf8enc.h b/include/luma/utf8enc.h
index 8b9aa25..5d6d7cf 100644
--- a/include/luma/utf8enc.h
+++ b/include/luma/utf8enc.h
@@ -1,5 +1,6 @@
# if !defined(LUMA_HDR_UTF8ENC)
# define LUMA_HDR_UTF8ENC
+# include <stddef.h>
# include <stdint.h>
-extern uint8_t const * luma_utf8enc(uint32_t * codep);
+extern uint8_t const * luma_utf8enc(uint32_t * codep,size_t * outszptr);
# endif
diff --git a/main b/main
index 21a0fbb..0eaa3ce 100644
--- a/main
+++ b/main
@@ -1,5 +1,5 @@
luma #0
-fn “main” #0
+fn “main”
gæt “std:print”
cɑll “std:print” “Hællo ðære!”
æx #0
diff --git a/src/luma/print.c b/src/luma/print.c
index df53d8a..8d8ac1d 100644
--- a/src/luma/print.c
+++ b/src/luma/print.c
@@ -1,16 +1,24 @@
# include <luma/utf8enc.h>
+# include <stdarg.h>
+# include <stdint.h>
# include <stdio.h>
-# include <string.h>
-void luma_print([[maybe_unused]] char * str,...) {
- /*for(size_t n = 0x0;;++n) {
- if(str[n] == 0x0) {
- fwrite(&(char){0xA},0x1,0x1,stdout);
+void luma_print(uint32_t * msg,...) {
+ va_list args;
+ va_start(args,msg);
+ for(size_t n = (size_t){0x0};;n += (size_t){0x1}) {
+ if(msg[n] == (uint32_t){0x0}) {
+ fwrite(&(uint8_t){0xA},0x1,0x1,stdout);
break;
}
- if(!strcmp(&str[n],"\uFFFD")) {
- fwrite(&(char){0x20},0x1,0x1,stdout);
+ if(msg[n] == (uint32_t){0xFFFD}) {
+ size_t chrsz = (size_t){0x0};
+ uint8_t const * chr = luma_utf8enc((uint32_t[]){va_arg(args,uint32_t),0x0},&chrsz);
+ fwrite(chr,0x1,chrsz - (size_t){0x1},stdout);
continue;
}
- fwrite(&str[n],0x1,0x1,stdout);
- }*/
+ size_t chrsz = (size_t){0x0};
+ uint8_t const * chr = luma_utf8enc((uint32_t[]){msg[n],0x0,0x0},&chrsz);
+ fwrite(chr,0x1,chrsz - (size_t){0x1},stdout);
+ }
+ va_end(args);
}
diff --git a/src/luma/utf8dec.c b/src/luma/utf8dec.c
index 35ef07e..f6e29be 100644
--- a/src/luma/utf8dec.c
+++ b/src/luma/utf8dec.c
@@ -1,8 +1,91 @@
# include <luma/utf8dec.h>
# include <stdint.h>
+# include <stdio.h>
# include <stdlib.h>
-uint32_t * luma_utf8dec([[maybe_unused]] char const * str) {
- uint32_t * utf = malloc(0x4);
- utf[0x0] = (uint32_t){0x0};
- return utf;
+uint32_t * luma_utf8dec(uint8_t const * str,size_t * outszptr) {
+ size_t sz = (size_t){0x0};
+ size_t outsz = (size_t){0x0};
+ for(size_t n = (size_t){0x0};;n += (size_t){0x1}) { // First pass: get size of input array and determine size of output array.
+ uint8_t const utf = str[n];
+ if(utf == (uint8_t){0x0}) { // Null-terminator.
+ sz = n;
+ break;
+ }
+ if(utf >= (uint8_t){0xF0}) { // Four byte.
+ outsz += (size_t){0x4};
+ n += (size_t){0x3};
+ continue;
+ }
+ if(utf >= (uint8_t){0xE0}) { // Three bytes.
+ outsz += (size_t){0x3};
+ n += (size_t){0x2};
+ continue;
+ }
+ if(utf >= (uint8_t){0xC0}) { // Two bytes.
+ outsz += (size_t){0x2};
+ n += (size_t){0x1};
+ continue;
+ }
+ if(utf >= (uint8_t){0x80}) { // One byte.
+ outsz += (size_t){0x1};
+ continue;
+ }
+ // Out of range.
+ return NULL;
+ }
+ outsz += (size_t){0x1}; // Reserve space for null-terminator.
+ if(outszptr != NULL) {
+ *outszptr = outsz;
+ }
+ uint32_t * codeps = malloc(outsz);
+ codeps[outsz - (size_t){0x1}] = (uint32_t){0x0}; // Create null-terminator on output array.
+ for(size_t n = (size_t){0x0}, outn = (size_t){0x0};n < sz;n += (size_t){0x1},outn += (size_t){0x1}) { // Second pass: decode UTF-8.
+ uint8_t utf = str[n];
+ if(utf >= (uint8_t){0xF7}) { // Out of range.
+ return NULL;
+ }
+ if(utf >= (uint8_t){0xF0}) { // Four byte.
+ uint32_t codep = (uint32_t){(utf ^ 0xF0) << 0x12};
+ n += (size_t){0x1};
+ utf = str[n];
+ codep += (uint32_t){(utf ^ 0x80) << 0xC};
+ n += (size_t){0x1};
+ utf = str[n];
+ codep += (uint32_t){(utf ^ 0x80) << 0x6};
+ n += (size_t){0x1};
+ utf = str[n];
+ codep += (uint32_t){(utf ^ 0x80)};
+ codeps[outn] = codep;
+ continue;
+ }
+ if(utf >= (uint8_t){0xE0}) { // Three bytes.
+ uint32_t codep = (uint32_t){(utf ^ 0xE0) << 0xC};
+ n += (size_t){0x1};
+ utf = str[n];
+ codep += (uint32_t){(utf ^ 0x80) << 0x6};
+ n += (size_t){0x1};
+ utf = str[n];
+ codep += (uint32_t){(utf ^ 0x80)};
+ n += (size_t){0x1};
+ codeps[outn] = codep;
+ continue;
+ }
+ if(utf >= (uint8_t){0xC0}) { // Two bytes.
+ uint32_t codep = (uint32_t){(utf ^ 0xC0) << 0x6};
+ n += (size_t){0x1};
+ utf = str[n];
+ codep += (uint32_t){(utf ^ 0x80)};
+ n += (size_t){0x1};
+ codeps[outn] = codep;
+ continue;
+ }
+ if(utf > (uint8_t){0x7F}) { // One byte.
+ uint32_t codep = (uint32_t){utf};
+ codeps[outn] = codep;
+ continue;
+ }
+ // Out of range.
+ return NULL;
+ }
+ return codeps;
}
diff --git a/src/luma/utf8enc.c b/src/luma/utf8enc.c
index 5012f5c..296b56f 100644
--- a/src/luma/utf8enc.c
+++ b/src/luma/utf8enc.c
@@ -2,7 +2,7 @@
# include <stdint.h>
# include <stdio.h>
# include <stdlib.h>
-uint8_t const * luma_utf8enc(uint32_t * codeps) {
+uint8_t const * luma_utf8enc(uint32_t * codeps,size_t * outszptr) {
size_t sz = (size_t){0x0}; // Size of input array (bytes).
size_t outsz = (size_t){0x0}; // Size of output array /bytes).
for(size_t n = (size_t){0x0};;n += (size_t){0x1}) { // First pass: get size of input array, and determine size of output array.
@@ -11,18 +11,18 @@ uint8_t const * luma_utf8enc(uint32_t * codeps) {
sz = n;
break;
}
- if(codep > 0x10FFFF) { // Codepoint out of range.
+ if(codep >= (uint32_t){0x110000}) { // Codepoint out of range.
return NULL;
}
- if(codep > 0xFFFF) { // 4 bytes.
+ if(codep >= (uint32_t){0x10000}) { // 4 bytes.
outsz += (size_t){0x4};
continue;
}
- if(codep > 0x7FF) { // 3 bytes.
+ if(codep >= (uint32_t){0x800}) { // 3 bytes.
outsz += (size_t){0x3};
continue;
}
- if(codep > 0x7F) { // 2 bytes.
+ if(codep >= (uint32_t){0x80}) { // 2 bytes.
outsz += (size_t){0x2};
continue;
}
@@ -30,42 +30,39 @@ uint8_t const * luma_utf8enc(uint32_t * codeps) {
outsz += (size_t){0x1};
}
outsz += (size_t){0x1}; // Add space for null-terminator.
- printf("There are %zu element(s).\n",sz);
- printf("The output will have %zu element(s).\n",outsz);
- uint8_t * outstr = malloc(outsz); // Allocate space for output array.
- outstr[outsz - (size_t){0x1}] = (uint8_t){0x0}; // Create null-terminator on output array.
- size_t outn = (size_t){0x0}; // Keep track of position in output array.
- for(size_t n = (size_t){0x0};n < sz;n += (size_t){0x1}) {
+ if(outszptr != NULL) {
+ *outszptr = outsz;
+ }
+ uint8_t * str = malloc(outsz); // Allocate space for output array.
+ str[outsz - (size_t){0x1}] = (uint8_t){0x0}; // Create null-terminator on output array.
+ for(size_t n = (size_t){0x0}, outn = (size_t){0x0};n < sz;n += (size_t){0x1},outn += (size_t){0x1}) { // Second pass: encode each codepoint into UTF-8.
uint32_t codep = codeps[n]; // Current Unicode codepoint.
- if(codep > 0xFFFF) {
- outstr[outn] = (uint8_t){0xF0 + (codep >> 0x12)};
- outn += (size_t){0x1};
- outstr[outn] = (uint8_t){0x80 + ((codep >> 0xC) & 0x3F)};
- outn += (size_t){0x1};
- outstr[outn] = (uint8_t){0x80 + ((codep >> 0x6) & 0x3F)};
- outn += (size_t){0x1};
- outstr[outn] = (uint8_t){0x80 + ((codep >> 0x0) & 0x3F)};
- outn += (size_t){0x1};
+ if(codep >= 0x10000) { // Four bytes.
+ str[outn] = (uint8_t){0xF0 + (codep >> 0x12)};
+ outn += (size_t){0x1};
+ str[outn] = (uint8_t){0x80 + ((codep >> 0xC) & 0x3F)};
+ outn += (size_t){0x1};
+ str[outn] = (uint8_t){0x80 + ((codep >> 0x6) & 0x3F)};
+ outn += (size_t){0x1};
+ str[outn] = (uint8_t){0x80 + (codep & 0x3F)};
continue;
}
- if(codep > 0x7FF) {
- outstr[outn] = (uint8_t){0xE0 + (codep >> 0xC)};
- outn += (size_t){0x1};
- outstr[outn] = (uint8_t){0x80 + ((codep >> 0x6) & 0x3F)};
- outn += (size_t){0x1};
- outstr[outn] = (uint8_t){0x80 + ((codep >> 0x0) & 0x3F)};
- outn += (size_t){0x1};
+ if(codep >= 0x800) { // Three bytes.
+ str[outn] = (uint8_t){0xE0 + (codep >> 0xC)};
+ outn += (size_t){0x1};
+ str[outn] = (uint8_t){0x80 + ((codep >> 0x6) & 0x3F)};
+ outn += (size_t){0x1};
+ str[outn] = (uint8_t){0x80 + (codep & 0x3F)};
continue;
}
- if(codep > 0x7F) {
- outstr[outn] = (uint8_t){0xC0 + (codep >> 0x6)};
- outn += (size_t){0x1};
- outstr[outn] = (uint8_t){0x80 + ((codep >> 0x0) & 0x3F)};
- outn += (size_t){0x1};
+ if(codep >= 0x80) { // Two bytes.
+ str[outn] = (uint8_t){0xC0 + (codep >> 0x6)};
+ outn += (size_t){0x1};
+ str[outn] = (uint8_t){0x80 + (codep & 0x3F)};
continue;
}
- outstr[outn] = codep;
- outn += (size_t){0x1};
+ // One byte.
+ str[outn] = codep;
}
- return (uint8_t const *){outstr};
+ return (uint8_t const *){str};
}
diff --git a/src/main.c b/src/main.c
index 971a861..e5c7ef6 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,6 +1,7 @@
# include <locale.h>
# include <luma/arch.h>
# include <luma/print.h>
+# include <luma/utf8dec.h>
# include <luma/utf8enc.h>
# include <stdint.h>
# include <stdio.h>
@@ -14,17 +15,12 @@ int main(void) {
for(size_t i = (size_t){0x0};i < sizeof code / sizeof code[0x0];++i) {
printf("Got code %d.\n",code[i]);
}
- uint8_t const * msg = luma_utf8enc((uint32_t[]){0x00A2,0x2C,0x939,0x2C,0x10348,0x2C,0x20Ac,0x2C,0x218A,0x2C,0x1F44B,0x0});
- printf("%s\n",msg);
- //uint32_t * utf = luma_utf8dec(msg);
+ uint8_t const * msg = luma_utf8enc((uint32_t[]){0x00A2,0x2C,0x939,0x2C,0x10348,0x2C,0x20AC,0x2C,0x218A,0x2C,0x1F44B,0x0},NULL);
+ printf("Array: %s\n",msg);
free((void *)msg);
- /*for(size_t n = (size_t){0x0};;n += (size_t){0x1}) {
- if(utf[n] == (uint32_t){0x0}) {
- break;
- }
- printf("%d\n",utf[n]);
- }
- free((void *)utf);*/
- //luma_print("Hello world. �👋");
+ uint32_t * codeps = luma_utf8dec(luma_utf8enc((uint32_t[]){0x1F44B,0x0},NULL),NULL);
+ printf("It is %u.\n",codeps[0x0]);
+ luma_print((uint32_t[]){0x48,0x65,0x6C,0x6C,0x6F,0x20,0xFFFD,0x65,0x72,0x65,0x21,0x0},(uint32_t){0xF0});
+ free((void *)codeps);
exit(EXIT_SUCCESS);
}