diff options
-rw-r--r-- | Makefile | 6 | ||||
-rw-r--r-- | PKGBUILD | 2 | ||||
-rw-r--r-- | changelog.md | 11 | ||||
-rw-r--r-- | include/luma/print.h | 4 | ||||
-rw-r--r-- | include/luma/utf8dec.h | 3 | ||||
-rw-r--r-- | include/luma/utf8enc.h | 3 | ||||
-rw-r--r-- | main | 2 | ||||
-rw-r--r-- | src/luma/print.c | 26 | ||||
-rw-r--r-- | src/luma/utf8dec.c | 91 | ||||
-rw-r--r-- | src/luma/utf8enc.c | 67 | ||||
-rw-r--r-- | src/main.c | 18 |
11 files changed, 166 insertions, 67 deletions
@@ -1,5 +1,10 @@ CC = clang CFLAGS = -std=c2x -Wall -Wextra -Wpedantic -I include -march=native -mtune=native -O3 +ifneq ($(debug),1) +CFLAGS += -DNDEBUG +else +CFLAGS += -g +endif LDFLAGS = SRCS = \ src/luma/print.c \ @@ -14,6 +19,7 @@ HDRS = \ OBJS = $(SRCS:.c=.o) luma: $(OBJS) $(CC) $(LDFLAGS) $^ -o $@ +$(OBJS): $(HDRS) .PHONY: run run: luma ./luma @@ -1,6 +1,6 @@ # Maintainer: Gabriel Jensen pkgname=luma -pkgver=20.0.0 +pkgver=21.0.0 pkgrel=1 pkgdesc="luma programming language - runtime environment" arch=("any") diff --git a/changelog.md b/changelog.md index c5c9875..a8ea9bc 100644 --- a/changelog.md +++ b/changelog.md @@ -1,3 +1,10 @@ +# 19 + +* Improve Makefile. +* Improve UTF-8 encoder. +* Complete UTF-8 decoder. +* Create basic print function. + # 18 * Complete UTF-8 encoder. @@ -6,8 +13,8 @@ # 17 * Reformat changelog to Markdown. -* Completely rework codebase (multiple times, in 3 languages). Finally decide on C. -* Split project into three projects: *libluma* (API), *luma* (interpreter), and *luma-docs* (documentation). +* Completely rework codebase (multiple times, in C, C++, Objective-C and Rust). Finally decide on C. +* Split project into three separate projects: *libluma* (API), *luma* (interpreter), and *luma-docs* (documentation). * Merge with *libluma*. * Create language sample. * Use STDC functions instead of POSIX where possible. 
diff --git a/include/luma/print.h b/include/luma/print.h index 0105940..227823c 100644 --- a/include/luma/print.h +++ b/include/luma/print.h @@ -1,5 +1,5 @@ # if !defined(LUMA_HDR_PRINT) # define LUMA_HDR_PRINT -# include <uchar.h> -extern void luma_print(char * str,...); +# include <stdint.h> +extern void luma_print(uint32_t * str,...); # endif diff --git a/include/luma/utf8dec.h b/include/luma/utf8dec.h index 56ef640..6d6fbe9 100644 --- a/include/luma/utf8dec.h +++ b/include/luma/utf8dec.h @@ -1,5 +1,6 @@ # if !defined(LUMA_HDR_UTF8DEC) # define LUMA_HDR_UTF8DEC +# include <stddef.h> # include <stdint.h> -extern uint32_t * luma_utf8enc(char const * str); +extern uint32_t * luma_utf8dec(uint8_t const * str,size_t * outszptr); # endif diff --git a/include/luma/utf8enc.h b/include/luma/utf8enc.h index 8b9aa25..5d6d7cf 100644 --- a/include/luma/utf8enc.h +++ b/include/luma/utf8enc.h @@ -1,5 +1,6 @@ # if !defined(LUMA_HDR_UTF8ENC) # define LUMA_HDR_UTF8ENC +# include <stddef.h> # include <stdint.h> -extern uint8_t const * luma_utf8enc(uint32_t * codep); +extern uint8_t const * luma_utf8enc(uint32_t * codep,size_t * outszptr); # endif @@ -1,5 +1,5 @@ luma #0 -fn “main” #0 +fn “main” gæt “std:print” cɑll “std:print” “Hællo ðære!” æx #0 diff --git a/src/luma/print.c b/src/luma/print.c index df53d8a..8d8ac1d 100644 --- a/src/luma/print.c +++ b/src/luma/print.c @@ -1,16 +1,24 @@ # include <luma/utf8enc.h> +# include <stdarg.h> +# include <stdint.h> # include <stdio.h> -# include <string.h> -void luma_print([[maybe_unused]] char * str,...) { - /*for(size_t n = 0x0;;++n) { - if(str[n] == 0x0) { - fwrite(&(char){0xA},0x1,0x1,stdout); +void luma_print(uint32_t * msg,...) 
{ + va_list args; + va_start(args,msg); + for(size_t n = (size_t){0x0};;n += (size_t){0x1}) { + if(msg[n] == (uint32_t){0x0}) { + fwrite(&(uint8_t){0xA},0x1,0x1,stdout); break; } - if(!strcmp(&str[n],"\uFFFD")) { - fwrite(&(char){0x20},0x1,0x1,stdout); + if(msg[n] == (uint32_t){0xFFFD}) { + size_t chrsz = (size_t){0x0}; + uint8_t const * chr = luma_utf8enc((uint32_t[]){va_arg(args,uint32_t),0x0},&chrsz); + fwrite(chr,0x1,chrsz - (size_t){0x1},stdout); continue; } - fwrite(&str[n],0x1,0x1,stdout); - }*/ + size_t chrsz = (size_t){0x0}; + uint8_t const * chr = luma_utf8enc((uint32_t[]){msg[n],0x0,0x0},&chrsz); + fwrite(chr,0x1,chrsz - (size_t){0x1},stdout); + } + va_end(args); } diff --git a/src/luma/utf8dec.c b/src/luma/utf8dec.c index 35ef07e..f6e29be 100644 --- a/src/luma/utf8dec.c +++ b/src/luma/utf8dec.c @@ -1,8 +1,91 @@ # include <luma/utf8dec.h> # include <stdint.h> +# include <stdio.h> # include <stdlib.h> -uint32_t * luma_utf8dec([[maybe_unused]] char const * str) { - uint32_t * utf = malloc(0x4); - utf[0x0] = (uint32_t){0x0}; - return utf; +uint32_t * luma_utf8dec(uint8_t const * str,size_t * outszptr) { + size_t sz = (size_t){0x0}; + size_t outsz = (size_t){0x0}; + for(size_t n = (size_t){0x0};;n += (size_t){0x1}) { // First pass: get size of input array and determine size of output array. + uint8_t const utf = str[n]; + if(utf == (uint8_t){0x0}) { // Null-terminator. + sz = n; + break; + } + if(utf >= (uint8_t){0xF0}) { // Four byte. + outsz += (size_t){0x4}; + n += (size_t){0x3}; + continue; + } + if(utf >= (uint8_t){0xE0}) { // Three bytes. + outsz += (size_t){0x3}; + n += (size_t){0x2}; + continue; + } + if(utf >= (uint8_t){0xC0}) { // Two bytes. + outsz += (size_t){0x2}; + n += (size_t){0x1}; + continue; + } + if(utf >= (uint8_t){0x80}) { // One byte. + outsz += (size_t){0x1}; + continue; + } + // Out of range. + return NULL; + } + outsz += (size_t){0x1}; // Reserve space for null-terminator. 
+ if(outszptr != NULL) { + *outszptr = outsz; + } + uint32_t * codeps = malloc(outsz); + codeps[outsz - (size_t){0x1}] = (uint32_t){0x0}; // Create null-terminator on output array. + for(size_t n = (size_t){0x0}, outn = (size_t){0x0};n < sz;n += (size_t){0x1},outn += (size_t){0x1}) { // Second pass: decode UTF-8. + uint8_t utf = str[n]; + if(utf >= (uint8_t){0xF7}) { // Out of range. + return NULL; + } + if(utf >= (uint8_t){0xF0}) { // Four byte. + uint32_t codep = (uint32_t){(utf ^ 0xF0) << 0x12}; + n += (size_t){0x1}; + utf = str[n]; + codep += (uint32_t){(utf ^ 0x80) << 0xC}; + n += (size_t){0x1}; + utf = str[n]; + codep += (uint32_t){(utf ^ 0x80) << 0x6}; + n += (size_t){0x1}; + utf = str[n]; + codep += (uint32_t){(utf ^ 0x80)}; + codeps[outn] = codep; + continue; + } + if(utf >= (uint8_t){0xE0}) { // Three bytes. + uint32_t codep = (uint32_t){(utf ^ 0xE0) << 0xC}; + n += (size_t){0x1}; + utf = str[n]; + codep += (uint32_t){(utf ^ 0x80) << 0x6}; + n += (size_t){0x1}; + utf = str[n]; + codep += (uint32_t){(utf ^ 0x80)}; + n += (size_t){0x1}; + codeps[outn] = codep; + continue; + } + if(utf >= (uint8_t){0xC0}) { // Two bytes. + uint32_t codep = (uint32_t){(utf ^ 0xC0) << 0x6}; + n += (size_t){0x1}; + utf = str[n]; + codep += (uint32_t){(utf ^ 0x80)}; + n += (size_t){0x1}; + codeps[outn] = codep; + continue; + } + if(utf > (uint8_t){0x7F}) { // One byte. + uint32_t codep = (uint32_t){utf}; + codeps[outn] = codep; + continue; + } + // Out of range. + return NULL; + } + return codeps; } diff --git a/src/luma/utf8enc.c b/src/luma/utf8enc.c index 5012f5c..296b56f 100644 --- a/src/luma/utf8enc.c +++ b/src/luma/utf8enc.c @@ -2,7 +2,7 @@ # include <stdint.h> # include <stdio.h> # include <stdlib.h> -uint8_t const * luma_utf8enc(uint32_t * codeps) { +uint8_t const * luma_utf8enc(uint32_t * codeps,size_t * outszptr) { size_t sz = (size_t){0x0}; // Size of input array (bytes). size_t outsz = (size_t){0x0}; // Size of output array /bytes). 
for(size_t n = (size_t){0x0};;n += (size_t){0x1}) { // First pass: get size of input array, and determine size of output array. @@ -11,18 +11,18 @@ uint8_t const * luma_utf8enc(uint32_t * codeps) { sz = n; break; } - if(codep > 0x10FFFF) { // Codepoint out of range. + if(codep >= (uint32_t){0x110000}) { // Codepoint out of range. return NULL; } - if(codep > 0xFFFF) { // 4 bytes. + if(codep >= (uint32_t){0x10000}) { // 4 bytes. outsz += (size_t){0x4}; continue; } - if(codep > 0x7FF) { // 3 bytes. + if(codep >= (uint32_t){0x800}) { // 3 bytes. outsz += (size_t){0x3}; continue; } - if(codep > 0x7F) { // 2 bytes. + if(codep >= (uint32_t){0x80}) { // 2 bytes. outsz += (size_t){0x2}; continue; } @@ -30,42 +30,39 @@ uint8_t const * luma_utf8enc(uint32_t * codeps) { outsz += (size_t){0x1}; } outsz += (size_t){0x1}; // Add space for null-terminator. - printf("There are %zu element(s).\n",sz); - printf("The output will have %zu element(s).\n",outsz); - uint8_t * outstr = malloc(outsz); // Allocate space for output array. - outstr[outsz - (size_t){0x1}] = (uint8_t){0x0}; // Create null-terminator on output array. - size_t outn = (size_t){0x0}; // Keep track of position in output array. - for(size_t n = (size_t){0x0};n < sz;n += (size_t){0x1}) { + if(outszptr != NULL) { + *outszptr = outsz; + } + uint8_t * str = malloc(outsz); // Allocate space for output array. + str[outsz - (size_t){0x1}] = (uint8_t){0x0}; // Create null-terminator on output array. + for(size_t n = (size_t){0x0}, outn = (size_t){0x0};n < sz;n += (size_t){0x1},outn += (size_t){0x1}) { // Second pass: encode each codepoint into UTF-8. uint32_t codep = codeps[n]; // Current Unicode codepoint. 
- if(codep > 0xFFFF) { - outstr[outn] = (uint8_t){0xF0 + (codep >> 0x12)}; - outn += (size_t){0x1}; - outstr[outn] = (uint8_t){0x80 + ((codep >> 0xC) & 0x3F)}; - outn += (size_t){0x1}; - outstr[outn] = (uint8_t){0x80 + ((codep >> 0x6) & 0x3F)}; - outn += (size_t){0x1}; - outstr[outn] = (uint8_t){0x80 + ((codep >> 0x0) & 0x3F)}; - outn += (size_t){0x1}; + if(codep >= 0x10000) { // Four bytes. + str[outn] = (uint8_t){0xF0 + (codep >> 0x12)}; + outn += (size_t){0x1}; + str[outn] = (uint8_t){0x80 + ((codep >> 0xC) & 0x3F)}; + outn += (size_t){0x1}; + str[outn] = (uint8_t){0x80 + ((codep >> 0x6) & 0x3F)}; + outn += (size_t){0x1}; + str[outn] = (uint8_t){0x80 + (codep & 0x3F)}; continue; } - if(codep > 0x7FF) { - outstr[outn] = (uint8_t){0xE0 + (codep >> 0xC)}; - outn += (size_t){0x1}; - outstr[outn] = (uint8_t){0x80 + ((codep >> 0x6) & 0x3F)}; - outn += (size_t){0x1}; - outstr[outn] = (uint8_t){0x80 + ((codep >> 0x0) & 0x3F)}; - outn += (size_t){0x1}; + if(codep >= 0x800) { // Three bytes. + str[outn] = (uint8_t){0xE0 + (codep >> 0xC)}; + outn += (size_t){0x1}; + str[outn] = (uint8_t){0x80 + ((codep >> 0x6) & 0x3F)}; + outn += (size_t){0x1}; + str[outn] = (uint8_t){0x80 + (codep & 0x3F)}; continue; } - if(codep > 0x7F) { - outstr[outn] = (uint8_t){0xC0 + (codep >> 0x6)}; - outn += (size_t){0x1}; - outstr[outn] = (uint8_t){0x80 + ((codep >> 0x0) & 0x3F)}; - outn += (size_t){0x1}; + if(codep >= 0x80) { // Two bytes. + str[outn] = (uint8_t){0xC0 + (codep >> 0x6)}; + outn += (size_t){0x1}; + str[outn] = (uint8_t){0x80 + (codep & 0x3F)}; continue; } - outstr[outn] = codep; - outn += (size_t){0x1}; + // One byte. 
+ str[outn] = codep; } - return (uint8_t const *){outstr}; + return (uint8_t const *){str}; } @@ -1,6 +1,7 @@ # include <locale.h> # include <luma/arch.h> # include <luma/print.h> +# include <luma/utf8dec.h> # include <luma/utf8enc.h> # include <stdint.h> # include <stdio.h> @@ -14,17 +15,12 @@ int main(void) { for(size_t i = (size_t){0x0};i < sizeof code / sizeof code[0x0];++i) { printf("Got code %d.\n",code[i]); } - uint8_t const * msg = luma_utf8enc((uint32_t[]){0x00A2,0x2C,0x939,0x2C,0x10348,0x2C,0x20Ac,0x2C,0x218A,0x2C,0x1F44B,0x0}); - printf("%s\n",msg); - //uint32_t * utf = luma_utf8dec(msg); + uint8_t const * msg = luma_utf8enc((uint32_t[]){0x00A2,0x2C,0x939,0x2C,0x10348,0x2C,0x20AC,0x2C,0x218A,0x2C,0x1F44B,0x0},NULL); + printf("Array: %s\n",msg); free((void *)msg); - /*for(size_t n = (size_t){0x0};;n += (size_t){0x1}) { - if(utf[n] == (uint32_t){0x0}) { - break; - } - printf("%d\n",utf[n]); - } - free((void *)utf);*/ - //luma_print("Hello world. �👋"); + uint32_t * codeps = luma_utf8dec(luma_utf8enc((uint32_t[]){0x1F44B,0x0},NULL),NULL); + printf("It is %u.\n",codeps[0x0]); + luma_print((uint32_t[]){0x48,0x65,0x6C,0x6C,0x6F,0x20,0xFFFD,0x65,0x72,0x65,0x21,0x0},(uint32_t){0xF0}); + free((void *)codeps); exit(EXIT_SUCCESS); } |