Conversion utility of the database of language orthographies rewritten…

… in perl.
virxkane · Jul 7, 2021 · aeb8741 · aeb8741
1 parent a779a56
commit aeb8741
Show file tree

Hide file tree

Showing 262 changed files with 690 additions and 1,114 deletions.
diff --git a/fc-lang_conv/CMakeLists.txt b/fc-lang_conv/CMakeLists.txt
@@ -2,27 +2,8 @@
 # Find includes in corresponding build directories
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 
-set(EXE_NAME fc-lang_conv)
-
 set(SRC_LIST
-	main.c
+	fc-lang-data.c
 )
 
-if(WIN32)
-	set(SRC_LIST ${SRC_LIST}
-		getline_win32.c
-	)
-endif(WIN32)
-
-set(LDADD_LIBS)
-if(WIN32)
-	# -mconsole -> console application (with terminal screen)
-	set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mconsole")
-endif(WIN32)
-
-add_executable(${EXE_NAME} WIN32 ${SRC_LIST})
-target_link_libraries(${EXE_NAME} ${LDADD_LIBS})
-
-add_library(fc-lang-cat STATIC fc-lang-cat.c)
-
-configure_file(update.sh.cmake ${CMAKE_CURRENT_BINARY_DIR}/update.sh @ONLY)
+add_library(fc-lang-data STATIC ${SRC_LIST})
diff --git a/fc-lang_conv/conv.pl b/fc-lang_conv/conv.pl
@@ -0,0 +1,323 @@
+#!/usr/bin/perl
+
+# Input directory with the FontConfig *.orth files, and output directory
+# for the generated per-language C files.
+$fc_lang_dir      = "../fc-lang";
+$fc_lang_conv_dir = "files";
+
+# Names of the generated catalog source and header files.
+$cat_srcfile = "fc-lang-data.c";
+$cat_hdrfile = "fc-lang-data.h";
+
+# Marker that parseFile() writes in front of a (first, last) pair.
+# The value lies above 0x10FFFF, so it is never a valid code point.
+use constant CODE_IN_RANGE => 0xF0F0FFFF;
+
+# Forward declarations; the subs are defined at the end of the file.
+sub processFile($$$);
+sub parseFile($$$);
+
+# Collect the names of all *.orth files in sorted order.
+{
+    opendir(my $dirh, ${fc_lang_dir})
+        or die "Failed to open directory!\n";
+    my @found = grep { m/^.*\.orth$/ } readdir($dirh);
+    closedir($dirh);
+    @orth_files = sort(@found);
+}
+
+# Convert every orth file; remember the C files that were produced
+# successfully and count them for FC_LANG_DATA_SZ.
+$count = 0;
+@good_src_files = ();
+for my $fname (@orth_files) {
+    next unless $fname =~ m/^(.*)\.orth$/;
+    my $langtag    = $1;
+    my $c_src_name = "${langtag}_orth.c";
+    my $converted  = processFile($langtag,
+                                 "${fc_lang_dir}/${fname}",
+                                 "${fc_lang_conv_dir}/${c_src_name}");
+    next unless $converted;
+    push @good_src_files, ${c_src_name};
+    $count++;
+}
+
+# Create catalog header file.
+# It declares struct fc_lang_rec, the accessor functions and
+# FC_LANG_DATA_SZ (the number of successfully converted orth files).
+# Three-argument open with a lexical handle is used so that the file name
+# can never be interpreted as an open mode.
+open(my $hdr_fh, '>', $cat_hdrfile)
+	or die "Unable to open $cat_hdrfile: $!";
+print $hdr_fh <<EOF;
+// FontConfig database of language orthographies.
+// License: Public Domain.
+// This file is autogenerated from fc-lang database.
+// https://www.freedesktop.org/wiki/Software/fontconfig/
+// https://gitlab.freedesktop.org/fontconfig/fontconfig/tree/master/fc-lang
+// by convert utility from https://github.com/virxkane/freetype_textdraw
+
+#ifndef FC_LANG_DATA_H
+#define FC_LANG_DATA_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FC_LANG_DATA_SZ  $count
+
+struct fc_lang_rec
+{
+	const char* lang_code;
+	const unsigned int char_set_sz;
+	const unsigned int* char_set;
+};
+
+/**
+ * \@brief Return pointer to FontConfig database of language orthographies
+ * \@return array of fc_lang_rec records.
+ */
+const struct fc_lang_rec* get_fc_lang_data();
+
+/**
+ * \@brief Get count of records in the FontConfig database of language orthographies.
+ * \@return Count of records in array.
+ */
+unsigned int get_fc_lang_data_size();
+
+/**
+ * \@brief Find language in database by code
+ * \@param lang_code language code is exactly as it appears in the fc_lang catalog.
+ * \@return Pointer to fc_lang_rec instance if language found, NULL otherwise.
+ */
+const struct fc_lang_rec* fc_lang_find(const char* lang_code);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // FC_LANG_DATA_H
+EOF
+# Buffered write errors only surface at close, so check it.
+close($hdr_fh)
+	or die "Unable to write $cat_hdrfile: $!";
+
+# Create catalog source file.
+# It includes every generated <tag>_orth.c file, builds the fc_lang_data[]
+# table from them and defines the accessor functions declared in the header.
+open(my $src_fh, '>', $cat_srcfile)
+	or die "Unable to open $cat_srcfile: $!";
+print $src_fh <<EOF;
+// FontConfig database of language orthographies.
+// License: Public Domain.
+// This file is autogenerated from fc-lang database.
+// https://www.freedesktop.org/wiki/Software/fontconfig/
+// https://gitlab.freedesktop.org/fontconfig/fontconfig/tree/master/fc-lang
+// by convert utility from https://github.com/virxkane/freetype_textdraw
+
+#include <string.h>
+
+#include "$cat_hdrfile"
+
+EOF
+
+foreach my $c_src_name (@good_src_files) {
+    print $src_fh "#include \"${fc_lang_conv_dir}/${c_src_name}\"\n";
+}
+print $src_fh "\n";
+print $src_fh "static const struct fc_lang_rec fc_lang_data[] = {\n";
+foreach my $c_src_name (@good_src_files) {
+    # Every entry was built as "<tag>_orth.c"; refuse anything else rather
+    # than emit a record from a stale $1 (the table size must match
+    # FC_LANG_DATA_SZ, so silently skipping is not an option either).
+    $c_src_name =~ m/^(.*)_orth\.c$/
+        or die "Unexpected generated file name: $c_src_name";
+    my $langtag_lc = lc($1);
+    my $langtag_uc = uc($1);
+    print $src_fh "	\"${langtag_lc}\", ${langtag_uc}_LANG_ORTH_SZ, ${langtag_lc}_lang_orth_chars,\n";
+}
+print $src_fh "};\n";
+
+# fc_lang_find() walks the table with lang_ptr; the pointer is advanced
+# together with the index so that every record is compared, not only the
+# first one.
+print $src_fh <<EOF;
+
+const struct fc_lang_rec* get_fc_lang_data() {
+	return &fc_lang_data[0];
+}
+
+unsigned int get_fc_lang_data_size() {
+	return FC_LANG_DATA_SZ;
+}
+
+const struct fc_lang_rec* fc_lang_find(const char* lang_code) {
+	const struct fc_lang_rec* lang_ptr = fc_lang_data;
+	int i;
+	for (i = 0; i < FC_LANG_DATA_SZ; i++, lang_ptr++) {
+		if (strcmp(lang_ptr->lang_code, lang_code) == 0)
+			return lang_ptr;
+	}
+	return 0;
+}
+
+EOF
+
+# Buffered write errors only surface at close, so check it.
+close($src_fh)
+	or die "Unable to write $cat_srcfile: $!";
+
+1;
+
+# functions
+
+# Convert one orth file into a C source fragment.
+#
+# Arguments:
+#   $langtag    - language tag (orth file name without the ".orth" suffix)
+#   $orth_file  - path of the orth file to read
+#   $c_src_name - path of the C file to write
+#
+# The output file defines the array <langtag_lc>_lang_orth_chars[] and the
+# macro <LANGTAG_UC>_LANG_ORTH_SZ holding the number of array elements.
+#
+# Returns 1 when at least one array element was written, undef otherwise
+# (including when a file cannot be opened or the output cannot be written).
+sub processFile($$$) {
+    my ($langtag, $orth_file, $c_src_name) = @_;
+
+    my ($fin, $fout);
+    if (!open($fin, '<', $orth_file)) {
+        print STDERR "Can't open file \"${orth_file}\" for reading: $!\n";
+        return undef;
+    }
+    if (!open($fout, '>', $c_src_name)) {
+        print STDERR "Can't open file \"${c_src_name}\" for writing: $!\n";
+        close($fin);
+        return undef;
+    }
+
+    # "include" lines are resolved relative to the directory of the orth
+    # file; fall back to the current directory when the path has none.
+    my $dirname = ".";
+    if ($orth_file =~ m/^(.*)\/[^\/]+\.orth$/) {
+        $dirname = $1;
+    }
+
+    print "processing orth-file for language tag \"${langtag}\"\n";
+    my $langtag_uc = uc($langtag);
+    my $langtag_lc = lc($langtag);
+
+    print $fout <<EOF;
+
+// This file is autogenerated from fc-lang database.
+// https://www.freedesktop.org/wiki/Software/fontconfig/
+// https://gitlab.freedesktop.org/fontconfig/fontconfig/tree/master/fc-lang
+// by convert utility from https://github.com/virxkane/freetype_textdraw
+
+const unsigned int ${langtag_lc}_lang_orth_chars[] = {
+EOF
+    my $count = parseFile($dirname, $fin, $fout);
+    print $fout "};\n";
+    print $fout "#define ${langtag_uc}_LANG_ORTH_SZ	$count\n";
+
+    close($fin);
+    # Buffered write errors only surface at close; a file that could not be
+    # written completely must not end up in the catalog.
+    if (!close($fout)) {
+        print STDERR "Failed to write file \"${c_src_name}\": $!\n";
+        return undef;
+    }
+    return $count > 0 ? 1 : undef;
+}
+
+
+# Parse an orth file and write its code points as C array initializers.
+#
+# Arguments:
+#   $dirname - directory used to resolve "include <file>" lines
+#   $fin     - input handle of the orth file (read up to EOF)
+#   $fout    - output handle that receives the generated lines
+#
+# Recognised lines (after removing '#' comments and surrounding spaces):
+#   include <file>         - parse <file> recursively into the same output
+#   <hex>                  - single code point, written as one element
+#   <hex> - <hex>          - range, written as CODE_IN_RANGE, first, last
+#   <hex> .. <hex>         - same as above
+# Numbers may carry a "0x" prefix and may be followed by free text.
+# Anything else is reported on STDERR as "Failed to parse line".
+#
+# Returns the number of array elements written (1 per single code point,
+# 3 per range, plus the elements of all included files).
+sub parseFile($$$) {
+    my ($dirname, $fin, $fout) = @_;
+
+    my $count = 0;
+
+    # Pass 1: drop comments, surrounding whitespace and empty lines.
+    my @stripped = ();
+    foreach my $raw (<$fin>) {
+        my $text = $raw;
+        chomp($text);
+        $text =~ s/#.*$//;      # full-line or trailing comment
+        $text =~ s/^\s+//;
+        $text =~ s/\s+$//;      # also removes a stray "\r"
+        push @stripped, $text if length($text) > 0;
+    }
+
+    # Pass 2: combine 2 lines into one when the second starts with '-'.
+    # See file mni.orth
+    # 1: 0964
+    # 2: - 09c4
+    # A leading '-' line without a predecessor is dropped.
+    # NOTE(review): upstream fontconfig may treat a leading '-' as
+    # "exclude this code point" rather than as a continuation - confirm.
+    my @lines = ();
+    foreach my $text (@stripped) {
+        if ($text =~ m/^\s*-\s*[0-9a-fA-F]+.*$/) {
+            if (scalar(@lines) > 0) {
+                my $prev = pop @lines;
+                push @lines, $prev . $text;
+            }
+        } else {
+            push @lines, $text;
+        }
+    }
+
+    # Pass 3: translate every line.
+    my $hex = qr/(?:0x)*([0-9a-fA-F]+)/;
+    foreach my $line (@lines) {
+        my $ok;
+        if ($line =~ m/^include\s+(.*)$/) {
+            my $incFileName = "${dirname}/$1";
+            if (open(my $newfin, '<', $incFileName)) {
+                print "process included file: \"${incFileName}\"...\n";
+                my $inc_count = parseFile($dirname, $newfin, $fout);
+                close($newfin);
+                if ($inc_count > 0) {
+                    $ok = 1;
+                    $count += $inc_count;
+                }
+            } else {
+                print STDERR "Unable to open ${incFileName}\n";
+            }
+        } elsif ($line =~ m/^$hex\s*(?:-|\.\.)\s*$hex(?:\s+.*)?$/) {
+            # Range, optionally followed by a comment without symbol '#'.
+            # Must be tested before the single-number form: otherwise
+            # "0041 - 005a" would match as "0041" plus trailing text.
+            printf {$fout} ("\t0x%08x, 0x%04x, 0x%04x,	// range\n", CODE_IN_RANGE, hex($1), hex($2));
+            $count += 3;
+            $ok = 1;
+        } elsif ($line =~ m/^$hex(?:\s+.*)?$/) {
+            # One number, optionally followed by a comment without symbol '#'.
+            printf {$fout} ("\t0x%04x,\n", hex($1));
+            $count++;
+            $ok = 1;
+        }
+        # Anything else is just a comment without symbol '#'.
+        if (!$ok) {
+            print STDERR "Failed to parse line: ${line}\n";
+        }
+    }
+    return $count;
+}