Skip to content

Commit

Permalink
Conversion utility of the database of language orthographies rewritten…
Browse files Browse the repository at this point in the history
… in perl.
  • Loading branch information
virxkane committed Jul 7, 2021
1 parent a779a56 commit aeb8741
Show file tree
Hide file tree
Showing 262 changed files with 690 additions and 1,114 deletions.
23 changes: 2 additions & 21 deletions fc-lang_conv/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,8 @@
# NOTE(review): this text comes from a flattened diff view, so lines removed
# and lines added by the commit appear together here — confirm every
# statement against the real CMakeLists.txt before relying on it.

# Find includes in corresponding build directories
set(CMAKE_INCLUDE_CURRENT_DIR ON)

# Name of the executable target created by add_executable() below.
set(EXE_NAME fc-lang_conv)

# Source files shared by the targets below that reference ${SRC_LIST}.
set(SRC_LIST
main.c
fc-lang-data.c
)

# On Windows, getline_win32.c is appended to the source list.
if(WIN32)
set(SRC_LIST ${SRC_LIST}
getline_win32.c
)
endif(WIN32)

# Extra libraries to link into the executable (empty here).
set(LDADD_LIBS)
if(WIN32)
# -mconsole -> console application (with terminal screen)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mconsole")
endif(WIN32)

# Executable target built from SRC_LIST and linked against LDADD_LIBS.
add_executable(${EXE_NAME} WIN32 ${SRC_LIST})
target_link_libraries(${EXE_NAME} ${LDADD_LIBS})

# Static library built from fc-lang-cat.c.
add_library(fc-lang-cat STATIC fc-lang-cat.c)

# Generate update.sh in the build directory from the update.sh.cmake template;
# @ONLY limits substitution to @VAR@ style references.
configure_file(update.sh.cmake ${CMAKE_CURRENT_BINARY_DIR}/update.sh @ONLY)
# Static library built from the files listed in SRC_LIST.
add_library(fc-lang-data STATIC ${SRC_LIST})
323 changes: 323 additions & 0 deletions fc-lang_conv/conv.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
#!/usr/bin/perl

# Converts the FontConfig orthography database (*.orth files) into C sources:
# one "<lang>_orth.c" file per language plus a catalog header/source pair.

# Surface uninitialized values, suspicious constructs and the like.
# NOTE(review): 'use strict' should follow once every file-level variable in
# this script is declared; confirm before enabling it.
use warnings;

# Directory containing the FontConfig *.orth input files.
our $fc_lang_dir = "../fc-lang";
# Directory receiving the generated per-language C files.
our $fc_lang_conv_dir = "files";

# Names of the generated catalog source and header files.
our $cat_srcfile = "fc-lang-data.c";
our $cat_hdrfile = "fc-lang-data.h";

# By definition this is not a valid code point. It is written in front of the
# two bounds of a range in the generated arrays to mark "a range follows".
use constant CODE_IN_RANGE => 0xF0F0FFFF;

# Forward declarations: both subs are defined at the end of the file but are
# called (with prototype checking) from the top-level code.
sub processFile($$$);
sub parseFile($$$);

# Collect the names of all "*.orth" files found in the fc-lang directory,
# sorted by name so the generated catalog has a stable order.
# The error message names the directory and the OS error so a wrong working
# directory is easy to diagnose.
opendir(my $dirh, $fc_lang_dir)
    or die "Failed to open directory \"${fc_lang_dir}\": $!\n";
my @orth_files = sort(grep { m/\.orth$/ } readdir($dirh));
closedir($dirh);

# Convert every orthography file into "<langtag>_orth.c". Only files that
# were converted successfully are remembered (and counted) for the catalog.
$count = 0;
@good_src_files = ();
for my $orth_name (@orth_files) {
    # The language tag is the file name without its ".orth" extension.
    next unless $orth_name =~ m/^(.*)\.orth$/;
    my $langtag = $1;
    my $c_src_name = "${langtag}_orth.c";

    my $converted = processFile($langtag, "${fc_lang_dir}/${orth_name}", "${fc_lang_conv_dir}/${c_src_name}");
    next unless $converted;

    push @good_src_files, $c_src_name;
    $count++;
}

# Create catalog header file.
# Three-argument open with a lexical handle: the file name can never be
# misread as an open mode, and the handle does not leak into package scope.
open(my $hdr_fh, '>', $cat_hdrfile)
    or die "Unable to open $cat_hdrfile for writing: $!\n";
# Interpolating heredoc: $count becomes FC_LANG_DATA_SZ, and every '@' of the
# doxygen tags is escaped so Perl does not treat it as an array.
# The accessor prototypes use (void) so C compilers see real prototypes.
print {$hdr_fh} <<EOF;
// FontConfig database of language orthographies.
// License: Public Domain.
// This file is autogenerated from fc-lang database.
// https://www.freedesktop.org/wiki/Software/fontconfig/
// https://gitlab.freedesktop.org/fontconfig/fontconfig/tree/master/fc-lang
// by convert utility from https://github.com/virxkane/freetype_textdraw
#ifndef FC_LANG_DATA_H
#define FC_LANG_DATA_H
#ifdef __cplusplus
extern "C" {
#endif
#define FC_LANG_DATA_SZ $count
struct fc_lang_rec
{
    const char* lang_code;
    const unsigned int char_set_sz;
    const unsigned int* char_set;
};
/**
 * \@brief Return pointer to FontConfig database of language orthographies
 * \@return array of fc_lang_rec records.
 */
const struct fc_lang_rec* get_fc_lang_data(void);
/**
 * \@brief Get count of records in the FontConfig database of language orthographies.
 * \@return Count of records in array.
 */
unsigned int get_fc_lang_data_size(void);
/**
 * \@brief Find language in database by code
 * \@param lang_code language code is exactly as it appears in the fc_lang catalog.
 * \@return Pointer to fc_lang_rec instance if language found, NULL otherwise.
 */
const struct fc_lang_rec* fc_lang_find(const char* lang_code);
#ifdef __cplusplus
}
#endif
#endif // FC_LANG_DATA_H
EOF
# Buffered write errors (e.g. disk full) only surface when the file is closed.
close($hdr_fh) or die "Failed to write $cat_hdrfile: $!\n";

# Create catalog source file.
# It includes every generated "<lang>_orth.c", builds the fc_lang_data table
# from them and defines the three accessor functions declared in the header.
open(my $src_fh, '>', $cat_srcfile)
    or die "Unable to open $cat_srcfile for writing: $!\n";
print {$src_fh} <<EOF;
// FontConfig database of language orthographies.
// License: Public Domain.
// This file is autogenerated from fc-lang database.
// https://www.freedesktop.org/wiki/Software/fontconfig/
// https://gitlab.freedesktop.org/fontconfig/fontconfig/tree/master/fc-lang
// by convert utility from https://github.com/virxkane/freetype_textdraw
#include <string.h>
#include "$cat_hdrfile"
EOF

foreach my $c_src_name (@good_src_files) {
    print {$src_fh} "#include \"${fc_lang_conv_dir}/${c_src_name}\"\n";
}
print {$src_fh} "\n";
print {$src_fh} "static const struct fc_lang_rec fc_lang_data[] = {\n";
foreach my $c_src_name (@good_src_files) {
    # Recover the language tag from "<langtag>_orth.c". The dot is escaped and
    # the match is checked so a stale $1 can never end up in the table.
    my ($langtag) = $c_src_name =~ m/^(.*)_orth\.c$/
        or die "Unexpected generated file name \"${c_src_name}\"\n";
    my $langtag_lc = lc($langtag);
    my $langtag_uc = uc($langtag);
    # One brace-enclosed initializer per record.
    print {$src_fh} "    { \"${langtag_lc}\", ${langtag_uc}_LANG_ORTH_SZ, ${langtag_lc}_lang_orth_chars },\n";
}
print {$src_fh} "};\n";

# fc_lang_find() indexes the table with the loop counter, so every record is
# compared against lang_code, not only the first one.
print {$src_fh} <<EOF;
const struct fc_lang_rec* get_fc_lang_data() {
    return &fc_lang_data[0];
}
unsigned int get_fc_lang_data_size() {
    return FC_LANG_DATA_SZ;
}
const struct fc_lang_rec* fc_lang_find(const char* lang_code) {
    unsigned int i;
    if (lang_code == NULL)
        return NULL;
    for (i = 0; i < FC_LANG_DATA_SZ; i++) {
        if (strcmp(fc_lang_data[i].lang_code, lang_code) == 0)
            return &fc_lang_data[i];
    }
    return NULL;
}
EOF

# Buffered write errors (e.g. disk full) only surface when the file is closed.
close($src_fh) or die "Failed to write $cat_srcfile: $!\n";

1;

# functions

# processFile($langtag, $orth_file, $c_src_name)
#
# Converts one orthography file into a C source file that defines
#   const unsigned int <langtag>_lang_orth_chars[]   (lower-cased tag)
#   #define <LANGTAG>_LANG_ORTH_SZ <element count>   (upper-cased tag)
#
# Parameters:
#   $langtag    - language tag used to build the C identifiers
#   $orth_file  - path of the *.orth file to read
#   $c_src_name - path of the C file to create (overwritten if present)
#
# Returns 1 when at least one array element was written, undef otherwise
# (unreadable input, unwritable output, write error or no usable data).
sub processFile($$$) {
    my ($langtag, $orth_file, $c_src_name) = @_;

    # Three-argument open: the file name can never be taken for an open mode.
    my ($fin, $fout);
    if (!open($fin, '<', $orth_file)) {
        print STDERR "Can't open file \"${orth_file}\" for reading: $!\n";
        return undef;
    }
    if (!open($fout, '>', $c_src_name)) {
        print STDERR "Can't open file \"${c_src_name}\" for writing: $!\n";
        close($fin);
        return undef;
    }

    # "include" directives inside the orth file are resolved relative to the
    # directory of the orth file itself; fall back to the current directory
    # when the path has no directory part.
    my $dirname = ($orth_file =~ m{^(.*)/[^/]+$}) ? $1 : ".";

    print "processing orth-file for language tag \"${langtag}\"\n";
    my $langtag_uc = uc($langtag);
    my $langtag_lc = lc($langtag);

    print {$fout} <<EOF;
// This file is autogenerated from fc-lang database.
// https://www.freedesktop.org/wiki/Software/fontconfig/
// https://gitlab.freedesktop.org/fontconfig/fontconfig/tree/master/fc-lang
// by convert utility from https://github.com/virxkane/freetype_textdraw
const unsigned int ${langtag_lc}_lang_orth_chars[] = {
EOF
    my $count = parseFile($dirname, $fin, $fout);
    print {$fout} "};\n";
    print {$fout} "#define ${langtag_uc}_LANG_ORTH_SZ $count\n";

    close($fin);
    # Buffered write errors only surface at close time; a truncated file must
    # not be reported as a successful conversion.
    if (!close($fout)) {
        print STDERR "Failed to write file \"${c_src_name}\": $!\n";
        return undef;
    }
    return $count > 0 ? 1 : undef;
}


# parseFile($dirname, $fin, $fout)
#
# Reads orthography data from the open handle $fin and writes one line of C
# array initializers per entry to the open handle $fout.
#
# Input lines, after '#' comments and surrounding whitespace are removed:
#   include <file>          parse <file> (relative to $dirname) recursively
#   XXXX                    single code point (hex, optional 0x prefix)
#   XXXX-YYYY, XXXX..YYYY   range; whitespace around the separator is allowed
# Free text following an entry after whitespace is treated as a comment.
# A line starting with '-' continues the previous line (see mni.orth, where a
# range is split as "0964" / "- 09c4").
#
# Output: a single code point is written as one element; a range is written
# as three elements - the CODE_IN_RANGE marker followed by both bounds.
#
# Returns the number of array elements written (included files counted too).
# Lines that cannot be parsed are reported on STDERR and skipped.
sub parseFile($$$) {
    my ($dirname, $fin, $fout) = @_;

    # Pass 1: strip comments and whitespace, drop empty lines and join
    # continuation lines onto the entry they belong to.
    my @entries;
    while (defined(my $text = <$fin>)) {
        $text =~ s/#.*//s;          # comment (and the line ending after it)
        $text =~ s/^\s+|\s+$//g;    # leading/trailing whitespace, incl. CR/LF
        next if $text eq '';
        if (@entries && $text =~ m/^-\s*[0-9a-fA-F]+/) {
            $entries[-1] .= $text;
            next;
        }
        push @entries, $text;
    }

    # Pass 2: interpret the entries.
    # One hex number with an optional 0x prefix; captures the digits only.
    my $hex = qr/(?:0[xX])?([0-9a-fA-F]+)/;
    my $count = 0;
    foreach my $entry (@entries) {
        my $ok = 0;
        if ($entry =~ m/^include\s+(.*)$/) {
            my $incFileName = "${dirname}/$1";
            if (open(my $newfin, '<', $incFileName)) {
                print "process included file: \"${incFileName}\"...\n";
                my $inc_count = parseFile($dirname, $newfin, $fout);
                close($newfin);
                if ($inc_count > 0) {
                    $ok = 1;
                    $count += $inc_count;
                }
            } else {
                print STDERR "Unable to open ${incFileName}: $!\n";
            }
        } elsif ($entry =~ m/^$hex(?:\s*(?:-|\.\.)\s*$hex)?(?:\s.*)?$/) {
            # The optional range part is tried before the trailing free text,
            # so "0061 - 007a" is a range and not "0061" plus a comment.
            my ($first_digits, $second_digits) = ($1, $2);
            $ok = 1;
            if (defined($second_digits)) {
                printf {$fout} "\t0x%08x, 0x%04x, 0x%04x, // range\n", CODE_IN_RANGE, hex($first_digits), hex($second_digits);
                $count += 3;
            } else {
                printf {$fout} "\t0x%04x,\n", hex($first_digits);
                $count++;
            }
        }
        if (!$ok) {
            print STDERR "Failed to parse line: ${entry}\n";
        }
    }
    return $count;
}
Loading

0 comments on commit aeb8741

Please sign in to comment.