Skip to content

Commit

Permalink
Create the IICore CJK subset font
Browse files Browse the repository at this point in the history
  • Loading branch information
Satish B committed Dec 12, 2021
1 parent a7f22fe commit a2f281b
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 51 deletions.
31 changes: 20 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,15 @@ Fonts are merged/combined as per the regions defined in the [Unicode Standard
(pdf)](https://www.unicode.org/versions/Unicode14.0.0/UnicodeStandard-14.0.pdf). Chapter numbers
refer to that spec.

| Regional font | Coverage |
|----------------------------|--------------------------------------------------------------------------------------|
| GoNotoEuropeAmericas.ttf | "Europe" - ch. 7, 8 and "Americas" - ch 20 |
| GoNotoAfricaMiddleEast.ttf | "Middle East" - ch. 9, 10, 11 and "Africa" - ch. 19 |
| GoNotoSouthAsia.ttf | "South and Central Asia" - ch. 12 and 13 |
| GoNotoAsiaHistorical.ttf | "South and Central Asia" - ch. 14 and 15 |
| GoNotoSouthEastAsia.ttf | "Southeast Asia" - ch. 16 and "Indonesia and Oceania" - ch 17 |
| GoNotoEastAsia.ttf | "East Asia" - ch 18. everything other than Han (CJK) |
| GoNotoCJK.ttf | [Noto CJK](https://github.com/googlefonts/noto-cjk/blob/main/Sans/README-formats.md) |

| Regional font | Coverage |
|----------------------------|-------------------------------------------------------------------------------------------|
| GoNotoEuropeAmericas.ttf | "Europe" - ch. 7, 8 and "Americas" - ch 20 |
| GoNotoAfricaMiddleEast.ttf | "Middle East" - ch. 9, 10, 11 and "Africa" - ch. 19 |
| GoNotoSouthAsia.ttf | "South and Central Asia" - ch. 12 and 13 |
| GoNotoAsiaHistorical.ttf | "South and Central Asia" - ch. 14 and 15 |
| GoNotoSouthEastAsia.ttf | "Southeast Asia" - ch. 16 and "Indonesia and Oceania" - ch 17 |
| GoNotoEastAsia.ttf | "East Asia" - ch 18. everything other than Han (CJK) |
| GoNotoCJKCore2003.ttf | [Unicode IICore][1] subset of CJK (~10K ideographs). See [Noto CJK][2] for full coverage |

Each of the above fonts includes LGC (Latin-Greek-Cyrillic) as default, same coverage as `Noto Sans
Regular`. Each one also includes Noto Sans Math, Noto Sans Symbols and Noto Sans Symbols 2 to give
Expand Down Expand Up @@ -112,8 +111,15 @@ Tibetan, Lisu, Marchen, Miao, Yi, etc. excluding Han/CJK (Chinese-Japanese-Korea

Mongolian, Nushu and Tangut could not be included.

## Font Statistics
### Go Noto CJK Core 2003

[Unicode IICore][1] is a minimal subset of CJK specified in 2003 for memory-constrained systems. It
standardizes about 9800 codepoints. The generated font has about 20000 glyphs.

More recently, [Unihan Core 2020](https://unicode.org/charts/unihan.html) expanded the minimal
subset to about 20000 codepoints.

## Font Statistics

| Regional font | Code blocks | Codepoints | Glyphs |
|----------------------------|-------------|------------|--------|
Expand Down Expand Up @@ -205,3 +211,6 @@ DA > BA > TA > PA > TSA
--unicodes=U+0F00-0F8C,U+0FAD,U+0FB1,U+0FB2,U+0FBE-0FDA
Basic minimal set 168 codepoints, 839 glyphs, 234KB, 541 GSUB
-->

[1]: https://en.wikipedia.org/wiki/International_Ideographs_Core
[2]: https://github.com/googlefonts/noto-cjk/
108 changes: 68 additions & 40 deletions run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,49 @@

[[ -z "$VIRTUAL_ENV" ]] && echo "Refusing to run outside of venv. See README.md." && exit 1

python3 -m pip install fonttools
main() {
python3 -m pip install fonttools

if [[ ! -d nototools ]]; then
git clone --depth 1 https://github.com/googlefonts/nototools
else
echo "Re-using existing clone of nototools."
fi
if [[ ! -d nototools ]]; then
git clone --depth 1 https://github.com/googlefonts/nototools
else
echo "Re-using existing clone of nototools."
fi

# Patch merge_fonts.py to be callable by external script
cd nototools/
if ! git apply --reverse --check ../merge_fonts.patch 2> /dev/null; then
echo "applying patch."
git apply ../merge_fonts.patch
else
echo "patch already applied."
fi
cd "$OLDPWD"

# Patch merge_fonts.py to be callable by external script
cd nototools/
if ! git apply --reverse --check ../merge_fonts.patch 2> /dev/null; then
echo "applying patch"
git apply ../merge_fonts.patch
else
echo "patch already applied"
fi
cd "$OLDPWD"
subset_tibetan

declare -a fonts=(
GoNotoAfricaMiddleEast.ttf
GoNotoSouthAsia.ttf
GoNotoAsiaHistorical.ttf
GoNotoSouthEastAsia.ttf
GoNotoEastAsia.ttf
GoNotoEuropeAmericas.ttf
)

for font in "${fonts[@]}"; do
if [[ -e "$font" ]]; then
echo "Not overwriting existing font $font."
continue
fi
echo "Generating font $font. Current time: $(date).\n"
mkdir -p cached_fonts
time PYTHONPATH="nototools/nototools" python3 generate.py -o "$font" -d cached_fonts
edit_font_info "$font"
done

create_cjk_subset
}

# Create a Tibetan subset so that GSUB does not overflow.
subset_tibetan() {
Expand All @@ -28,46 +54,48 @@ subset_tibetan() {
wget https://github.com/googlefonts/noto-fonts/raw/main/hinted/ttf/NotoSerifTibetan/NotoSerifTibetan-Regular.ttf
fi
echo "Creating a smaller subset of Tibetan glyphs..."
$VIRTUAL_ENV/bin/pyftsubset NotoSerifTibetan-Regular.ttf --output-file=NotoSerifTibetanSubset-Regular.ttf \
"$VIRTUAL_ENV"/bin/pyftsubset NotoSerifTibetan-Regular.ttf --output-file=NotoSerifTibetanSubset-Regular.ttf \
--unicodes=U+0F00-0F8C,U+0F90,U+0F92,U+0F94,U+0F99,U+0F9F,U+0FA4,U+0FA9,U+0FAD,U+0FB1-0FB3,U+0FBA-0FDA
fi
cd "$OLDPWD"
}

subset_tibetan

# Rename "Noto Sans" to "Go Noto Whatever"
edit_font_info() {
local fontname="$1"
local without_spaces="${fontname%%.*}"
local with_spaces=$(echo "$without_spaces" | sed -E 's/([a-z])([A-Z])/\1 \2/g')
local xml_file="$without_spaces".ttx
local xml_file_bak="$xml_file".bak
echo "Editing font metadata for $fontname"
$VIRTUAL_ENV/bin/ttx -o "$xml_file" "$fontname" 2> /dev/null
[[ $? -ne 0 ]] && echo "ERROR: Could not dump $fontname to xml" && return 1
echo "Editing font metadata for $fontname..."
"$VIRTUAL_ENV"/bin/ttx -o "$xml_file" "$fontname" 2> /dev/null
[[ $? -ne 0 ]] && echo "ERROR: Could not dump $fontname to xml." && return 1
sed -e "s/Noto Sans/$with_spaces/g" -e "s/NotoSans/$without_spaces/g" "$xml_file" > "$xml_file_bak"
mv "$xml_file_bak" "$xml_file"
$VIRTUAL_ENV/bin/ttx -o "$fontname" "$xml_file" 2> /dev/null
[[ $? -ne 0 ]] && echo "ERROR: Could not dump xml to $fontname" && return 2
"$VIRTUAL_ENV"/bin/ttx -o "$fontname" "$xml_file" 2> /dev/null
[[ $? -ne 0 ]] && echo "ERROR: Could not dump xml to $fontname." && return 2
rm -f "$xml_file"
}

declare -a fonts=(
GoNotoAfricaMiddleEast.ttf
GoNotoSouthAsia.ttf
GoNotoAsiaHistorical.ttf
GoNotoSouthEastAsia.ttf
GoNotoEastAsia.ttf
GoNotoEuropeAmericas.ttf
)
# Unicode IICore 2003 is a small subset of CJK (~10k codepoints).
# It has recently been superseded by UnihanCore2020.
create_cjk_subset() {
local fontname=GoNotoCJKCore2003.otf

for font in "${fonts[@]}"; do
if [[ -e "$font" ]]; then
echo "Not overwriting existing font $font"
continue
cd cached_fonts/
[[ ! -e Unihan.zip ]] && wget https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip
python3 -m zipfile -e Unihan.zip .
grep kIICore Unihan_IRGSources.txt | cut -f1 > unicode_points.txt
if [[ ! -e NotoSansCJKsc-Regular.otf ]]; then
wget https://github.com/googlefonts/noto-cjk/raw/main/Sans/OTF/SimplifiedChinese/NotoSansCJKsc-Regular.otf
fi
printf "Generating font $font. Current time: $(date)\n"
mkdir -p cached_fonts
time PYTHONPATH="nototools/nototools" python3 generate.py -o "$font" -d cached_fonts
edit_font_info "$font"
done
cd "$OLDPWD"
echo "Generating font $fontname."
"$VIRTUAL_ENV"/bin/pyftsubset cached_fonts/NotoSansCJKsc-Regular.otf \
--unicodes-file=cached_fonts/unicode_points.txt \
--output-file="$fontname"
edit_font_info "$fontname"
}

# execution starts here
main

0 comments on commit a2f281b

Please sign in to comment.