Blame - fs/unicode/utf8-selftest.c - linux-5.10

blob: 80752013fce07e2e7ff770903d275ee43d2576c6 [file] [log] [blame]

Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	1	/*
				2	* Kernel module for testing utf-8 support.
				3	*
				4	* Copyright 2017 Collabora Ltd.
				5	*
				6	* This software is licensed under the terms of the GNU General Public
				7	* License version 2, as published by the Free Software Foundation, and
				8	* may be copied, distributed, and modified under those terms.
				9	*
				10	* This program is distributed in the hope that it will be useful,
				11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				13	* GNU General Public License for more details.
				14	*/
				15
				16	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				17
				18	#include <linux/module.h>
				19	#include <linux/printk.h>
				20	#include <linux/unicode.h>
				21	#include <linux/dcache.h>
				22
				23	#include "utf8n.h"
				24
				25	unsigned int failed_tests;
				26	unsigned int total_tests;
				27
				28	/* Tests will be based on this version. */
Gabriel Krisman Bertazi	1215d23	2019-04-25 13:59:17 -0400	[diff] [blame]	29	#define latest_maj 12
				30	#define latest_min 1
Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	31	#define latest_rev 0
				32
				33	#define _test(cond, func, line, fmt, ...) do { \
				34	total_tests++; \
				35	if (!cond) { \
				36	failed_tests++; \
				37	pr_err("test %s:%d Failed: %s%s", \
				38	func, line, #cond, (fmt?":":".")); \
				39	if (fmt) \
				40	pr_err(fmt, ##__VA_ARGS__); \
				41	} \
				42	} while (0)
				43	#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
				44	#define test(cond) _test(cond, __func__, __LINE__, "")
				45
				46	const static struct {
				47	/* UTF-8 strings in this vector _must_ be NULL-terminated. */
				48	unsigned char str[10];
				49	unsigned char dec[10];
				50	} nfdi_test_data[] = {
				51	/* Trivial sequence */
				52	{
				53	/* "ABba" decomposes to itself */
				54	.str = "aBba",
				55	.dec = "aBba",
				56	},
				57	/* Simple equivalent sequences */
				58	{
				59	/* 'VULGAR FRACTION ONE QUARTER' cannot decompose to
				60	'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
				61	canonical decomposition */
				62	.str = {0xc2, 0xbc, 0x00},
				63	.dec = {0xc2, 0xbc, 0x00},
				64	},
				65	{
				66	/* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
				67	'LETTER A' + 'COMBINING DIAERESIS' */
				68	.str = {0xc3, 0xa4, 0x00},
				69	.dec = {0x61, 0xcc, 0x88, 0x00},
				70	},
				71	{
				72	/* 'LATIN SMALL LETTER LJ' can't decompose to
				73	'LETTER L' + 'LETTER J' on canonical decomposition */
				74	.str = {0xC7, 0x89, 0x00},
				75	.dec = {0xC7, 0x89, 0x00},
				76	},
				77	{
				78	/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
				79	.str = {0xCE, 0x87, 0x00},
				80	.dec = {0xC2, 0xB7, 0x00}
				81	},
				82	/* Canonical ordering */
				83	{
				84	/* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
				85	to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */
				86	.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0},
				87	.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0},
				88	},
				89	{
				90	/* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
				91	decomposes to
				92	'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */
				93	.str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00},
				94
				95	.dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00},
				96	},
				97
				98	};
				99
				100	const static struct {
				101	/* UTF-8 strings in this vector _must_ be NULL-terminated. */
				102	unsigned char str[30];
				103	unsigned char ncf[30];
				104	} nfdicf_test_data[] = {
				105	/* Trivial sequences */
				106	{
				107	/* "ABba" folds to lowercase */
				108	.str = {0x41, 0x42, 0x62, 0x61, 0x00},
				109	.ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
				110	},
				111	{
				112	/* All ASCII folds to lower-case */
				113	.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
				114	.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
				115	},
				116	{
				117	/* LATIN SMALL LETTER SHARP S folds to
				118	LATIN SMALL LETTER S + LATIN SMALL LETTER S */
				119	.str = {0xc3, 0x9f, 0x00},
				120	.ncf = {0x73, 0x73, 0x00},
				121	},
				122	{
				123	/* LATIN CAPITAL LETTER A WITH RING ABOVE folds to
				124	LATIN SMALL LETTER A + COMBINING RING ABOVE */
				125	.str = {0xC3, 0x85, 0x00},
				126	.ncf = {0x61, 0xcc, 0x8a, 0x00},
				127	},
				128	/* Introduced by UTF-8.0.0. */
				129	/* Cherokee letters are interesting test-cases because they fold
				130	to upper-case. Before 8.0.0, Cherokee lowercase were
				131	undefined, thus, the folding from LC is not stable between
				132	7.0.0 -> 8.0.0, but it is from UC. */
				133	{
				134	/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
				135	.str = {0xea, 0xad, 0xb0, 0x00},
				136	.ncf = {0xe1, 0x8e, 0xa0, 0x00},
				137	},
				138	{
				139	/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
				140	.str = {0xe1, 0x8f, 0xb8, 0x00},
				141	.ncf = {0xe1, 0x8f, 0xb0, 0x00},
				142	},
				143	{
				144	/* OLD HUNGARIAN CAPITAL LETTER AMB folds to
				145	OLD HUNGARIAN SMALL LETTER AMB */
				146	.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
				147	.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
				148	},
				149	/* Introduced by UTF-9.0.0. */
				150	{
				151	/* OSAGE CAPITAL LETTER CHA folds to
				152	OSAGE SMALL LETTER CHA */
				153	.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
				154	.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
				155	},
				156	{
				157	/* LATIN CAPITAL LETTER SMALL CAPITAL I folds to
				158	LATIN LETTER SMALL CAPITAL I */
				159	.str = {0xea, 0x9e, 0xae, 0x00},
				160	.ncf = {0xc9, 0xaa, 0x00},
				161	},
				162	/* Introduced by UTF-11.0.0. */
				163	{
				164	/* GEORGIAN SMALL LETTER AN folds to GEORGIAN MTAVRULI
				165	CAPITAL LETTER AN */
				166	.str = {0xe1, 0xb2, 0x90, 0x00},
				167	.ncf = {0xe1, 0x83, 0x90, 0x00},
				168	}
				169	};
				170
				171	static void check_utf8_nfdi(void)
				172	{
				173	int i;
				174	struct utf8cursor u8c;
				175	const struct utf8data *data;
				176
				177	data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev));
				178	if (!data) {
				179	pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
				180	__func__, latest_maj, latest_min, latest_rev);
				181	return;
				182	}
				183
				184	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
				185	int len = strlen(nfdi_test_data[i].str);
				186	int nlen = strlen(nfdi_test_data[i].dec);
				187	int j = 0;
				188	unsigned char c;
				189
				190	test((utf8len(data, nfdi_test_data[i].str) == nlen));
				191	test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen));
				192
				193	if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0)
				194	pr_err("can't create cursor\n");
				195
				196	while ((c = utf8byte(&u8c)) > 0) {
				197	test_f((c == nfdi_test_data[i].dec[j]),
				198	"Unexpected byte 0x%x should be 0x%x\n",
				199	c, nfdi_test_data[i].dec[j]);
				200	j++;
				201	}
				202
				203	test((j == nlen));
				204	}
				205	}
				206
				207	static void check_utf8_nfdicf(void)
				208	{
				209	int i;
				210	struct utf8cursor u8c;
				211	const struct utf8data *data;
				212
				213	data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev));
				214	if (!data) {
				215	pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
				216	__func__, latest_maj, latest_min, latest_rev);
				217	return;
				218	}
				219
				220	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
				221	int len = strlen(nfdicf_test_data[i].str);
				222	int nlen = strlen(nfdicf_test_data[i].ncf);
				223	int j = 0;
				224	unsigned char c;
				225
				226	test((utf8len(data, nfdicf_test_data[i].str) == nlen));
				227	test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen));
				228
				229	if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0)
				230	pr_err("can't create cursor\n");
				231
				232	while ((c = utf8byte(&u8c)) > 0) {
				233	test_f((c == nfdicf_test_data[i].ncf[j]),
				234	"Unexpected byte 0x%x should be 0x%x\n",
				235	c, nfdicf_test_data[i].ncf[j]);
				236	j++;
				237	}
				238
				239	test((j == nlen));
				240	}
				241	}
				242
				243	static void check_utf8_comparisons(void)
				244	{
				245	int i;
Gabriel Krisman Bertazi	1215d23	2019-04-25 13:59:17 -0400	[diff] [blame]	246	struct unicode_map *table = utf8_load("12.1.0");
Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	247
				248	if (IS_ERR(table)) {
				249	pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
				250	__func__, latest_maj, latest_min, latest_rev);
				251	return;
				252	}
				253
				254	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
				255	const struct qstr s1 = {.name = nfdi_test_data[i].str,
				256	.len = sizeof(nfdi_test_data[i].str)};
				257	const struct qstr s2 = {.name = nfdi_test_data[i].dec,
				258	.len = sizeof(nfdi_test_data[i].dec)};
				259
				260	test_f(!utf8_strncmp(table, &s1, &s2),
				261	"%s %s comparison mismatch\n", s1.name, s2.name);
				262	}
				263
				264	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
				265	const struct qstr s1 = {.name = nfdicf_test_data[i].str,
				266	.len = sizeof(nfdicf_test_data[i].str)};
				267	const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
				268	.len = sizeof(nfdicf_test_data[i].ncf)};
				269
				270	test_f(!utf8_strncasecmp(table, &s1, &s2),
				271	"%s %s comparison mismatch\n", s1.name, s2.name);
				272	}
				273
				274	utf8_unload(table);
				275	}
				276
				277	static void check_supported_versions(void)
				278	{
				279	/* Unicode 7.0.0 should be supported. */
				280	test(utf8version_is_supported(7, 0, 0));
				281
				282	/* Unicode 9.0.0 should be supported. */
				283	test(utf8version_is_supported(9, 0, 0));
				284
				285	/* Unicode 1x.0.0 (the latest version) should be supported. */
				286	test(utf8version_is_supported(latest_maj, latest_min, latest_rev));
				287
				288	/* Next versions don't exist. */
Gabriel Krisman Bertazi	1215d23	2019-04-25 13:59:17 -0400	[diff] [blame]	289	test(!utf8version_is_supported(13, 0, 0));
Gabriel Krisman Bertazi	f0d6cc0	2019-04-25 13:56:01 -0400	[diff] [blame]	290	test(!utf8version_is_supported(0, 0, 0));
				291	test(!utf8version_is_supported(-1, -1, -1));
				292	}
				293
				294	static int __init init_test_ucd(void)
				295	{
				296	failed_tests = 0;
				297	total_tests = 0;
				298
				299	check_supported_versions();
				300	check_utf8_nfdi();
				301	check_utf8_nfdicf();
				302	check_utf8_comparisons();
				303
				304	if (!failed_tests)
				305	pr_info("All %u tests passed\n", total_tests);
				306	else
				307	pr_err("%u out of %u tests failed\n", failed_tests,
				308	total_tests);
				309	return 0;
				310	}
				311
				312	static void __exit exit_test_ucd(void)
				313	{
				314	}
				315
				316	module_init(init_test_ucd);
				317	module_exit(exit_test_ucd);
				318
				319	MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
				320	MODULE_LICENSE("GPL");