DB, Collections, Search
This commit is contained in:
250
src/lib/import/csv-parser.test.ts
Normal file
250
src/lib/import/csv-parser.test.ts
Normal file
@@ -0,0 +1,250 @@
|
||||
// Unit tests for the CSV importer (parseCSV) and the downloadable template
// (generateCSVTemplate). CSV fixtures are template literals, so the embedded
// newlines form multi-line CSV input.
import { describe, it, expect } from "vitest"
import { parseCSV, generateCSVTemplate } from "./csv-parser"

describe("parseCSV", () => {
  it("should parse valid CSV with all fields", () => {
    const csv = `simplified,traditional,pinyin,meaning,hsk_level,radical,frequency,pos,classifiers
爱好,愛好,ài hào,"to like; hobby","new-1,old-3",爫,4902,"n,v",个`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(result.imported).toBe(1)
    expect(result.failed).toBe(0)
    expect(data).toHaveLength(1)
    expect(data[0].simplified).toBe("爱好")
    expect(data[0].radical).toBe("爫")
    expect(data[0].frequency).toBe(4902)
    expect(data[0].hskLevels).toEqual(["new-1", "old-3"])
    expect(data[0].partsOfSpeech).toEqual(["n", "v"])
    expect(data[0].forms).toHaveLength(1)
    expect(data[0].forms[0].traditional).toBe("愛好")
    expect(data[0].forms[0].classifiers).toEqual(["个"])
  })

  it("should parse CSV with only required fields", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,good`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(result.imported).toBe(1)
    expect(data[0].simplified).toBe("好")
    // Optional columns come back as undefined / empty lists.
    expect(data[0].radical).toBeUndefined()
    expect(data[0].frequency).toBeUndefined()
    expect(data[0].hskLevels).toEqual([])
    expect(data[0].partsOfSpeech).toEqual([])
  })

  it("should parse multiple rows", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,good
爱,愛,ài,love
你,你,nǐ,you`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(result.imported).toBe(3)
    expect(data).toHaveLength(3)
  })

  it("should handle quoted values with commas", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,"good, fine, nice"`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    // A comma inside quotes is literal text, not a field separator.
    expect(data[0].forms[0].meanings[0].meaning).toBe("good, fine, nice")
  })

  it("should handle quoted values with semicolons (multiple meanings)", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,"good; fine; nice"`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    // Semicolons split one cell into several meanings.
    expect(data[0].forms[0].meanings).toHaveLength(3)
    expect(data[0].forms[0].meanings[0].meaning).toBe("good")
    expect(data[0].forms[0].meanings[1].meaning).toBe("fine")
    expect(data[0].forms[0].meanings[2].meaning).toBe("nice")
  })

  it("should handle escaped quotes in values", () => {
    // "" inside a quoted field is the CSV escape for a literal quote.
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,"He said ""good"""`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].forms[0].meanings[0].meaning).toBe('He said "good"')
  })

  it("should skip empty lines", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,good

爱,愛,ài,love

你,你,nǐ,you`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(result.imported).toBe(3)
  })

  it("should parse comma-separated HSK levels", () => {
    const csv = `simplified,traditional,pinyin,meaning,hsk_level
好,好,hǎo,good,"new-1,old-2,old-3"`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].hskLevels).toEqual(["new-1", "old-2", "old-3"])
  })

  it("should parse comma-separated parts of speech", () => {
    const csv = `simplified,traditional,pinyin,meaning,hsk_level,radical,frequency,pos
好,好,hǎo,good,,,,"adj,v,n"`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].partsOfSpeech).toEqual(["adj", "v", "n"])
  })

  it("should parse comma-separated classifiers", () => {
    const csv = `simplified,traditional,pinyin,meaning,hsk_level,radical,frequency,pos,classifiers
好,好,hǎo,good,,,,,"个,只,条"`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].forms[0].classifiers).toEqual(["个", "只", "条"])
  })

  it("should parse frequency as number", () => {
    const csv = `simplified,traditional,pinyin,meaning,hsk_level,radical,frequency
好,好,hǎo,good,,,1234`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].frequency).toBe(1234)
  })

  it("should return error for empty CSV", () => {
    const csv = ""

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(false)
    expect(result.errors).toHaveLength(1)
    // Empty input surfaces as a header validation failure.
    expect(result.errors[0].error).toContain("Invalid CSV headers")
  })

  it("should return error for invalid headers", () => {
    const csv = `wrong,headers
好,好`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(false)
    expect(result.errors).toHaveLength(1)
    expect(result.errors[0].error).toContain("Invalid CSV headers")
  })

  it("should return error for missing required fields", () => {
    // pinyin column is empty, which fails the row schema.
    const csv = `simplified,traditional,pinyin,meaning
好,好,,good`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(false)
    expect(result.failed).toBe(1)
    expect(result.errors).toHaveLength(1)
  })

  it("should continue parsing after errors", () => {
    // Row 2 is invalid (empty pinyin); rows 1 and 3 should still import.
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,good
爱,愛,,love
你,你,nǐ,you`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(false)
    expect(result.imported).toBe(2)
    expect(result.failed).toBe(1)
    expect(data).toHaveLength(2)
  })

  it("should set first form as default", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,good`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].forms[0].isDefault).toBe(true)
  })

  it("should create pinyin transcription", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,good`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].forms[0].transcriptions).toHaveLength(1)
    expect(data[0].forms[0].transcriptions[0].type).toBe("pinyin")
    expect(data[0].forms[0].transcriptions[0].value).toBe("hǎo")
  })

  it("should set language code to English", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,good`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].forms[0].meanings[0].languageCode).toBe("en")
  })

  it("should assign order indices to meanings", () => {
    const csv = `simplified,traditional,pinyin,meaning
好,好,hǎo,"good; fine; nice"`

    const { result, data } = parseCSV(csv)

    expect(result.success).toBe(true)
    expect(data[0].forms[0].meanings[0].orderIndex).toBe(0)
    expect(data[0].forms[0].meanings[1].orderIndex).toBe(1)
    expect(data[0].forms[0].meanings[2].orderIndex).toBe(2)
  })
})

describe("generateCSVTemplate", () => {
  it("should generate valid CSV template", () => {
    const template = generateCSVTemplate()

    expect(template).toContain("simplified,traditional,pinyin,meaning")
    expect(template).toContain("爱好,愛好,ài hào")

    const lines = template.split("\n")
    expect(lines).toHaveLength(2) // Header + example
  })

  it("should have parseable template", () => {
    // The template must round-trip through the importer.
    const template = generateCSVTemplate()

    const { result } = parseCSV(template)

    expect(result.success).toBe(true)
    expect(result.imported).toBe(1)
  })
})
|
||||
249
src/lib/import/csv-parser.ts
Normal file
249
src/lib/import/csv-parser.ts
Normal file
@@ -0,0 +1,249 @@
|
||||
import { z } from "zod"
|
||||
import type {
|
||||
CSVRow,
|
||||
ParsedHanzi,
|
||||
ImportResult,
|
||||
ImportError,
|
||||
} from "./types"
|
||||
|
||||
/**
 * Zod schema for one CSV data row after header/value zipping.
 *
 * The first four columns are mandatory and must be non-empty; the rest are
 * optional. Every cell is kept as a string here — numeric conversion (e.g.
 * frequency) happens later in transformCSVRow.
 */
const CSVRowSchema = z.object({
  simplified: z.string().min(1),
  traditional: z.string().min(1),
  pinyin: z.string().min(1),
  meaning: z.string().min(1),
  hsk_level: z.string().optional(),   // comma-separated list, e.g. "new-1,old-3"
  radical: z.string().optional(),
  frequency: z.string().optional(),   // numeric string; parsed later
  pos: z.string().optional(),         // comma-separated part-of-speech tags
  classifiers: z.string().optional(), // comma-separated measure words
})
|
||||
|
||||
/**
|
||||
* Parse CSV format
|
||||
* Expected format:
|
||||
* simplified,traditional,pinyin,meaning,hsk_level,radical,frequency,pos,classifiers
|
||||
*/
|
||||
export function parseCSV(csvString: string): {
|
||||
result: ImportResult
|
||||
data: ParsedHanzi[]
|
||||
} {
|
||||
const errors: ImportError[] = []
|
||||
const parsed: ParsedHanzi[] = []
|
||||
const lines = csvString.trim().split("\n")
|
||||
|
||||
if (lines.length === 0) {
|
||||
return {
|
||||
result: {
|
||||
success: false,
|
||||
imported: 0,
|
||||
failed: 0,
|
||||
errors: [{ error: "Empty CSV file" }],
|
||||
},
|
||||
data: [],
|
||||
}
|
||||
}
|
||||
|
||||
// Parse header
|
||||
const headerLine = lines[0]
|
||||
const headers = parseCSVLine(headerLine)
|
||||
|
||||
if (!validateHeaders(headers)) {
|
||||
return {
|
||||
result: {
|
||||
success: false,
|
||||
imported: 0,
|
||||
failed: 0,
|
||||
errors: [{
|
||||
error: `Invalid CSV headers. Expected at least: simplified,traditional,pinyin,meaning. Got: ${headers.join(",")}`,
|
||||
}],
|
||||
},
|
||||
data: [],
|
||||
}
|
||||
}
|
||||
|
||||
// Parse data rows
|
||||
for (let i = 1; i < lines.length; i++) {
|
||||
const line = lines[i].trim()
|
||||
if (!line) continue // Skip empty lines
|
||||
|
||||
try {
|
||||
const values = parseCSVLine(line)
|
||||
const row = parseCSVRow(headers, values)
|
||||
const validationResult = CSVRowSchema.safeParse(row)
|
||||
|
||||
if (!validationResult.success) {
|
||||
throw new Error(
|
||||
validationResult.error.errors
|
||||
.map(e => `${e.path.join(".")}: ${e.message}`)
|
||||
.join(", ")
|
||||
)
|
||||
}
|
||||
|
||||
const parsedEntry = transformCSVRow(validationResult.data)
|
||||
parsed.push(parsedEntry)
|
||||
} catch (error) {
|
||||
const simplified = line.split(",")[0] || "unknown"
|
||||
errors.push({
|
||||
line: i + 1,
|
||||
character: simplified,
|
||||
error: error instanceof Error ? error.message : "Unknown error",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
result: {
|
||||
success: errors.length === 0,
|
||||
imported: parsed.length,
|
||||
failed: errors.length,
|
||||
errors,
|
||||
},
|
||||
data: parsed,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a CSV line handling quoted values
|
||||
*/
|
||||
function parseCSVLine(line: string): string[] {
|
||||
const values: string[] = []
|
||||
let current = ""
|
||||
let inQuotes = false
|
||||
|
||||
for (let i = 0; i < line.length; i++) {
|
||||
const char = line[i]
|
||||
const nextChar = line[i + 1]
|
||||
|
||||
if (char === '"') {
|
||||
if (inQuotes && nextChar === '"') {
|
||||
// Escaped quote
|
||||
current += '"'
|
||||
i++
|
||||
} else {
|
||||
// Toggle quote state
|
||||
inQuotes = !inQuotes
|
||||
}
|
||||
} else if (char === "," && !inQuotes) {
|
||||
// End of field
|
||||
values.push(current.trim())
|
||||
current = ""
|
||||
} else {
|
||||
current += char
|
||||
}
|
||||
}
|
||||
|
||||
// Add last field
|
||||
values.push(current.trim())
|
||||
|
||||
return values
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate CSV headers
|
||||
*/
|
||||
function validateHeaders(headers: string[]): boolean {
|
||||
const required = ["simplified", "traditional", "pinyin", "meaning"]
|
||||
return required.every(h => headers.includes(h))
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert CSV values array to row object
|
||||
*/
|
||||
function parseCSVRow(headers: string[], values: string[]): CSVRow {
|
||||
const row: any = {}
|
||||
headers.forEach((header, index) => {
|
||||
const value = values[index]?.trim()
|
||||
if (value) {
|
||||
row[header] = value
|
||||
}
|
||||
})
|
||||
return row as CSVRow
|
||||
}
|
||||
|
||||
/**
|
||||
* Transform CSV row to ParsedHanzi format
|
||||
*/
|
||||
function transformCSVRow(row: CSVRow): ParsedHanzi {
|
||||
// Parse HSK levels (comma-separated)
|
||||
const hskLevels = row.hsk_level
|
||||
? row.hsk_level.split(",").map(l => l.trim())
|
||||
: []
|
||||
|
||||
// Parse parts of speech (comma-separated)
|
||||
const partsOfSpeech = row.pos
|
||||
? row.pos.split(",").map(p => p.trim())
|
||||
: []
|
||||
|
||||
// Parse frequency
|
||||
const frequency = row.frequency
|
||||
? parseInt(row.frequency, 10)
|
||||
: undefined
|
||||
|
||||
// Parse classifiers (comma-separated)
|
||||
const classifiers = row.classifiers
|
||||
? row.classifiers.split(",").map(c => c.trim())
|
||||
: []
|
||||
|
||||
// Parse meanings (semicolon-separated)
|
||||
const meanings = row.meaning.split(";").map((m, index) => ({
|
||||
languageCode: "en",
|
||||
meaning: m.trim(),
|
||||
orderIndex: index,
|
||||
}))
|
||||
|
||||
return {
|
||||
simplified: row.simplified,
|
||||
radical: row.radical,
|
||||
frequency,
|
||||
hskLevels,
|
||||
partsOfSpeech,
|
||||
forms: [
|
||||
{
|
||||
traditional: row.traditional,
|
||||
isDefault: true,
|
||||
transcriptions: [
|
||||
{
|
||||
type: "pinyin",
|
||||
value: row.pinyin,
|
||||
},
|
||||
],
|
||||
meanings,
|
||||
classifiers,
|
||||
},
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate CSV template
|
||||
*/
|
||||
export function generateCSVTemplate(): string {
|
||||
const headers = [
|
||||
"simplified",
|
||||
"traditional",
|
||||
"pinyin",
|
||||
"meaning",
|
||||
"hsk_level",
|
||||
"radical",
|
||||
"frequency",
|
||||
"pos",
|
||||
"classifiers",
|
||||
]
|
||||
|
||||
const example = [
|
||||
"爱好",
|
||||
"愛好",
|
||||
"ài hào",
|
||||
"to like; hobby",
|
||||
"new-1,old-3",
|
||||
"爫",
|
||||
"4902",
|
||||
"n,v",
|
||||
"个",
|
||||
]
|
||||
|
||||
return [headers.join(","), example.join(",")].join("\n")
|
||||
}
|
||||
300
src/lib/import/hsk-json-parser.test.ts
Normal file
300
src/lib/import/hsk-json-parser.test.ts
Normal file
@@ -0,0 +1,300 @@
|
||||
// Unit tests for the HSK JSON importer (parseHSKJson) and the single-entry
// validator (validateHSKJsonEntry). Fixtures are built with JSON.stringify
// so they stay valid JSON by construction.
import { describe, it, expect } from "vitest"
import { parseHSKJson, validateHSKJsonEntry } from "./hsk-json-parser"

describe("parseHSKJson", () => {
  it("should parse valid single JSON entry", () => {
    const json = JSON.stringify({
      simplified: "爱好",
      radical: "爫",
      level: ["new-1", "old-3"],
      frequency: 4902,
      pos: ["n", "v"],
      forms: [
        {
          traditional: "愛好",
          transcriptions: {
            pinyin: "ài hào",
            numeric: "ai4 hao4",
          },
          meanings: ["to like; hobby"],
          classifiers: ["个"],
        },
      ],
    })

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(true)
    expect(result.imported).toBe(1)
    expect(result.failed).toBe(0)
    expect(result.errors).toHaveLength(0)
    expect(data).toHaveLength(1)
    expect(data[0].simplified).toBe("爱好")
    expect(data[0].radical).toBe("爫")
    expect(data[0].frequency).toBe(4902)
    expect(data[0].hskLevels).toEqual(["new-1", "old-3"])
    expect(data[0].partsOfSpeech).toEqual(["n", "v"])
    expect(data[0].forms).toHaveLength(1)
    expect(data[0].forms[0].traditional).toBe("愛好")
    expect(data[0].forms[0].isDefault).toBe(true)
  })

  it("should parse valid JSON array", () => {
    const json = JSON.stringify([
      {
        simplified: "爱",
        forms: [
          {
            traditional: "愛",
            transcriptions: { pinyin: "ài" },
            meanings: ["to love"],
          },
        ],
      },
      {
        simplified: "好",
        forms: [
          {
            traditional: "好",
            transcriptions: { pinyin: "hǎo" },
            meanings: ["good"],
          },
        ],
      },
    ])

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(true)
    expect(result.imported).toBe(2)
    expect(data).toHaveLength(2)
  })

  it("should handle missing optional fields", () => {
    const json = JSON.stringify({
      simplified: "好",
      forms: [
        {
          traditional: "好",
          transcriptions: { pinyin: "hǎo" },
          meanings: ["good"],
        },
      ],
    })

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(true)
    // Optional fields come back as undefined / empty lists.
    expect(data[0].radical).toBeUndefined()
    expect(data[0].frequency).toBeUndefined()
    expect(data[0].hskLevels).toEqual([])
    expect(data[0].partsOfSpeech).toEqual([])
  })

  it("should split semicolon-separated meanings", () => {
    const json = JSON.stringify({
      simplified: "好",
      forms: [
        {
          traditional: "好",
          transcriptions: { pinyin: "hǎo" },
          meanings: ["good; fine; nice"],
        },
      ],
    })

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(true)
    expect(data[0].forms[0].meanings).toHaveLength(3)
    expect(data[0].forms[0].meanings[0].meaning).toBe("good")
    expect(data[0].forms[0].meanings[1].meaning).toBe("fine")
    expect(data[0].forms[0].meanings[2].meaning).toBe("nice")
  })

  it("should handle multiple forms with second form not being default", () => {
    const json = JSON.stringify({
      simplified: "爱",
      forms: [
        {
          traditional: "愛",
          transcriptions: { pinyin: "ài" },
          meanings: ["to love"],
        },
        {
          traditional: "爱",
          transcriptions: { pinyin: "ài" },
          meanings: ["to love (simplified)"],
        },
      ],
    })

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(true)
    expect(data[0].forms).toHaveLength(2)
    // Only the first listed form is marked as the default.
    expect(data[0].forms[0].isDefault).toBe(true)
    expect(data[0].forms[1].isDefault).toBe(false)
  })

  it("should handle multiple transcription types", () => {
    const json = JSON.stringify({
      simplified: "好",
      forms: [
        {
          traditional: "好",
          transcriptions: {
            pinyin: "hǎo",
            numeric: "hao3",
            wadegiles: "hao3",
          },
          meanings: ["good"],
        },
      ],
    })

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(true)
    expect(data[0].forms[0].transcriptions).toHaveLength(3)
    expect(data[0].forms[0].transcriptions.map(t => t.type)).toContain("pinyin")
    expect(data[0].forms[0].transcriptions.map(t => t.type)).toContain("numeric")
    expect(data[0].forms[0].transcriptions.map(t => t.type)).toContain("wadegiles")
  })

  it("should return error for invalid JSON", () => {
    const json = "{ invalid json }"

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(false)
    expect(result.imported).toBe(0)
    expect(result.errors).toHaveLength(1)
    expect(result.errors[0].error).toContain("Invalid JSON")
    expect(data).toHaveLength(0)
  })

  it("should return error for missing required fields", () => {
    const json = JSON.stringify({
      simplified: "好",
      // Missing forms
    })

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(false)
    expect(result.failed).toBe(1)
    expect(result.errors).toHaveLength(1)
    expect(data).toHaveLength(0)
  })

  it("should return error for empty simplified field", () => {
    const json = JSON.stringify({
      simplified: "",
      forms: [
        {
          traditional: "好",
          transcriptions: { pinyin: "hǎo" },
          meanings: ["good"],
        },
      ],
    })

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(false)
    expect(result.errors).toHaveLength(1)
  })

  it("should return error for empty meanings array", () => {
    const json = JSON.stringify({
      simplified: "好",
      forms: [
        {
          traditional: "好",
          transcriptions: { pinyin: "hǎo" },
          meanings: [],
        },
      ],
    })

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(false)
    expect(result.errors).toHaveLength(1)
  })

  it("should continue parsing after errors", () => {
    // Entry 2 is invalid; entries 1 and 3 should still import.
    const json = JSON.stringify([
      {
        simplified: "好",
        forms: [
          {
            traditional: "好",
            transcriptions: { pinyin: "hǎo" },
            meanings: ["good"],
          },
        ],
      },
      {
        simplified: "", // Invalid
        forms: [
          {
            traditional: "x",
            transcriptions: { pinyin: "x" },
            meanings: ["x"],
          },
        ],
      },
      {
        simplified: "爱",
        forms: [
          {
            traditional: "愛",
            transcriptions: { pinyin: "ài" },
            meanings: ["love"],
          },
        ],
      },
    ])

    const { result, data } = parseHSKJson(json)

    expect(result.success).toBe(false)
    expect(result.imported).toBe(2)
    expect(result.failed).toBe(1)
    expect(data).toHaveLength(2)
  })
})

describe("validateHSKJsonEntry", () => {
  it("should validate correct entry", () => {
    const entry = {
      simplified: "好",
      forms: [
        {
          traditional: "好",
          transcriptions: { pinyin: "hǎo" },
          meanings: ["good"],
        },
      ],
    }

    const result = validateHSKJsonEntry(entry)

    expect(result.valid).toBe(true)
    expect(result.errors).toHaveLength(0)
  })

  it("should return errors for invalid entry", () => {
    const entry = {
      simplified: "",
      forms: [],
    }

    const result = validateHSKJsonEntry(entry)

    expect(result.valid).toBe(false)
    expect(result.errors.length).toBeGreaterThan(0)
  })
})
|
||||
161
src/lib/import/hsk-json-parser.ts
Normal file
161
src/lib/import/hsk-json-parser.ts
Normal file
@@ -0,0 +1,161 @@
|
||||
import { z } from "zod"
|
||||
import type {
|
||||
HSKJsonEntry,
|
||||
HSKJsonForm,
|
||||
ParsedHanzi,
|
||||
ParsedHanziForm,
|
||||
ImportResult,
|
||||
ImportError,
|
||||
} from "./types"
|
||||
|
||||
/**
 * Zod schemas for the HSK JSON vocabulary format.
 *
 * A form needs a traditional spelling, at least a pinyin transcription, and
 * at least one non-empty meaning; additional transcription systems beyond
 * the named ones are accepted via the catchall.
 */
const HSKJsonFormSchema = z.object({
  traditional: z.string().min(1),
  transcriptions: z.object({
    pinyin: z.string().min(1),
    numeric: z.string().optional(),
    wadegiles: z.string().optional(),
  }).catchall(z.string().optional()), // allow other transcription systems
  meanings: z.array(z.string().min(1)).min(1),
  classifiers: z.array(z.string()).optional(),
})

// An entry requires a non-empty simplified spelling and at least one form;
// HSK level tags, radical, frequency rank, and part-of-speech tags are optional.
const HSKJsonEntrySchema = z.object({
  simplified: z.string().min(1),
  radical: z.string().optional(),
  level: z.array(z.string()).optional(),
  frequency: z.number().int().positive().optional(),
  pos: z.array(z.string()).optional(),
  forms: z.array(HSKJsonFormSchema).min(1),
})
|
||||
|
||||
/**
|
||||
* Parse HSK JSON format
|
||||
* Source: https://github.com/drkameleon/complete-hsk-vocabulary
|
||||
*/
|
||||
export function parseHSKJson(jsonString: string): {
|
||||
result: ImportResult
|
||||
data: ParsedHanzi[]
|
||||
} {
|
||||
const errors: ImportError[] = []
|
||||
const parsed: ParsedHanzi[] = []
|
||||
let entries: unknown[]
|
||||
|
||||
// Parse JSON
|
||||
try {
|
||||
const data = JSON.parse(jsonString)
|
||||
entries = Array.isArray(data) ? data : [data]
|
||||
} catch (error) {
|
||||
return {
|
||||
result: {
|
||||
success: false,
|
||||
imported: 0,
|
||||
failed: 0,
|
||||
errors: [{ error: `Invalid JSON: ${error instanceof Error ? error.message : "Unknown error"}` }],
|
||||
},
|
||||
data: [],
|
||||
}
|
||||
}
|
||||
|
||||
// Validate and transform each entry
|
||||
for (let i = 0; i < entries.length; i++) {
|
||||
try {
|
||||
const entry = HSKJsonEntrySchema.parse(entries[i])
|
||||
const parsedEntry = transformHSKJsonEntry(entry)
|
||||
parsed.push(parsedEntry)
|
||||
} catch (error) {
|
||||
const simplified = (entries[i] as any)?.simplified || "unknown"
|
||||
const errorMessage = error instanceof z.ZodError
|
||||
? error.errors.map(e => `${e.path.join(".")}: ${e.message}`).join(", ")
|
||||
: error instanceof Error
|
||||
? error.message
|
||||
: "Unknown error"
|
||||
|
||||
errors.push({
|
||||
line: i + 1,
|
||||
character: simplified,
|
||||
error: errorMessage,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
result: {
|
||||
success: errors.length === 0,
|
||||
imported: parsed.length,
|
||||
failed: errors.length,
|
||||
errors,
|
||||
},
|
||||
data: parsed,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Transform HSK JSON entry to ParsedHanzi format
|
||||
*/
|
||||
function transformHSKJsonEntry(entry: HSKJsonEntry): ParsedHanzi {
|
||||
return {
|
||||
simplified: entry.simplified,
|
||||
radical: entry.radical,
|
||||
frequency: entry.frequency,
|
||||
hskLevels: entry.level || [],
|
||||
partsOfSpeech: entry.pos || [],
|
||||
forms: entry.forms.map((form, index) => transformHSKJsonForm(form, index === 0)),
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Transform HSK JSON form to ParsedHanziForm format
|
||||
*/
|
||||
function transformHSKJsonForm(form: HSKJsonForm, isDefault: boolean): ParsedHanziForm {
|
||||
// Extract transcriptions
|
||||
const transcriptions = Object.entries(form.transcriptions)
|
||||
.filter(([_, value]) => value !== undefined)
|
||||
.map(([type, value]) => ({
|
||||
type,
|
||||
value: value!,
|
||||
}))
|
||||
|
||||
// Parse meanings (can be semicolon-separated or array)
|
||||
const meanings = form.meanings.flatMap((meaningStr, index) =>
|
||||
meaningStr.split(";").map((m, subIndex) => ({
|
||||
languageCode: "en", // Default to English
|
||||
meaning: m.trim(),
|
||||
orderIndex: index * 100 + subIndex,
|
||||
}))
|
||||
)
|
||||
|
||||
return {
|
||||
traditional: form.traditional,
|
||||
isDefault,
|
||||
transcriptions,
|
||||
meanings,
|
||||
classifiers: form.classifiers || [],
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate a single HSK JSON entry
|
||||
*/
|
||||
export function validateHSKJsonEntry(entry: unknown): {
|
||||
valid: boolean
|
||||
errors: string[]
|
||||
} {
|
||||
try {
|
||||
HSKJsonEntrySchema.parse(entry)
|
||||
return { valid: true, errors: [] }
|
||||
} catch (error) {
|
||||
if (error instanceof z.ZodError) {
|
||||
return {
|
||||
valid: false,
|
||||
errors: error.errors.map(e => `${e.path.join(".")}: ${e.message}`),
|
||||
}
|
||||
}
|
||||
return {
|
||||
valid: false,
|
||||
errors: [error instanceof Error ? error.message : "Unknown error"],
|
||||
}
|
||||
}
|
||||
}
|
||||
77
src/lib/import/types.ts
Normal file
77
src/lib/import/types.ts
Normal file
@@ -0,0 +1,77 @@
|
||||
/**
 * Types for HSK JSON and CSV import formats.
 */

/** One written form of an entry in the HSK JSON source format. */
export interface HSKJsonForm {
  traditional: string
  // Pinyin is required; other romanization systems are optional and
  // arbitrary extra systems are allowed via the index signature.
  transcriptions: {
    pinyin: string
    numeric?: string
    wadegiles?: string
    [key: string]: string | undefined
  }
  // Each string may contain several senses separated by ";".
  meanings: string[]
  classifiers?: string[]
}

/** A vocabulary entry in the HSK JSON source format. */
export interface HSKJsonEntry {
  simplified: string
  radical?: string
  // HSK level tags, e.g. "new-1", "old-3".
  level?: string[]
  frequency?: number
  // Part-of-speech tags, e.g. "n", "v".
  pos?: string[]
  forms: HSKJsonForm[]
}

/** One data row of the CSV import format; every cell arrives as a string. */
export interface CSVRow {
  simplified: string
  traditional: string
  pinyin: string
  meaning: string
  hsk_level?: string   // comma-separated list
  radical?: string
  frequency?: string   // numeric string; converted during transformation
  pos?: string         // comma-separated list
  classifiers?: string // comma-separated list
}

/** Normalized entry produced by both the CSV and HSK JSON importers. */
export interface ParsedHanzi {
  simplified: string
  radical?: string
  frequency?: number
  forms: ParsedHanziForm[]
  hskLevels: string[]
  partsOfSpeech: string[]
}

/** A normalized written form with its transcriptions and meanings. */
export interface ParsedHanziForm {
  traditional: string
  // The importers mark the first form of an entry as the default.
  isDefault: boolean
  transcriptions: ParsedTranscription[]
  meanings: ParsedMeaning[]
  classifiers: string[]
}

/** A single transcription, e.g. { type: "pinyin", value: "hǎo" }. */
export interface ParsedTranscription {
  type: string
  value: string
}

/** A single sense of a form in one language. */
export interface ParsedMeaning {
  languageCode: string
  meaning: string
  orderIndex: number
}

/** Summary of an import run. */
export interface ImportResult {
  success: boolean
  imported: number
  failed: number
  errors: ImportError[]
}

/** One failure report; `line` is a 1-based row/entry index where applicable. */
export interface ImportError {
  line?: number
  character?: string
  error: string
}
|
||||
Reference in New Issue
Block a user