Go语言实现拼音转汉字

预先说明:以下示例假设项目(模块)名为 x。

第一步:在GitHub找个汉字拼音字典(随便找了一个输入法项目的字典)

https://github.com/MobtgZhang/LingJian-Pinyin/blob/main/data/pinyin_dict.txt

第二步:把字典下载下来放在项目里,例如:data/pinyin_dict.txt

第三步:创建 com/pinyin_han.go 文件,把代码复制进去

x/com/pinyin_han.go

go 复制代码
package com

import (
	"bufio"
	"os"
	"strings"
	"sync"
)

// PinyinHan maps pinyin strings to the Chinese text listed for them in a
// dictionary file. It keeps three indexes over the same entries; LoadDictionary
// fills them under the write lock and Convert reads them under the read lock.
type PinyinHan struct {
	// pyToHan is keyed by the pinyin exactly as it appears in the dictionary.
	pyToHan     map[string][]string
	abbrToHan   map[string][]string // abbreviation index (first character of each syllable)
	noToneToHan map[string][]string // index keyed by the pinyin with tone digits removed
	// mutex guards the three maps above.
	mutex       sync.RWMutex
}

// NewPinyinHan returns an empty PinyinHan whose three lookup maps are already
// allocated, ready to be filled by LoadDictionary.
func NewPinyinHan() *PinyinHan {
	p := new(PinyinHan)
	p.pyToHan = map[string][]string{}
	p.abbrToHan = map[string][]string{}
	p.noToneToHan = map[string][]string{}
	return p
}

// LoadDictionary reads the dictionary file at filename and adds its entries to
// the exact-pinyin, abbreviation and toneless indexes.
//
// Each line is split on whitespace: the first field is taken as the pinyin and
// the second as the Chinese text. Any further fields are ignored, as are lines
// with fewer than two fields. Entries are appended to whatever is already
// loaded, so loading the same file twice stores every entry twice.
//
// The file is parsed into temporary maps and merged into p only after it has
// been read without error. A failed load therefore leaves p unchanged, and the
// write lock is held only for the in-memory merge, not for the disk read.
//
// It returns the error from opening the file or from scanning it.
func (p *PinyinHan) LoadDictionary(filename string) error {
	file, err := os.Open(filename)
	if err != nil {
		return err
	}
	// The file is only read, so a Close error carries nothing actionable.
	defer func() { _ = file.Close() }()

	exact := make(map[string][]string)
	abbrs := make(map[string][]string)
	toneless := make(map[string][]string)

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		// strings.Fields drops surrounding whitespace and yields no fields
		// for a blank line, so no separate trim/empty check is needed.
		parts := strings.Fields(scanner.Text())
		if len(parts) < 2 {
			continue
		}
		pinyin, hanzi := parts[0], parts[1]

		exact[pinyin] = append(exact[pinyin], hanzi)

		abbr := p.getAbbreviation(pinyin)
		abbrs[abbr] = append(abbrs[abbr], hanzi)

		noTone := p.removeTone(pinyin)
		toneless[noTone] = append(toneless[noTone], hanzi)
	}
	if err := scanner.Err(); err != nil {
		return err
	}

	// merge appends every slice in src to the slice under the same key in
	// *dst, allocating *dst first so a zero-value PinyinHan also works.
	merge := func(dst *map[string][]string, src map[string][]string) {
		if *dst == nil {
			*dst = make(map[string][]string, len(src))
		}
		for key, values := range src {
			(*dst)[key] = append((*dst)[key], values...)
		}
	}

	p.mutex.Lock()
	defer p.mutex.Unlock()
	merge(&p.pyToHan, exact)
	merge(&p.abbrToHan, abbrs)
	merge(&p.noToneToHan, toneless)
	return nil
}

// getAbbreviation returns the abbreviation of pinyin: the first character of
// every whitespace-separated syllable, concatenated in order. An empty or
// all-whitespace input yields "".
func (p *PinyinHan) getAbbreviation(pinyin string) string {
	var abbr strings.Builder
	for _, syl := range strings.Fields(pinyin) {
		// Take the whole first character rather than the first byte, so a
		// syllable starting with a multi-byte letter (ā, è, ü, ...) does not
		// contribute a lone, invalid UTF-8 byte. Ranging over a string yields
		// the byte offset of each character; the second offset is where the
		// first character ends.
		end := len(syl)
		for i := range syl {
			if i > 0 {
				end = i
				break
			}
		}
		abbr.WriteString(syl[:end])
	}
	return abbr.String()
}

// removeTone returns pinyin with every ASCII tone digit '1' through '4'
// deleted. All other bytes, including the digits '0' and '5'-'9' and any
// multi-byte characters, are copied through unchanged.
func (p *PinyinHan) removeTone(pinyin string) string {
	var kept strings.Builder
	kept.Grow(len(pinyin))
	for _, b := range []byte(pinyin) {
		switch b {
		case '1', '2', '3', '4':
			// Tone digit: drop it.
		default:
			kept.WriteByte(b)
		}
	}
	return kept.String()
}

// Convert looks up pinyin and returns the Chinese text stored for it, or nil
// when nothing matches.
//
// The indexes are tried in priority order and the first hit wins:
//  1. exact match on pinyin as given,
//  2. match on pinyin with tone digits removed,
//  3. match on pinyin treated as an abbreviation.
//
// The returned slice is a copy, so callers may modify or append to it freely
// without touching the dictionary shared with other callers.
func (p *PinyinHan) Convert(pinyin string) []string {
	p.mutex.RLock()
	defer p.mutex.RUnlock()

	hit, ok := p.pyToHan[pinyin]
	if !ok {
		hit, ok = p.noToneToHan[p.removeTone(pinyin)]
	}
	if !ok {
		hit, ok = p.abbrToHan[pinyin]
	}
	if !ok {
		return nil
	}

	// Copy before the read lock is released: handing out the map's own slice
	// would let a caller's write or append alias the shared backing array.
	return append([]string(nil), hit...)
}

// Run loads the dictionary at dictPath into p, then converts each element of
// pinyin and returns all matches concatenated in input order.
//
// Spaces are stripped from every element before lookup. Elements with no match
// contribute nothing. On a load error it returns (nil, err); otherwise the
// result is a non-nil slice, possibly empty.
//
// NOTE(review): the dictionary is loaded on every call, so calling Run more
// than once on the same PinyinHan loads the file again each time.
func (p *PinyinHan) Run(dictPath string, pinyin []string) ([]string, error) {
	if err := p.LoadDictionary(dictPath); err != nil {
		return nil, err
	}

	out := []string{}
	for _, query := range pinyin {
		compact := strings.ReplaceAll(query, " ", "")
		out = append(out, p.Convert(compact)...)
	}
	return out, nil
}

第四步:直接调用(以Test为例)

x/test/pinyin_han_test.go

go 复制代码
package test

import (
	"testing"
	"x/com"
)

// TestPinyinHan runs a handful of sample pinyin strings through a fresh
// PinyinHan backed by ../data/pinyin_dict.txt and logs the result. It fails
// only if Run returns an error; the converted output is not asserted.
func TestPinyinHan(t *testing.T) {
	inputs := []string{"guangdong", "hainan", "fujian", "taiwan", "a"}

	converter := com.NewPinyinHan()
	han, err := converter.Run("../data/pinyin_dict.txt", inputs)
	if err != nil {
		t.Error(err)
		return
	}

	t.Log(han)
}

输出结果:

说句题外话:字典其实可以根据业务再做一些轻量化定制。例如在我的项目里,只有检索功能会用到它,所以我会根据文本内容,借助分词器、拼音转换器生成自定义数据,再写进字典里。

评论