欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

【自然语言实战】·第二章(1.1)——获取词语首字字母

程序员文章站 2024-02-28 11:49:40
...

一、maven依赖

        <dependency>
            <groupId>net.sourceforge.pinyin4j</groupId>
            <artifactId>pinyin4j</artifactId>
            <version>2.5.0</version>
        </dependency>

二、示例代码

import com.pingan.lcloud.ark.log.LoggerUtil;
import net.sourceforge.pinyin4j.PinyinHelper;
import org.apache.commons.lang3.CharUtils;
import org.apache.commons.lang3.StringUtils;

import java.lang.annotation.Native;
import java.util.Objects;
import java.util.regex.Pattern;

/**
 * <code>Details determine success.</code>
 * by Liang ZC., [email protected]
 * <p>
 * Utilities for Chinese text: extracting the upper-case initial letter of a word and
 * stripping punctuation. Pinyin lookup is delegated to pinyin4j's {@link PinyinHelper}.
 * All members are static.
 *
 * @author LIANGZHICHENG035
 * @date 2019-11-6 15:57
 * @see <a href="http://www.stanford.edu">http://www.stanford.edu</a>
 */
public class ChineseUtils {
    /*
     *           N777777777NO
     *         N7777777777777N
     *        M777777777777777N
     *        *N877777777D77777M
     *       N M77777777ONND777M
     *       MN777777777NN  D777
     *     N7ZN777777777NN ~M7778
     *    N777777777777MMNN88777N
     *    N777777777777MNZZZ7777O
     *    DZN7777O77777777777777
     *     N7OONND7777777D77777N
     *      8*M++++?N???$77777$
     *       M7++++N+M77777777N
     *        N77O777777777777$                              M
     *          DNNM$$$$777777N                              D
     *         N*N:=N$777N7777M                             NZ
     *        77Z::::N777777777                          ODZZZ
     *       77N::::::N77777777M                         NNZZZ$
     *     $777:::::::77777777MN                        ZM8ZZZZZ
     *     777M::::::Z7777777Z77                        N++ZZZZNN
     *    7777M:::::M7777777$777M                       $++IZZZZM
     *   M777$:::::N777777*M7777M                       +++++ZZZDN
     *     NN$::::::7777$*M777777N                      N+++ZZZZNZ
     *       N::::::N:7*O:77777777                      N++++ZZZZN
     *       M::::::::::::N77777777+                   +?+++++ZZZM
     *       8::::::::::::D77777777M                    O+++++ZZ
     *        ::::::::::::M777777777N                      O+?D
     *        M:::::::::::M77777777778                     77=
     *        D=::::::::::N7777777777N                    777
     *       INN===::::::=77777777777N                  I777N
     *      ?777N========N7777777777787M               N7777
     *      77777*D======N77777777777N777N?         N777777
     *     I77777$$*N7===M$$77777777$77777777*MMZ77777777N
     *      $$$$$$$$$$*NIZN$$$$$$$$*M$$7777777777777777ON
     *       M$$$$$$$*M    M$$$$$$$*N=N$$$$7777777$$*ND
     *      O77Z$$$$$$$     M$$$$$$$*MNI==*DNNNNM=~N
     *   7 :N MNN$$$*M$      $$$777$8      8D8I
     *     NMM.:7O           777777778
     *                       7777777MN
     *                       M NO .7:
     *                       M   :   M
     *                            8
     */

    /**
     * Matches one character of the Unicode "Punctuation" category ({@code \pP}).
     * Compiled once instead of on every call. Symbols such as {@code $} or {@code +}
     * belong to other categories and are not matched.
     */
    private static final Pattern PUNCTUATION = Pattern.compile("\\pP");

    /** First character (U+4E00) of the range this class treats as Chinese. */
    private static final char CHINESE_FIRST = '\u4E00';

    /** Last character (U+9FA5) of the range this class treats as Chinese. */
    private static final char CHINESE_LAST = '\u9FA5';

    /**
     * Kept public so existing callers that instantiate the class keep compiling;
     * every method is static, so an instance is never needed.
     */
    public ChineseUtils() {
    }

    /**
     * Returns the initial of {@code chinese} as computed by
     * {@link #getChineseInitial(String)} (punctuation removed first), or
     * {@code defaultValue} when that initial is empty.
     *
     * <pre>
     *  ChineseUtils.getChineseInitialDefaultIfEmpty("我爱中国", "#")  =  "W"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty("1爱中国", "#")   =  "1"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty("@#国", "#")      =  "G"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty(null, "#")        =  "#"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty("", "#")          =  "#"
     *  ChineseUtils.getChineseInitialDefaultIfEmpty(",", "#")         =  "#"
     * </pre>
     *
     * @param chinese      the text whose initial is wanted; may be {@code null}
     * @param defaultValue the value returned when no initial can be determined
     * @return the upper-case initial, or {@code defaultValue} if the initial is empty
     */
    public static String getChineseInitialDefaultIfEmpty(String chinese, String defaultValue) {
        String result = getChineseInitial(chinese, true);
        return StringUtils.isEmpty(result) ? defaultValue : result;
    }

    /**
     * Returns the initial of {@code chinese} after removing punctuation; shorthand for
     * {@code getChineseInitial(chinese, true)}.
     *
     * <pre>
     *  ChineseUtils.getChineseInitial("我爱中国")   =  "W"
     *  ChineseUtils.getChineseInitial("爱中国")     =  "A"
     *  ChineseUtils.getChineseInitial("1爱中国")    =  "1"
     *  ChineseUtils.getChineseInitial("中国")       =  "Z"
     *  ChineseUtils.getChineseInitial("@#国")       =  "G"
     *  ChineseUtils.getChineseInitial("国%$")       =  "G"
     *  ChineseUtils.getChineseInitial("国")         =  "G"
     *  ChineseUtils.getChineseInitial("W我爱中国")  =  "W"
     *  ChineseUtils.getChineseInitial("I我爱中国")  =  "I"
     *  ChineseUtils.getChineseInitial("null")       =  "N"
     *  ChineseUtils.getChineseInitial(null)         =  ""
     *  ChineseUtils.getChineseInitial("")           =  ""
     *  ChineseUtils.getChineseInitial(",")          =  ""
     * </pre>
     *
     * @param chinese the text whose initial is wanted; may be {@code null}
     * @return the upper-case initial, or {@link StringUtils#EMPTY} if none can be determined
     */
    public static String getChineseInitial(String chinese) {
        return getChineseInitial(chinese, true);
    }

    /**
     * Returns the upper-case initial of the first character of {@code chinese}.
     * <ul>
     *   <li>An ASCII letter or digit is returned as-is (letters upper-cased).</li>
     *   <li>A character in the range U+4E00..U+9FA5 is converted to pinyin and the first
     *       letter of its first reading is returned, so for a character with several
     *       readings only the first one is used.</li>
     *   <li>Anything else, including {@code null}, blank input or a failed pinyin lookup,
     *       yields {@link StringUtils#EMPTY}.</li>
     * </ul>
     *
     * <pre>
     *  ChineseUtils.getChineseInitial("我爱中国", true)   =  "W"
     *  ChineseUtils.getChineseInitial("1爱中国", true)    =  "1"
     *  ChineseUtils.getChineseInitial("@#国", true)       =  "G"
     *  ChineseUtils.getChineseInitial(", 国", true)       =  "G"
     *  ChineseUtils.getChineseInitial("@#国", false)      =  ""
     *  ChineseUtils.getChineseInitial("null", true)       =  "N"
     *  ChineseUtils.getChineseInitial(null, true)         =  ""
     *  ChineseUtils.getChineseInitial("", true)           =  ""
     *  ChineseUtils.getChineseInitial(",", true)          =  ""
     * </pre>
     *
     * @param chinese           the text whose initial is wanted; may be {@code null}
     * @param removePunctuation whether to strip punctuation (and surrounding whitespace)
     *                          from {@code chinese} before looking at its first character
     * @return the upper-case initial, or {@link StringUtils#EMPTY} if none can be determined
     */
    public static String getChineseInitial(String chinese, boolean removePunctuation) {
        String text = removePunctuation ? removePunctuation(chinese) : chinese;
        if (StringUtils.isBlank(text)) {
            return StringUtils.EMPTY;
        }

        char firstChar = text.charAt(0);
        if (CharUtils.isAsciiAlphanumeric(firstChar)) {
            // Character.toUpperCase ignores the default locale, unlike String.toUpperCase(),
            // which maps "i" to a dotted capital I under a Turkish locale.
            return String.valueOf(Character.toUpperCase(firstChar));
        }
        if (!isChinese(firstChar)) {
            return StringUtils.EMPTY;
        }

        try {
            String[] readings = PinyinHelper.toHanyuPinyinStringArray(firstChar);
            // pinyin4j documents a null result for characters it has no reading for;
            // treat that (and an empty reading) as "no initial" instead of provoking an exception.
            if (readings == null || readings.length == 0 || StringUtils.isEmpty(readings[0])) {
                return StringUtils.EMPTY;
            }
            return String.valueOf(Character.toUpperCase(readings[0].charAt(0)));
        } catch (RuntimeException e) {
            // Best effort: a failing lookup must not break the caller.
            LoggerUtil.warn("get " + text + " chinese initial fail.", e);
        }

        return StringUtils.EMPTY;
    }

    /**
     * Removes every punctuation character from {@code str} and trims the result.
     * Punctuation is stripped before trimming so that whitespace which only becomes
     * leading or trailing once the punctuation is gone is removed as well.
     *
     * <pre>
     *  ChineseUtils.removePunctuation(null)             =  ""
     *  ChineseUtils.removePunctuation("")               =  ""
     *  ChineseUtils.removePunctuation(" ")              =  ""
     *  ChineseUtils.removePunctuation("我爱中国")       =  "我爱中国"
     *  ChineseUtils.removePunctuation("我爱中国!")      =  "我爱中国"
     *  ChineseUtils.removePunctuation("我爱中国。")     =  "我爱中国"
     *  ChineseUtils.removePunctuation("我爱中国.")      =  "我爱中国"
     *  ChineseUtils.removePunctuation("  我爱中国.  ")  =  "我爱中国"
     *  ChineseUtils.removePunctuation(", 我爱中国 .")   =  "我爱中国"
     * </pre>
     *
     * @param str the text to clean; may be {@code null}
     * @return {@code str} without punctuation and without leading or trailing whitespace,
     *         or {@link StringUtils#EMPTY} if {@code str} is {@code null} or empty
     */
    public static String removePunctuation(String str) {
        if (StringUtils.isEmpty(str)) {
            return StringUtils.EMPTY;
        }

        return PUNCTUATION.matcher(str).replaceAll(StringUtils.EMPTY).trim();
    }

    /** Tells whether {@code c} lies in the range U+4E00..U+9FA5 that this class treats as Chinese. */
    private static boolean isChinese(char c) {
        return c >= CHINESE_FIRST && c <= CHINESE_LAST;
    }

}

三、运行结果

    /**
     * Demo entry point: prints the result of {@code ChineseUtils.getChineseInitial} for each
     * sample input, one result per line, in the order the samples are listed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String[] samples = {
                "我爱中国",
                "爱中国",
                "1爱中国",
                "中国",
                "@#国",
                "国%$",
                "国",
                "W我爱中国",
                "I我爱中国",
                "null",
                null,
                "",
                ","
        };
        for (String sample : samples) {
            System.out.println(ChineseUtils.getChineseInitial(sample));
        }
    }
W
A
1
Z
G
G
G
W
I
N
相关标签: 自然语言处理