한영타 변환기를 만들어보자! [1] (알고리즘 구성)

python

한영타 변환기를 만들어보자! [1] (알고리즘 구성)

J-Mook 2022. 7. 4. 15:14

한영타 변환기를 만들기 위해 설계된 기본적인 구조는

이렇게 된다.

첫번째로 영타를 한국어로 변환해 주었다. 각 영타에 대응되는 한국어를 한글자씩 딕셔너리에 넣어주고, 입력된 영타를 읽어 한글로 변환하였다.

# QWERTY key -> Hangul compatibility jamo produced by the same physical key.
# The two strings below are aligned position by position (same grouping in
# both), so the n-th key maps to the n-th jamo.  Shifted keys that have no
# distinct jamo of their own (e.g. 'A') map to the same jamo as the
# lowercase key.
ko_dict = dict(zip(
    "qQwW" "eErRt" "Tyuio" "pasdf" "ghjkl" "zxcvb"
    "nmOPY" "UIHJK" "LBNMA" "SDFGZ" "XCV",
    "ㅂㅃㅈㅉ" "ㄷㄸㄱㄲㅅ" "ㅆㅛㅕㅑㅐ" "ㅔㅁㄴㅇㄹ" "ㅎㅗㅓㅏㅣ" "ㅋㅌㅊㅍㅠ"
    "ㅜㅡㅒㅖㅛ" "ㅕㅑㅗㅓㅏ" "ㅣㅠㅜㅡㅁ" "ㄴㅇㄹㅎㅋ" "ㅌㅊㅍ",
))
        
# Translate every typed character to its jamo.  Characters without an entry
# in ko_dict (digits, spaces, punctuation, text that is already Hangul, ...)
# are passed through unchanged.  dict.get replaces the former bare `except:`,
# which would also have hidden unrelated errors.
ko_word = [ko_dict.get(c, c) for c in main_input]
# Flatten to single characters and append a trailing '\n' marker; the
# splitting step always places this last element in the final chunk.
ko_word = list(''.join(ko_word)) + ['\n']

그렇게 만들어진 한글 배열을 음절별로 끊어 주었다. 끊는 기준은 자음 한 개와 모음 한 개가 연속으로 올 때이며, 그 자음 바로 앞에서 나누어 주었다.

# Cut the jamo stream into one chunk per syllable.  A new chunk starts
#   * at a consonant that is immediately followed by a vowel (that consonant
#     is the initial of the next syllable), or
#   * at any character that is neither in choseong_list nor in jungseong_list
#     (space, digit, punctuation, ...).
# The last index is handled first so that ko_word[i + 1] is never read past
# the end of the list; everything still pending goes into the final chunk.
words = []
start = 0
last_index = len(ko_word) - 1
for i in range(1, len(ko_word)):
    if i == last_index:
        words.append(ko_word[start:])
    elif (ko_word[i] in choseong_list and ko_word[i + 1] in jungseong_list) or (
        ko_word[i] not in choseong_list and ko_word[i] not in jungseong_list
    ):
        words.append(ko_word[start:i])
        start = i

알파벳과 한글의 가장 큰 차이는 이중모음과 이중자음이 있다는 것이다. 영타에서는 이 부분이 구분되지 않고 입력되기 때문에, 이 부분을 일일이 합쳐 줘야 한다.

다행히 초성에서는 쌍 자음만 나오기 때문에 맨 처음의 딕셔너리에서 처리해 주었고, 중성과 종성에서 조합되는 이중자음, 이중모음은 따로 함수로 만들어 처리하였다.

# Merge jamo that were typed as two keys but form a single compound jamo.
# Only chunks that start with initial consonant + vowel are touched.
for word in words:
    if len(word) > 2 and word[0] in choseong_list and word[1] in jungseong_list:
        # Compound vowel: the vowel at index 1 followed by another vowel.
        if word[2] in jungseong_list:
            merged = make_jungseong_list(word[1:3])
            if merged != word[1]:
                word[1:3] = [merged]
        # Compound final consonant: always the two jamo right after the
        # vowel (indices 2 and 3).  The previous version inspected the LAST
        # two elements but wrote the result to index 2, which overwrote an
        # unrelated jamo when the chunk had extra trailing consonants and
        # never fired when the chunk ended with the '\n' marker.
        if len(word) > 3 and word[2] in jongseong_list and word[3] in jongseong_list:
            merged = make_jongseong_list(word[2:4])
            if merged != word[2]:
                word[2:4] = [merged]
                    
# (first, second) final-consonant pair -> single compound final consonant.
_COMPOUND_JONGSEONG = {
    ('ㄱ', 'ㄱ'): 'ㄲ',
    ('ㄱ', 'ㅅ'): 'ㄳ',
    ('ㄴ', 'ㅈ'): 'ㄵ',
    ('ㄴ', 'ㅎ'): 'ㄶ',
    ('ㄹ', 'ㄱ'): 'ㄺ',
    ('ㄹ', 'ㅁ'): 'ㄻ',
    ('ㄹ', 'ㅂ'): 'ㄼ',
    ('ㄹ', 'ㅅ'): 'ㄽ',
    ('ㄹ', 'ㅌ'): 'ㄾ',
    ('ㄹ', 'ㅍ'): 'ㄿ',
    ('ㄹ', 'ㅎ'): 'ㅀ',
    ('ㅂ', 'ㅅ'): 'ㅄ',
}


def make_jongseong_list(char_list):
    """Combine the first two consonants of *char_list* into a compound final.

    Args:
        char_list: Sequence (list or string) of jamo; only the first two
            elements are examined.

    Returns:
        The compound final consonant when the pair is a known combination,
        otherwise the first element unchanged.  A one-element input returns
        that element (previously this raised IndexError, but only when the
        element happened to be ㄱ, ㄴ, ㄹ or ㅂ).

    Raises:
        IndexError: If *char_list* is empty.
    """
    first = char_list[0]
    if len(char_list) < 2:
        return first
    return _COMPOUND_JONGSEONG.get((first, char_list[1]), first)

# (first, second) vowel pair -> single compound vowel.
_COMPOUND_JUNGSEONG = {
    ('ㅗ', 'ㅏ'): 'ㅘ',
    ('ㅗ', 'ㅐ'): 'ㅙ',
    ('ㅗ', 'ㅣ'): 'ㅚ',
    ('ㅜ', 'ㅓ'): 'ㅝ',
    ('ㅜ', 'ㅔ'): 'ㅞ',
    ('ㅜ', 'ㅣ'): 'ㅟ',
    ('ㅡ', 'ㅣ'): 'ㅢ',
}


def make_jungseong_list(char_list):
    """Combine the first two vowels of *char_list* into a compound vowel.

    Args:
        char_list: Sequence (list or string) of jamo; only the first two
            elements are examined.

    Returns:
        The compound vowel when the pair is a known combination, otherwise
        the first element unchanged.  A one-element input returns that
        element (previously this raised IndexError, but only when the
        element happened to be ㅗ, ㅜ or ㅡ).

    Raises:
        IndexError: If *char_list* is empty.
    """
    first = char_list[0]
    if len(char_list) < 2:
        return first
    return _COMPOUND_JUNGSEONG.get((first, char_list[1]), first)

마지막 과정인 한타변환은 유니코드로 변환하는 부분인데, 유니코드에서 한글은 특정 패턴으로 구성되기 때문에 0xAC00 + 종성번호 + (초성번호 * 21 * 28) + (중성번호 * 28) 식을 사용해서 계산했다.

두 번째 과정(음절로 묶기)에서 사용한 방식의 가장 큰 문제는, '옹ㅇㅇㅇㅇ'같이 자음+모음+자음+자음+자음... 과 같은 구조를 한 번에 묶는다는 것이다. 이 과정에서 뒷부분 자음이 출력되지 않는 문제가 발생하므로, 유니코드 계산 후 남아 있는 문자들을 그대로 모두 출력하는 while문을 사용하여 문제를 해결하였다.

# Turn every chunk into output characters: a leading
# (initial, vowel[, final]) group becomes one precomposed Hangul syllable,
# and whatever follows it in the chunk is emitted unchanged.
output_list = []
for char in words:
    used = 0  # number of leading jamo folded into the composed syllable
    jongseong_index = 0
    starts_syllable = (
        len(char) > 1
        and char[0] in choseong_list
        and char[1] in jungseong_list
    )
    if starts_syllable:
        choseong_index = choseong_list.index(char[0])
        jungseong_index = jungseong_list.index(char[1])
        used = 2
        if len(char) > 2 and char[2] in jongseong_list:
            jongseong_index = jongseong_list.index(char[2])
            used = 3
        # 0xAC00 is the first Hangul syllable; 21 vowels x 28 finals per initial.
        character_code = 0xAC00 + (choseong_index * 21 + jungseong_index) * 28 + jongseong_index
        output_list.append(chr(character_code))
    output_list.extend(char[used:])
    # Each chunk ends up fully consumed, exactly as with repeated pop(0).
    char.clear()

그렇게 만들어진 한타 배열을 출력하면 기본적인 구조는 완성되었다!!

# Concatenate the composed syllables and leftover characters and print them.
print(''.join(output_list))

중간 이중모음, 이중자음 처리 함수를 제외한 전체 코드를 함수화 하면,

def en2ko(main_input):
    """Convert text typed on a QWERTY layout into composed Hangul.

    The conversion runs in four stages:
      1. map every key to its jamo through ``ko_dict`` (unmapped characters
         are kept as they are);
      2. cut the jamo stream into per-syllable chunks;
      3. merge two-key compound vowels / compound final consonants;
      4. compose each chunk into a precomposed Hangul syllable.

    Intermediate results, the final result and the elapsed time are printed
    to stdout, as before.

    Args:
        main_input: The text to convert.

    Returns:
        The converted text.  For non-empty input it ends with the ``'\\n'``
        marker appended in stage 1; empty input yields ``''``.
    """
    start_time = time.time()

    # 1. convert en -> ko jamo.  dict.get replaces the former bare `except:`.
    ko_word = [ko_dict.get(c, c) for c in main_input]
    ko_word = list(''.join(ko_word)) + ['\n']
    print(ko_word)

    # 2. separate by syllable: a chunk starts at a consonant directly followed
    #    by a vowel, or at any character that is not a jamo of either list.
    words = []
    start = 0
    last_index = len(ko_word) - 1
    for i in range(1, len(ko_word)):
        if i == last_index:
            # Checked first so ko_word[i + 1] below is never out of range.
            words.append(ko_word[start:])
        elif (ko_word[i] in choseong_list and ko_word[i + 1] in jungseong_list) or (
            ko_word[i] not in choseong_list and ko_word[i] not in jungseong_list
        ):
            words.append(ko_word[start:i])
            start = i
    print(words)

    # 3. merge compound jamo inside chunks that start with initial + vowel.
    for word in words:
        if len(word) > 2 and word[0] in choseong_list and word[1] in jungseong_list:
            if word[2] in jungseong_list:
                merged = make_jungseong_list(word[1:3])
                if merged != word[1]:
                    word[1:3] = [merged]
            # The final-consonant pair is always at indices 2 and 3.  The old
            # code examined the LAST two elements but stored the result at
            # index 2: that clobbered an unrelated jamo in chunks with extra
            # trailing consonants, and never merged in the last chunk because
            # that chunk ends with the '\n' marker.
            if len(word) > 3 and word[2] in jongseong_list and word[3] in jongseong_list:
                merged = make_jongseong_list(word[2:4])
                if merged != word[2]:
                    word[2:4] = [merged]
    print(words)

    # 4. combine each chunk into a syllable; leftovers are emitted verbatim.
    output_list = []
    for word in words:
        used = 0
        if len(word) > 1 and word[0] in choseong_list and word[1] in jungseong_list:
            choseong_index = choseong_list.index(word[0])
            jungseong_index = jungseong_list.index(word[1])
            # 0 is used when no final consonant is consumed.
            # NOTE(review): assumes jongseong_list[0] is the "no final"
            # placeholder so real finals start at index 1 - confirm.
            jongseong_index = 0
            used = 2
            if len(word) > 2 and word[2] in jongseong_list:
                jongseong_index = jongseong_list.index(word[2])
                used = 3
            # 0xAC00 is the first Hangul syllable; 21 vowels x 28 finals per initial.
            character_code = 0xAC00 + (choseong_index * 21 + jungseong_index) * 28 + jongseong_index
            output_list.append(chr(character_code))
        output_list.extend(word[used:])

    result = ''.join(output_list)
    print("{}\t|    (변환)    |\n{}".format(main_input, result))
    print('{} : {}'.format(len(main_input), time.time() - start_time))
    return result

위와 같은 구조가 완성되었다!!!

이제 입력과 출력을 할 수 있는 인터페이스를 만들어 볼 것이다.

https://github.com/J-Mook/en2ko

GitHub - J-Mook/en2ko

Contribute to J-Mook/en2ko development by creating an account on GitHub.

github.com

'python' 카테고리의 다른 글

한영타 변환기를 만들어보자! [3] (Pyinstaller로 실행파일 만들기) (2)	2022.07.04
한영타 변환기를 만들어보자! [2] (tkinter로 UI만들기) (1)	2022.07.04
Recurrence plot (시계열 데이터를 학습시켜보자!) (0)	2022.05.25
python 경로 및 현재 경로의 파일 리스트 (0)	2022.01.07
Open3d - Hidden Point Removal (+논문리뷰) (0)	2021.09.07

현재글한영타 변환기를 만들어보자! [1] (알고리즘 구성)

이것 저것 많이하는 J-Mook의 블로그

XE2, 대전, 코딩, 올드렌즈, 42서울, 현대, 후쿠오카, 신사, 텐진, Python, 파이썬, 개발자, 애플, e2, 후지필름, 사진, 모터쇼, 맛집, 일본여행, 천체사진,

Today :
Yesterday :

일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30	31

J-Mook의 블로그