macOS에서 한글 파일 텍스트 추출하고 자동 요약하는 방법

macOS에서 한글 파일을 다루려면 먼저 Python 환경을 설정해야 해요. 터미널을 열고 필요한 패키지를 설치하세요.

# Install Python with Homebrew (skip if it is already installed)
brew install python

# Create and activate a virtual environment
python3 -m venv hwp_env
source hwp_env/bin/activate

# Install the required packages
pip install pyhwpx openai

pyhwp 대신 pyhwpx를 사용하는 이유는 macOS 호환성이 더 좋기 때문이에요. 원본 pyhwp는 Windows 환경에 최적화되어 있어서 macOS에서는 설치 오류가 자주 발생해요. 다만 라이브러리마다 지원하는 운영체제가 다를 수 있으니, 설치 전에 pyhwpx 공식 문서에서 macOS 지원 여부를 꼭 확인하세요.

HWP 파일을 텍스트로 변환하는 코드

import os
import pyhwpx
import openai
from pathlib import Path

# Configure the OpenAI API key from the OPENAI_API_KEY environment variable
# (os.environ.get yields None when the variable is not set)
openai.api_key = os.environ.get('OPENAI_API_KEY')

def extract_hwp_text(hwp_file_path):
    """Extract the text of an HWP document as a single string.

    Opens the document through ``pyhwpx.Hwp``, walks every section and every
    paragraph in it, and concatenates the paragraph texts, each followed by a
    newline.

    NOTE(review): ``sections()``, ``paragraphs()`` and ``text()`` are used
    exactly as in the original code; confirm them against the pyhwpx API.

    Args:
        hwp_file_path: Path of the HWP file, passed unchanged to ``Hwp.open``.

    Returns:
        The concatenated text, or ``None`` if opening or reading the document
        raised (the error message is printed).
    """
    hwp = None
    try:
        hwp = pyhwpx.Hwp()
        hwp.open(hwp_file_path)

        # Gather all paragraph texts first and join once, instead of growing
        # a string with ``+=`` inside the nested loop.
        pieces = [
            paragraph.text() + "\n"
            for section in hwp.sections()
            for paragraph in section.paragraphs()
        ]
        return "".join(pieces)
    except Exception as e:
        print(f"오류 발생: {e}")
        return None
    finally:
        # Release the Hwp instance on every path. Previously quit() was only
        # reached on success, so a failed open/read left the instance alive.
        if hwp is not None:
            try:
                hwp.quit()
            except Exception as e:
                # A failure while shutting down must not mask the result.
                print(f"오류 발생: {e}")

def save_text_file(text_content, output_path):
    """Write ``text_content`` to ``output_path`` as UTF-8 text.

    The file is opened in write mode, so an existing file at that path is
    replaced.
    """
    with open(output_path, mode='w', encoding='utf-8') as handle:
        handle.write(text_content)

macOS 터미널에서 환경 변수로 OpenAI API 키를 설정하는 방법도 알아두세요. 아래 명령을 실행하면 .zshrc 파일에 export 줄이 추가되고, 현재 터미널에도 바로 적용돼요. your-api-key-here 부분은 실제 API 키로 바꿔 주세요.

# Append the export line to ~/.zshrc, then reload the file in the current shell
echo 'export OPENAI_API_KEY="your-api-key-here"' >> ~/.zshrc
source ~/.zshrc

디렉토리 내 모든 HWP 파일 일괄 처리

def process_directory(input_dir, output_dir):
    """Convert every ``.hwp`` file under ``input_dir`` into a ``.txt`` file.

    The input directory is searched recursively. Each file whose extraction
    yields non-empty text is saved as ``<stem>.txt`` directly inside
    ``output_dir`` (created if missing); files that yield nothing are skipped.
    Progress is printed for every file.
    """
    source_root = Path(input_dir)
    target_root = Path(output_dir)

    # Make sure the destination exists before writing anything into it.
    target_root.mkdir(parents=True, exist_ok=True)

    for hwp_file in list(source_root.glob('**/*.hwp')):
        print(f"처리 중: {hwp_file.name}")

        extracted = extract_hwp_text(str(hwp_file))
        if not extracted:
            # Extraction failed or produced no text: nothing to save.
            continue

        destination = target_root / f"{hwp_file.stem}.txt"
        save_text_file(extracted, str(destination))
        print(f"저장 완료: {destination.name}")

GPT API를 사용한 텍스트 요약

텍스트가 길 경우 청크로 나누어 처리하는 것이 효율적이에요. API 호출 비용도 절약할 수 있고, 응답 품질도 더 좋아져요.

def split_text_into_chunks(text, max_tokens=2000):
    """Split ``text`` into whitespace-joined chunks of bounded length.

    The text is split on whitespace and the words are re-joined with single
    spaces, so original line breaks are not preserved.

    Args:
        text: The text to split.
        max_tokens: Maximum length of each chunk. Despite the name, this is
            measured in characters of the joined chunk, not model tokens.

    Returns:
        A list of non-empty chunk strings; an empty list when ``text``
        contains no words. A single word longer than ``max_tokens`` is kept
        whole as its own chunk rather than being cut.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # always equals len(' '.join(current_chunk))

    for word in text.split():
        # A separator is only needed when the chunk already holds a word.
        added = len(word) + (1 if current_chunk else 0)
        if current_chunk and current_length + added > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            # Also taken for the first word of a chunk, so an oversized first
            # word never flushes an empty chunk.
            current_chunk.append(word)
            current_length += added

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def summarize_with_gpt(text, model="gpt-4"):
    """Summarize ``text`` with the OpenAI chat completion API.

    Works with both the current ``openai`` package (>=1.0, via
    ``openai.chat.completions.create``) and the legacy 0.x package (via
    ``openai.ChatCompletion.create``). The legacy entry point was removed in
    1.0, so calling it unconditionally fails on a fresh install.

    Args:
        text: The document text to summarize.
        model: Name of the chat model to use.

    Returns:
        The summary string, or ``None`` when ``text`` is empty or blank, or
        when the API call fails (the error is printed).
    """
    # Do not spend an API call on empty input.
    if not text or not text.strip():
        return None

    messages = [
        {
            "role": "system",
            "content": "당신은 한국어 문서를 간결하고 정확하게 요약하는 전문가입니다."
        },
        {
            "role": "user",
            "content": f"다음 문서를 핵심 내용 위주로 요약해주세요:\n\n{text}"
        }
    ]

    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: the module-level client honours ``openai.api_key``.
            response = openai.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.3,
                max_tokens=500
            )
            return response.choices[0].message.content

        # Legacy openai<1.0 interface.
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            temperature=0.3,
            max_tokens=500
        )
        return response.choices[0].message['content']
    except Exception as e:
        print(f"요약 중 오류: {e}")
        return None

전체 프로세스 통합 실행

모든 기능을 하나로 묶어서 실행하는 메인 함수를 만들어요. 진행 상황을 표시하는 프로그레스 바도 추가하면 더 좋아요. 프로그레스 바에는 tqdm을 사용하므로, 먼저 pip install tqdm으로 설치해 두세요.

from tqdm import tqdm

def main():
    """Run the whole pipeline: HWP files -> text files -> GPT summaries.

    Everything lives under ``~/Documents/hwp_project``: HWP files are read
    from ``hwp_files``, extracted text goes to ``txt_files``, and summaries
    are written to ``summaries`` as ``summary_<txt file name>``. Text files
    for which no summary could be produced are skipped.
    """
    project_root = Path.home() / "Documents" / "hwp_project"
    input_dir = project_root / "hwp_files"
    txt_dir = project_root / "txt_files"
    summary_dir = project_root / "summaries"

    # Output folders must exist before either stage writes to them.
    txt_dir.mkdir(parents=True, exist_ok=True)
    summary_dir.mkdir(parents=True, exist_ok=True)

    # Stage 1: HWP -> TXT
    print("HWP 파일을 텍스트로 변환 중...")
    process_directory(input_dir, txt_dir)

    # Stage 2: TXT -> summary
    print("\n텍스트 파일 요약 중...")
    for txt_file in tqdm(list(txt_dir.glob('*.txt')), desc="요약 진행"):
        content = txt_file.read_text(encoding='utf-8')

        if len(content) <= 3000:
            final_summary = summarize_with_gpt(content)
        else:
            # Long documents are summarized chunk by chunk; chunks whose
            # summary failed are dropped and the rest are joined.
            partials = (
                summarize_with_gpt(piece)
                for piece in split_text_into_chunks(content)
            )
            final_summary = "\n\n".join(part for part in partials if part)

        if not final_summary:
            continue

        summary_file = summary_dir / f"summary_{txt_file.name}"
        summary_file.write_text(final_summary, encoding='utf-8')

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

실행 시 주의사항과 팁

macOS에서 한글 파일을 다룰 때는 인코딩 문제가 자주 발생해요. UTF-8로 통일하는 것이 가장 안전해요. 파일을 읽거나 쓸 때 항상 encoding='utf-8'을 명시하세요.

# Example of handling encoding errors
def safe_read_file(file_path):
    """Read a text file, trying several encodings in turn.

    Tries UTF-8, then CP949, then EUC-KR, and returns the text from the first
    one that decodes without error. If all of them fail, the raw bytes are
    decoded as UTF-8 with undecodable bytes dropped.
    """
    for codec in ('utf-8', 'cp949', 'euc-kr'):
        try:
            with open(file_path, mode='r', encoding=codec) as handle:
                decoded = handle.read()
        except UnicodeDecodeError:
            # Wrong guess: move on to the next candidate encoding.
            continue
        else:
            return decoded

    # Last resort: read the bytes and decode leniently.
    with open(file_path, 'rb') as raw:
        data = raw.read()
    return data.decode('utf-8', errors='ignore')

병렬 처리로 속도 향상시키기

파일이 많을 때는 병렬 처리를 사용하면 작업 시간을 크게 단축할 수 있어요.

from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing

def process_single_file(hwp_file, output_dir):
    """Extract one HWP file to ``<stem>.txt`` and report the outcome.

    Args:
        hwp_file: ``Path`` of the HWP file to convert.
        output_dir: ``Path`` of the directory that receives the text file.

    Returns:
        A status string that is always set: ``"성공: <name>"`` when the text
        file was written, or ``"실패: <name> - <reason>"`` when extraction
        produced no text or any step raised.
    """
    try:
        text = extract_hwp_text(str(hwp_file))
        if not text:
            # Previously this path fell off the end and returned None, which
            # the caller then printed as "None".
            return f"실패: {hwp_file.name} - 추출된 텍스트가 없습니다"

        output_file = output_dir / f"{hwp_file.stem}.txt"
        save_text_file(text, str(output_file))
        return f"성공: {hwp_file.name}"
    except Exception as e:
        return f"실패: {hwp_file.name} - {str(e)}"

def parallel_process_files(input_dir, output_dir):
    """Convert all ``.hwp`` files under ``input_dir`` using a thread pool.

    The input directory is searched recursively, ``output_dir`` is created if
    needed, and one ``process_single_file`` task is submitted per file. The
    pool uses as many workers as ``multiprocessing.cpu_count()`` reports, and
    each task's result is printed as it completes.
    """
    source_root = Path(input_dir)
    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    pending_files = list(source_root.glob('**/*.hwp'))

    # One worker per reported CPU core.
    worker_count = multiprocessing.cpu_count()

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        jobs = {}
        for hwp_file in pending_files:
            job = pool.submit(process_single_file, hwp_file, target_root)
            jobs[job] = hwp_file

        for finished in as_completed(jobs):
            print(finished.result())

API 호출 횟수를 줄이려면 로컬 요약 모델을 사용하는 방법도 있어요. Hugging Face의 한국어 요약 모델을 활용하면 API 비용 없이 요약할 수 있어요. 이 방법을 쓰려면 먼저 pip install transformers torch로 필요한 패키지를 설치하세요.

from transformers import pipeline

# Load the Korean summarization model (built once, when this line runs)
summarizer = pipeline("summarization", model="gogamza/kobart-summarization")

def local_summarize(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Text longer than 1024 characters is cut into consecutive 1024-character
    slices; each slice is summarized separately and the summaries are joined
    with single spaces. Shorter text is summarized in one call.

    NOTE(review): the slices are measured in characters; whether that matches
    the model's actual input limit is not established here - confirm.
    """
    max_length = 1024

    if len(text) > max_length:
        pieces = [
            text[start:start + max_length]
            for start in range(0, len(text), max_length)
        ]
    else:
        # Short input goes through as a single piece.
        pieces = [text]

    return ' '.join(
        summarizer(piece, max_length=150, min_length=50)[0]['summary_text']
        for piece in pieces
    )

2025.07.24 - [생산성] - 노션에서 매주 월요일 자동으로 주간 목표 페이지 만드는 방법

노션에서 매주 월요일 자동으로 주간 목표 페이지 만드는 방법

매주 월요일마다 새로운 주간 목표 페이지를 수동으로 만드는 것은 번거로운 일이에요. Python과 Notion API를 활용하면 이 과정을 완전히 자동화할 수 있어요. 템플릿 페이지를 복사하고 날짜를 자

qwanjk.tistory.com

저작자표시 비영리 변경금지 (새창열림)

'IT' 카테고리의 다른 글

Alfred 워크플로로 메모 자동화하면 생산성이 얼마나 올라갈까요? (2)	2025.07.24
매일 반복하는 앱 실행이 귀찮아서 Raycast 단축키로 자동화해봤어요 (1)	2025.07.24
노션에서 매주 월요일 자동으로 주간 목표 페이지 만드는 방법 (3)	2025.07.24
파이썬 schedule로 점심시간 Mac 알림과 슬랙 메시지 자동 전송하는 방법 (2)	2025.07.24
Claude MCP 서버 설치하고 파일시스템 연동하는 완벽 가이드 (9)	2025.07.23

손끝소식

macOS에서 한글 파일 텍스트 추출하고 자동 요약하는 방법

HWP 파일을 텍스트로 변환하는 코드

디렉토리 내 모든 HWP 파일 일괄 처리

GPT API를 사용한 텍스트 요약

전체 프로세스 통합 실행

실행 시 주의사항과 팁

병렬 처리로 속도 향상시키기

'IT' 카테고리의 다른 글

티스토리툴바

macOS에서 한글 파일 텍스트 추출하고 자동 요약하는 방법

HWP 파일을 텍스트로 변환하는 코드

디렉토리 내 모든 HWP 파일 일괄 처리

GPT API를 사용한 텍스트 요약

전체 프로세스 통합 실행

실행 시 주의사항과 팁

병렬 처리로 속도 향상시키기

'IT' 카테고리의 다른 글

관련글

티스토리툴바