青空文庫からPythonで本文を取得したい

更新日：2018年4月8日

青空文庫からPythonで本文を取得したい

青空文庫からPythonで本文を取得するコードです。作業ステップは以下の通りです。

① ダウンロードしたいZIPファイルのURLを指定する。
② カレントディレクトリにZIPファイル名と同名のディレクトリを作成し、その中にZIPを解凍する。
③ 解凍先ディレクトリ直下のテキストファイルを読み込む。
④ テキストファイルからルビや注釈などを削除して本文のみを取得する。

青空文庫URL：https://www.aozora.gr.jp/

サンプルとして青空文庫の三国志をダウンロードしてみます。

※使用する際は、青空文庫の利用規約に従ってください。

コード

import re
import zipfile
import urllib.request
import os.path,glob

# Enter the URL of the ZIP archive to download.
URL = 'https://www.aozora.gr.jp/cards/001562/files/52410_ruby_51060.zip'

def main():
    """Download the archive at URL, extract its body text, and print it."""
    print(convert(download(URL)))

def convert(download_text):
    """Return the body text of an Aozora Bunko text file with markup removed.

    The file is decoded as Shift_JIS. When the text contains a header closed
    by two runs of five or more hyphens, everything up to and including the
    second run is dropped. Everything from the first "底本：" marker onward
    is dropped as well. Ruby readings (《...》), the "｜" that marks the start
    of a ruby base, and editor annotations (［＃...］) are then removed.

    Args:
        download_text: Path of the extracted text file.

    Returns:
        The body text as a str, stripped of leading/trailing whitespace.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid Shift_JIS.
    """
    # A context manager closes the handle even if read() or decode() raises.
    with open(download_text, 'rb') as f:
        text = f.read().decode('shift_jis')

    # Removal of ruby, annotations, etc.
    # maxsplit=2 keeps later hyphen runs inside the body intact; a file
    # without the two-separator header is used whole instead of raising
    # IndexError.
    parts = re.split(r'\-{5,}', text, maxsplit=2)
    if len(parts) == 3:
        text = parts[2]
    text = re.split(r'底本：', text)[0]
    # Drop the "｜" ruby-base marker together with its reading, keeping the
    # base text itself.
    text = re.sub(r'｜(.+?)《.+?》', r'\1', text)
    text = re.sub(r'《.+?》', '', text)
    text = re.sub(r'［＃.+?］', '', text)
    return text.strip()

def download(url):
    """Fetch a ZIP archive, extract it, and return the path of its text file.

    The archive is saved in the current working directory under the last
    path component of ``url`` (the download is skipped when a file of that
    name already exists), extracted into a directory named after the archive
    without its extension, and then deleted.

    Args:
        url: URL of the ZIP archive.

    Returns:
        Path of the first ``.txt`` file, in sorted order, found directly
        inside the extraction directory.

    Raises:
        FileNotFoundError: If no ``.txt`` file is found directly inside the
            extraction directory.
    """
    # Download the data file.
    zip_file = re.split(r'/', url)[-1]

    if not os.path.exists(zip_file):
        print('Download URL')
        print('URL:', url)
        urllib.request.urlretrieve(url, zip_file)
    else:
        print('Download File exists')

    # Create the extraction folder (names chosen to avoid shadowing the
    # builtins ``dir`` and ``list``).
    extract_dir, _ext = os.path.splitext(zip_file)
    os.makedirs(extract_dir, exist_ok=True)

    # Extract the zip file; the context manager closes it even on failure.
    with zipfile.ZipFile(zip_file, 'r') as archive:
        archive.extractall(extract_dir)

    # Delete the zip file.
    os.remove(zip_file)

    # Pick the text file; glob order is unspecified, so sort for a
    # deterministic result.
    text_files = sorted(glob.glob(os.path.join(extract_dir, '*.txt')))
    if not text_files:
        raise FileNotFoundError(
            'no .txt file found in extracted directory: ' + extract_dir)
    return text_files[0]

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

import re

import zipfile

import urllib.request

import os.path,glob

# Enter the URL of the ZIP archive to download.

URL = 'https://www.aozora.gr.jp/cards/001562/files/52410_ruby_51060.zip'

def main():
    """Download the archive at URL, extract its body text, and print it."""
    download_text = download(URL)
    text = convert(download_text)
    print(text)

def convert(download_text):
    """Return the body text of an Aozora Bunko text file with markup removed.

    The file is decoded as Shift_JIS. When the text contains a header closed
    by two runs of five or more hyphens, everything up to and including the
    second run is dropped. Everything from the first "底本：" marker onward
    is dropped as well. Ruby readings (《...》), the "｜" that marks the start
    of a ruby base, and editor annotations (［＃...］) are then removed.

    Args:
        download_text: Path of the extracted text file.

    Returns:
        The body text as a str, stripped of leading/trailing whitespace.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid Shift_JIS.
    """
    # A context manager closes the handle even if read() or decode() raises.
    with open(download_text, 'rb') as f:
        text = f.read().decode('shift_jis')

    # Removal of ruby, annotations, etc.
    # maxsplit=2 keeps later hyphen runs inside the body intact; a file
    # without the two-separator header is used whole instead of raising
    # IndexError.
    parts = re.split(r'\-{5,}', text, maxsplit=2)
    if len(parts) == 3:
        text = parts[2]
    text = re.split(r'底本：', text)[0]
    # Drop the "｜" ruby-base marker together with its reading, keeping the
    # base text itself.
    text = re.sub(r'｜(.+?)《.+?》', r'\1', text)
    text = re.sub(r'《.+?》', '', text)
    text = re.sub(r'［＃.+?］', '', text)
    return text.strip()

def download(url):
    """Fetch a ZIP archive, extract it, and return the path of its text file.

    The archive is saved in the current working directory under the last
    path component of ``url`` (the download is skipped when a file of that
    name already exists), extracted into a directory named after the archive
    without its extension, and then deleted.

    Args:
        url: URL of the ZIP archive.

    Returns:
        Path of the first ``.txt`` file, in sorted order, found directly
        inside the extraction directory.

    Raises:
        FileNotFoundError: If no ``.txt`` file is found directly inside the
            extraction directory.
    """
    # Download the data file.
    zip_file = re.split(r'/', url)[-1]

    if not os.path.exists(zip_file):
        print('Download URL')
        print('URL:', url)
        urllib.request.urlretrieve(url, zip_file)
    else:
        print('Download File exists')

    # Create the extraction folder (names chosen to avoid shadowing the
    # builtins ``dir`` and ``list``).
    extract_dir, _ext = os.path.splitext(zip_file)
    os.makedirs(extract_dir, exist_ok=True)

    # Extract the zip file; the context manager closes it even on failure.
    with zipfile.ZipFile(zip_file, 'r') as archive:
        archive.extractall(extract_dir)

    # Delete the zip file.
    os.remove(zip_file)

    # Pick the text file; glob order is unspecified, so sort for a
    # deterministic result.
    text_files = sorted(glob.glob(os.path.join(extract_dir, '*.txt')))
    if not text_files:
        raise FileNotFoundError(
            'no .txt file found in extracted directory: ' + extract_dir)
    return text_files[0]

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()