python_tweets.json (dataset downloads for Learning Data Mining with Python)


I have recently been reading Learning Data Mining with Python. The book itself is good, but one drawback is that several of the test datasets it uses are very hard to find online.

The following resources are ones I collected myself and uploaded to CSDN; download them there if you need them (a short sketch for loading python_tweets.json follows the list):

leagues_NBA_2014_games_games.csv

u.data

python_tweets.json

python_friends.json
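
For reference, the book reads python_tweets.json one JSON object per line. A minimal loading sketch, assuming that line-delimited format (the path below is only a placeholder; point it at wherever you saved the file):

import json
import os

# Placeholder path -- adjust to wherever python_tweets.json was saved
input_filename = os.path.join(os.path.expanduser("~"), "Data", "twitter", "python_tweets.json")

tweets = []
with open(input_filename) as inf:
    for line in inf:
        # Each non-blank line is assumed to contain one JSON-encoded tweet
        if line.strip():
            tweets.append(json.loads(line))

print("Loaded {n} tweets".format(n=len(tweets)))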

Chapter 9: fetching the data for the authorship attribution problem

Because the gutenberg.org pages have been restructured, the data-fetching code that ships with the book no longer works. I adapted the code below to the current page structure; replace the entire contents of the second cell of Chapter 9 Authorship Analysis.ipynb with it.

import os
import copy
from time import sleep
import urllib.request

# Project Gutenberg book IDs for each author
titles = {}


titles['burton'] = [4657, 2400, 5760, 6036, 7111, 8821,
                    18506, 4658, 5761, 6886, 7113]
titles['dickens'] = [24022, 1392, 1414, 1467, 2324, 580,
                     786, 888, 963, 27924, 1394, 1415, 15618,
                     25985, 588, 807, 914, 967, 30127, 1400,
                     1421, 16023, 28198, 644, 809, 917, 968, 1023,
                     1406, 1422, 17879, 30368, 675, 810, 924, 98,
                     1289, 1413, 1423, 17880, 32241, 699, 821, 927]
titles['doyle'] = [2349, 11656, 1644, 22357, 2347, 290, 34627, 5148,
                   8394, 26153, 12555, 1661, 23059, 2348, 294, 355,
                   5260, 8727, 10446, 126, 17398, 2343, 2350, 3070,
                   356, 5317, 903, 10581, 13152, 2038, 2344, 244, 32536,
                   423, 537, 108, 139, 2097, 2345, 24951, 32777, 4295,
                   7964, 11413, 1638, 21768, 2346, 2845, 3289, 439, 834]
titles['gaboriau'] = [1748, 1651, 2736, 3336, 4604, 4002, 2451,
                      305, 3802, 547]
titles['nesbit'] = [34219, 23661, 28804, 4378, 778, 20404, 28725,
                    33028, 4513, 794]
titles['tarkington'] = [1098, 15855, 1983, 297, 402, 5798,
                        8740, 980, 1158, 1611, 2326, 30092,
                        483, 5949, 8867, 13275, 18259, 2595,
                        3428, 5756, 6401, 9659]
titles['twain'] = [1044, 1213, 245, 30092, 3176, 3179, 3183, 3189, 74,
                   86, 1086, 142, 2572, 3173, 3177, 3180, 3186, 3192,
                   76, 91, 119, 1837, 2895, 3174, 3178, 3181, 3187, 3432,
                   8525]



# Sanity-check: seven authors, each with the expected number of titles
assert len(titles) == 7

assert len(titles['tarkington']) == 22
assert len(titles['dickens']) == 44
assert len(titles['nesbit']) == 10
assert len(titles['doyle']) == 51
assert len(titles['twain']) == 29
assert len(titles['burton']) == 11
assert len(titles['gaboriau']) == 10


url_base = "http://www.gutenberg.org/files"
# Gutenberg serves plain-text books under three file-name variants:
# {id}.txt, {id}-0.txt (UTF-8) and {id}-8.txt (8-bit encodings)
url_format = "{url_base}/{id}/{id}.txt"
url_format_0 = "{url_base}/{id}/{id}-0.txt"
url_format_8 = "{url_base}/{id}/{id}-8.txt"


# data_folder is assumed to be defined in the notebook's first cell
# (in the book it points at a "books" directory under the user's data folder)
# Make the parent folder if it does not exist
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
    
faileddownload = {}
faileddownload_final = {}

def getthebook(url, filename, bookid, deal_failed=False):
    # Download one book; on failure, record the (url, filename) pair for a retry
    print(" - Getting book with id {id}".format(id=bookid))
    print(" - " + url)

    if os.path.exists(filename):
        print(" - File already exists, skipping")
        return

    try:
        urllib.request.urlretrieve(url, filename)
        if deal_failed:
            # A retry succeeded, so drop the book from the outstanding-failure list
            del faileddownload_final[bookid]
        # Throttle to one download every 5 minutes to avoid the anti-scraping limit
        sleep(60 * 5)
    except Exception as err:
        print(" - Download failed: {err}".format(err=err))
        faileddownload[bookid] = (url, filename)
        
for author in titles:
    print("Downloading titles from {author}".format(author=author))
    # Make the author's folder if it does not exist
    author_folder = os.path.join(data_folder, author)
    if not os.path.exists(author_folder):
        os.makedirs(author_folder)
    # First pass: download each title using the plain {id}.txt URL
    for bookid in titles[author]:
        url = url_format.format(url_base=url_base, id=bookid)
        filename = os.path.join(author_folder, "{id}.txt".format(id=bookid))
        getthebook(url, filename, bookid)
        

# First retry: books that failed above are retried with the -0.txt (UTF-8) URL
faileddownload_final = copy.deepcopy(faileddownload)
for bookid in faileddownload:
    url = url_format_0.format(url_base=url_base, id=bookid)
    filename = faileddownload[bookid][1]
    getthebook(url, filename, bookid, True)

# Second retry: anything still failing is retried with the -8.txt URL
faileddownload = copy.deepcopy(faileddownload_final)
for bookid in faileddownload:
    url = url_format_8.format(url_base=url_base, id=bookid)
    filename = faileddownload[bookid][1]
    getthebook(url, filename, bookid, True)

# Whatever remains in faileddownload_final failed with all three URL variants
print("Download failed: ", faileddownload_final)

print("Download complete")

There are more than 170 files in total. To stay within the site's anti-scraping limits, the script downloads one file every 5 minutes, so the full run took me over ten hours. Because the files are fairly large I am not uploading them as attachments; if you need them, please download them with the code above. If that feels too slow, you can use an existing crawler framework and configure it to work around the anti-scraping restrictions.
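
If you would rather roll your own lightweight downloader instead of a full crawler framework, something along these lines with the requests library is one option. Note that the delay and User-Agent below are guesses and have not been tested against gutenberg.org's actual rate limits:

import os
import time
import requests

session = requests.Session()
# A browser-like User-Agent; whether this is needed or sufficient is untested
session.headers["User-Agent"] = "Mozilla/5.0 (personal study download)"

def fetch_book(bookid, author_folder, delay=30):
    filename = os.path.join(author_folder, "{id}.txt".format(id=bookid))
    if os.path.exists(filename):
        return True
    # Try the three Gutenberg file-name variants in order
    for suffix in ("", "-0", "-8"):
        url = "http://www.gutenberg.org/files/{id}/{id}{suffix}.txt".format(
            id=bookid, suffix=suffix)
        response = session.get(url)
        if response.status_code == 200:
            with open(filename, "wb") as outf:
                outf.write(response.content)
            time.sleep(delay)
            return True
        time.sleep(delay)
    return False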
