1. Data Collection
Data is collected through the Twitter API: all tweets posted within the US are gathered and stored in JSON format in txt files.
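The collection script itself is not shown in this post. For context, here is a minimal sketch (the file name and field values are purely hypothetical) of the storage format that the parsing step below assumes, i.e. one JSON object per line in a txt file:

import json

# hypothetical example: append one collected tweet as a single JSON line
tweet = {"id_str": "1050118621198921728",
         "created_at": "Wed Oct 10 20:19:24 +0000 2018",
         "coordinates": None, "geo": None,
         "lang": "en", "text": "just an example tweet"}
with open("tweets_2018-10-10.txt", "a", encoding="utf-8") as f:
    f.write(json.dumps(tweet) + "\n")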
2. Data Reading
Each tweet is parsed as JSON from the txt files, and the extracted fields are written to a csv file. When reading the csv back, the encoding is set to gbk.
The code is as follows:
from math import radians, sin
import json, os, codecs


# area of a longitude/latitude bounding box (spherical approximation, km^2)
def area(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    r = 6372  # Earth radius in km
    return abs(r**2 * (lon2 - lon1) * (sin(lat2) - sin(lat1)))


# convert tweets stored in txt files to one csv file
def txt2csv(foldername, filename):
    files = os.listdir(foldername)
    os.chdir(foldername)
    fo = open(filename, "w")
    # fo.write("\ufeff")
    fo.write("id,created_at,coordinates,co_lon,co_lat,geo,geo_lat,geo_lon,"
             + "user_location,place_type,place_name,"
             + "place_full_name,place_country,place_bounding_box,pb_avg_lon,pb_avg_lat,"
             + "min_lon,min_lat,max_lon,max_lat,bb_area,lang,source,text")
    count = 0
    for file in files:
        # skip sub-directories, only process files
        if os.path.isdir(file):
            continue
        count += 1
        print(count, ":", file)
        tweets_file = open(file, "r")
        for line in tweets_file:
            try:
                tweet = json.loads(line)
                csv_text = "\n"
                # id
                csv_text += tweet["id_str"]
                csv_text += ","
                # created_at
                csv_text += str(tweet["created_at"])
                csv_text += ","
                # coordinates
                if (tweet["coordinates"]):
                    csv_text += "Yes,"
                    csv_text += str(tweet["coordinates"]["coordinates"][0])
                    csv_text += ","
                    csv_text += str(tweet["coordinates"]["coordinates"][1])
                else:
                    csv_text += "None,None,None"
                csv_text += ","
                # geo
                if (tweet["geo"]):
                    csv_text += "Yes,"
                    csv_text += str(tweet["geo"]["coordinates"][0])
                    csv_text += ","
                    csv_text += str(tweet["geo"]["coordinates"][1])
                else:
                    csv_text += "None,None,None"
                csv_text += ","
                # user->location
                ul = str(tweet["user"]["location"])
                ul = ul.replace("\n", " ")
                ul = ul.replace("\"", "")
                ul = ul.replace("\'", "")
                csv_text += "\"" + ul + "\""
                csv_text += ","
                # place->type
                csv_text += str(tweet["place"]["place_type"])
                csv_text += ","
                # place->name
                csv_text += "\"" + str(tweet["place"]["name"]) + "\""
                csv_text += ","
                # place->full_name
                csv_text += "\"" + str(tweet["place"]["full_name"]) + "\""
                csv_text += ","
                # place->country
                csv_text += "\"" + str(tweet["place"]["country"]) + "\""
                csv_text += ","
                # place->bounding_box
                if (tweet["place"]["bounding_box"]["coordinates"]):
                    # south-west corner
                    min_lon = tweet["place"]["bounding_box"]["coordinates"][0][0][0]
                    min_lat = tweet["place"]["bounding_box"]["coordinates"][0][0][1]
                    # north-east corner
                    max_lon = tweet["place"]["bounding_box"]["coordinates"][0][2][0]
                    max_lat = tweet["place"]["bounding_box"]["coordinates"][0][2][1]
                    # centre of the bounding box
                    lon = (min_lon + max_lon) / 2
                    lat = (min_lat + max_lat) / 2
                    # area of bounding box
                    area_bb = area(min_lon, min_lat, max_lon, max_lat)
                    csv_text += "Yes,"
                    csv_text += str(lon)
                    csv_text += ","
                    csv_text += str(lat)
                    csv_text += ","
                    csv_text += str(min_lon)
                    csv_text += ","
                    csv_text += str(min_lat)
                    csv_text += ","
                    csv_text += str(max_lon)
                    csv_text += ","
                    csv_text += str(max_lat)
                    csv_text += ","
                    csv_text += str(area_bb)
                else:
                    # keep the column count consistent: 8 bounding-box columns
                    csv_text += "None,None,None,None,None,None,None,None"
                csv_text += ","
                # lang
                csv_text += str(tweet["lang"])
                csv_text += ","
                # source
                csv_text += "\"" + str(tweet["source"]) + "\""
                csv_text += ","
                # text
                # replace carriage return, double quotation marks, single quotation marks with space or nothing
                text = str(tweet["text"])
                text = text.replace("\r", " ")
                text = text.replace("\n", " ")
                text = text.replace("\"", "")
                text = text.replace("\'", "")
                csv_text += "\"" + text + "\""
                fo.write(csv_text)
            except:
                # skip malformed lines
                continue
        tweets_file.close()
    fo.close()


txt2csv(r"E:\USA\test", r"D:\OneDrive - UNSW\01-UNSW\02-Papers_Plan\02-CCIS\04-US_Tweets\tt.csv")

import pandas as pd
df = pd.read_csv(r"D:\OneDrive - UNSW\01-UNSW\02-Papers_Plan\02-CCIS\04-US_Tweets\tt.csv", encoding='gbk')
df.head()
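For reference, the area() helper above uses the spherical-zone formula A = R^2 * |lon2 - lon1| * |sin(lat2) - sin(lat1)| (angles in radians, R ≈ 6372 km). A purely illustrative sanity check:

# a 1-degree by 1-degree box at the equator should come out to roughly 1.2e4 km^2
print(area(0, 0, 1, 1))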
The resulting data looks like this:
There are 24 columns in total, holding time- and location-related information such as the creation time, longitude/latitude, and the tweet text.
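To double-check this layout after loading, the columns and their inferred types can be inspected (a small sketch, assuming df has been loaded as above):

print(len(df.columns))  # expected: 24
print(df.dtypes)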
3. Data Processing
3.1 Getting the total number of tweets
This is simple to implement: just count how many rows (records) the DataFrame has.
The code is as follows:
import pandas as pd

df = pd.read_csv(r"D:\OneDrive - UNSW\01-UNSW\02-Papers_Plan\02-CCIS\04-US_Tweets\tt.csv", encoding='gbk')

# size of the data set (rows, columns)
df.shape
The result is something like (715, 24), which means there are 715 records (and 24 columns).
3.2 Getting the number of unique tweets
Since the same tweet may be fetched more than once during collection, duplicate records need to be removed.
The code is as follows:
# delete duplicate tweets
df = df.drop_duplicates(['id'])

# number of records after removing duplicates
df.shape
The output takes the same form as above.
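If the number of removed duplicates is also of interest, the row counts before and after deduplication can be compared; a small sketch:

before = df.shape[0]
df = df.drop_duplicates(['id'])
after = df.shape[0]
print("duplicate tweets removed:", before - after)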
3.3 Changing the data types of certain columns
By default most columns are of type object. To do calculations on them, they need to be converted, e.g. the time column to datetime and the longitude/latitude columns to float.
The code is as follows:
# change data type to datetime
# co_lon and co_lat are NONE sometimes
df = df.astype({"created_at": "datetime64[ns]"})
After the conversion, components such as the year and day can be extracted from the time column, as shown below.
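For example, once created_at is a datetime column, its components are available through the .dt accessor; and because co_lon / co_lat may contain the string "None", pd.to_numeric with errors='coerce' is one way to cast them to float (a sketch, not part of the original script):

# extract date components from the datetime column
df['year'] = df['created_at'].dt.year
df['day'] = df['created_at'].dt.day

# convert longitude/latitude to float; "None" strings become NaN
df['co_lon'] = pd.to_numeric(df['co_lon'], errors='coerce')
df['co_lat'] = pd.to_numeric(df['co_lat'], errors='coerce')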
3.4 Getting the source of tweets
This mainly checks whether a tweet was posted from the web, an iPhone, Android, Instagram, and so on.
The code is as follows:
# get the number of tweets from each source
print("\nsource counts:\n\n", df.source.value_counts())
This prints the number of tweets from each source, sorted from largest to smallest.
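Note that in the raw Twitter payload the source field is usually an HTML anchor tag (the client name wrapped in <a ...>...</a>), so the counts above are per raw tag. If only the client name is wanted, the markup can be stripped first, for example:

# strip the surrounding <a ...>...</a> markup, keeping only the client name
df['source_name'] = df['source'].str.replace(r'<[^>]+>', '', regex=True)
print(df.source_name.value_counts())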
3.5 Getting the number of geo-tagged tweets
Get the number of tweets that carry geographic information.
The code is as follows:
# get the total number of tweets with geo-tags
print("\nGeotagged tweets counts:\n\n", df.coordinates.value_counts())
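Because txt2csv writes "Yes" or "None" into the coordinates column, the share of geo-tagged tweets can also be computed directly, for example:

geo_share = (df['coordinates'] == 'Yes').mean()
print("share of geo-tagged tweets: {:.2%}".format(geo_share))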
3.6 Getting the number of English tweets within the US
The code is as follows:
# get tweets located in the United States
df = df[df['place_country'] == 'United States']

# get English tweets
df = df[df['lang'] == 'en']

df.shape
print("\n US English tweets count: ", df.shape[0])
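If this filtered subset is needed in later steps, it can be written back out to its own csv (the output file name here is just a placeholder):

df.to_csv("us_english_tweets.csv", index=False, encoding='utf-8')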