1.文本文件转 json 格式
读取 txt 文件中的 tweets 文本(每行一条 JSON 格式的推文),用 json.loads 逐行解析为 Python 字典并存入列表;解析后既可以带缩进打印输出,也可以按字段提取详细信息
代码:
import json
import os

# Folder containing the raw tweet dump files.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
# Work inside the data folder so the bare names returned by os.listdir resolve.
os.chdir(folderpath)

# Get the first txt file. os.listdir may also return other entries
# (sub-folders, non-text files), so filter by extension instead of
# blindly taking files[0].
txt_files = [name for name in files if name.lower().endswith(".txt")]
if not txt_files:
    raise FileNotFoundError("no .txt file found in " + folderpath)
tweets_data_path = txt_files[0]

# Parsed tweets (one Python object per successfully decoded line).
tweets_data = []

# Explicit UTF-8 so decoding does not depend on the Windows locale code page;
# the context manager guarantees the file is closed.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except json.JSONDecodeError:
            # Blank or malformed line: skip it, but let real errors
            # (e.g. KeyboardInterrupt) propagate.
            continue
        tweets_data.append(tweet)

# Print the first parsed tweet as indented JSON.
if tweets_data:
    print(json.dumps(tweets_data[0], indent=4))
else:
    print("No valid JSON tweets found in " + tweets_data_path)
输出:
{ "created_at": "Tue Jun 25 20:44:34 +0000 2019", "id": 1143621025550049280, "id_str": "1143621025550049280", "text": "Australia beat the Poms overnight \ud83d\ude01\ud83c\udfcf\ud83c\udde6\ud83c\uddfa\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f #AUSvENG #CmonAussie #CWC19", "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>", "truncated": false, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": { "id": 252426781, "id_str": "252426781", "name": "Willy Aitch", "screen_name": "WillyAitch", "location": "Melbourne, Victoria", "url": null, "description": "September 2017 to February 2018, was the greatest 5 months ever. Richmond \ud83d\udc2f\ud83d\udc2f\ud83d\udc2fwon the 2017 AFL Premiership! Philadelphia Eagles \ud83e\udd85\ud83e\udd85\ud83e\udd85 won Super Bowl LII", "translator_type": "none", "protected": false, "verified": false, "followers_count": 417, "friends_count": 1061, "listed_count": 15, "favourites_count": 18852, "statuses_count": 17796, "created_at": "Tue Feb 15 04:55:59 +0000 2011", "utc_offset": null, "time_zone": null, "geo_enabled": true, "lang": null, "contributors_enabled": false, "is_translator": false, "profile_background_color": "C0DEED", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_tile": false, "profile_link_color": "1DA1F2", "profile_sidebar_border_color": "C0DEED", "profile_sidebar_fill_color": "DDEEF6", "profile_text_color": "333333", "profile_use_background_image": true, "profile_image_url": "http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg", 
"profile_banner_url": "https://pbs.twimg.com/profile_banners/252426781/1522377977", "default_profile": true, "default_profile_image": false, "following": null, "follow_request_sent": null, "notifications": null }, "geo": null, "coordinates": null, "place": { "id": "01864a8a64df9dc4", "url": "https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json", "place_type": "city", "name": "Melbourne", "full_name": "Melbourne, Victoria", "country_code": "AU", "country": "Australia", "bounding_box": { "type": "Polygon", "coordinates": [ [ [ 144.593742, -38.433859 ], [ 144.593742, -37.511274 ], [ 145.512529, -37.511274 ], [ 145.512529, -38.433859 ] ] ] }, "attributes": {} }, "contributors": null, "is_quote_status": false, "quote_count": 0, "reply_count": 0, "retweet_count": 0, "favorite_count": 0, "entities": { "hashtags": [ { "text": "AUSvENG", "indices": [ 46, 54 ] }, { "text": "CmonAussie", "indices": [ 55, 66 ] }, { "text": "CWC19", "indices": [ 67, 73 ] } ], "urls": [], "user_mentions": [], "symbols": [] }, "favorited": false, "retweeted": false, "filter_level": "low", "lang": "en", "timestamp_ms": "1561495474599" }