#添加必要的库
import pandas as pd
import numpy as np
import requests as r
import os
import tweepy as tp
import json
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Load the tweet archive from the local CSV file.
dfTwitter_Archive = pd.read_csv('twitter-archive-enhanced.csv')
# Download the image-prediction file from the web.
file_path = r'https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv'
response = r.get(file_path)
# Fail loudly on an HTTP error instead of saving the error page as the .tsv
# and parsing it as data below.
response.raise_for_status()
# The local file name is the last URL segment ('image-predictions.tsv').
with open(file_path.split('/')[-1], mode='wb') as file:
    file.write(response.content)
dfImage_Predictions = pd.read_csv('image-predictions.tsv', sep='\t')
#从推特下载其他数据 - 因无法访问推特所以直接下载数据
#consumer_key = 'YOUR CONSUMER KEY'
#consumer_secret = 'YOUR CONSUMER SECRET'
#access_token = 'YOUR ACCESS TOKEN'
#access_secret = 'YOUR ACCESS SECRET'
#auth = tp.OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_secret)
#api = tp.API(auth)
#打印其他用户主页上的时间轴里的内容
#public_tweets = api.user_timeline('WeRateDogs')
#for tweet in public_tweets:
# print(tweet.text)
# Read the tweet JSON file directly (one JSON object per line).
json_file_path = r'tweet_json.txt'
dfOrigin = pd.read_json(json_file_path, lines=True)
# Sanity check kept for reference: number of rows where id and id_str differ.
#dfOrigin[dfOrigin['id']!=dfOrigin['id_str']].shape[0]
# Keep only the id and the two engagement counters, and expose the id under
# the name 'tweet_id'.
dfJson_Tweeter = (
    dfOrigin[['id', 'retweet_count', 'favorite_count']]
    .rename(columns={'id': 'tweet_id'})
)
# Visual assessment: display the tweet archive table.
dfTwitter_Archive
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | 13 | 10 | Darla | None | None | None | None |
4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | 12 | 10 | Franklin | None | None | None | None |
5 | 891087950875897856 | NaN | NaN | 2017-07-29 00:08:17 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a majestic great white breaching ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891087950... | 13 | 10 | None | None | None | None | None |
6 | 890971913173991426 | NaN | NaN | 2017-07-28 16:27:12 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Jax. He enjoys ice cream so much he gets ... | NaN | NaN | NaN | https://gofundme.com/ydvmve-surgery-for-jax,ht... | 13 | 10 | Jax | None | None | None | None |
7 | 890729181411237888 | NaN | NaN | 2017-07-28 00:22:40 +0000 | <a href="http://twitter.com/download/iphone" r... | When you watch your owner call another dog a g... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/890729181... | 13 | 10 | None | None | None | None | None |
8 | 890609185150312448 | NaN | NaN | 2017-07-27 16:25:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Zoey. She doesn't want to be one of th... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/890609185... | 13 | 10 | Zoey | None | None | None | None |
9 | 890240255349198849 | NaN | NaN | 2017-07-26 15:59:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Cassie. She is a college pup. Studying... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/890240255... | 14 | 10 | Cassie | doggo | None | None | None |
10 | 890006608113172480 | NaN | NaN | 2017-07-26 00:31:25 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Koda. He is a South Australian decksha... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/890006608... | 13 | 10 | Koda | None | None | None | None |
11 | 889880896479866881 | NaN | NaN | 2017-07-25 16:11:53 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Bruno. He is a service shark. Only get... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/889880896... | 13 | 10 | Bruno | None | None | None | None |
12 | 889665388333682689 | NaN | NaN | 2017-07-25 01:55:32 +0000 | <a href="http://twitter.com/download/iphone" r... | Here's a puppo that seems to be on the fence a... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/889665388... | 13 | 10 | None | None | None | None | puppo |
13 | 889638837579907072 | NaN | NaN | 2017-07-25 00:10:02 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Ted. He does his best. Sometimes that'... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/889638837... | 12 | 10 | Ted | None | None | None | None |
14 | 889531135344209921 | NaN | NaN | 2017-07-24 17:02:04 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Stuart. He's sporting his favorite fan... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/889531135... | 13 | 10 | Stuart | None | None | None | puppo |
15 | 889278841981685760 | NaN | NaN | 2017-07-24 00:19:32 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Oliver. You're witnessing one of his m... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/889278841... | 13 | 10 | Oliver | None | None | None | None |
16 | 888917238123831296 | NaN | NaN | 2017-07-23 00:22:39 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Jim. He found a fren. Taught him how t... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/888917238... | 12 | 10 | Jim | None | None | None | None |
17 | 888804989199671297 | NaN | NaN | 2017-07-22 16:56:37 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Zeke. He has a new stick. Very proud o... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/888804989... | 13 | 10 | Zeke | None | None | None | None |
18 | 888554962724278272 | NaN | NaN | 2017-07-22 00:23:06 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Ralphus. He's powering up. Attempting ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/888554962... | 13 | 10 | Ralphus | None | None | None | None |
19 | 888202515573088257 | NaN | NaN | 2017-07-21 01:02:36 +0000 | <a href="http://twitter.com/download/iphone" r... | RT @dog_rates: This is Canela. She attempted s... | 8.874740e+17 | 4.196984e+09 | 2017-07-19 00:47:34 +0000 | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
20 | 888078434458587136 | NaN | NaN | 2017-07-20 16:49:33 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Gerald. He was just told he didn't get... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/888078434... | 12 | 10 | Gerald | None | None | None | None |
21 | 887705289381826560 | NaN | NaN | 2017-07-19 16:06:48 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Jeffrey. He has a monopoly on the pool... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887705289... | 13 | 10 | Jeffrey | None | None | None | None |
22 | 887517139158093824 | NaN | NaN | 2017-07-19 03:39:09 +0000 | <a href="http://twitter.com/download/iphone" r... | I've yet to rate a Venezuelan Hover Wiener. Th... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887517139... | 14 | 10 | such | None | None | None | None |
23 | 887473957103951883 | NaN | NaN | 2017-07-19 00:47:34 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Canela. She attempted some fancy porch... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887473957... | 13 | 10 | Canela | None | None | None | None |
24 | 887343217045368832 | NaN | NaN | 2017-07-18 16:08:03 +0000 | <a href="http://twitter.com/download/iphone" r... | You may not have known you needed to see this ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887343217... | 13 | 10 | None | None | None | None | None |
25 | 887101392804085760 | NaN | NaN | 2017-07-18 00:07:08 +0000 | <a href="http://twitter.com/download/iphone" r... | This... is a Jubilant Antarctic House Bear. We... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/887101392... | 12 | 10 | None | None | None | None | None |
26 | 886983233522544640 | NaN | NaN | 2017-07-17 16:17:36 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Maya. She's very shy. Rarely leaves he... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/886983233... | 13 | 10 | Maya | None | None | None | None |
27 | 886736880519319552 | NaN | NaN | 2017-07-16 23:58:41 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Mingus. He's a wonderful father to his... | NaN | NaN | NaN | https://www.gofundme.com/mingusneedsus,https:/... | 13 | 10 | Mingus | None | None | None | None |
28 | 886680336477933568 | NaN | NaN | 2017-07-16 20:14:00 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Derek. He's late for a dog meeting. 13... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/886680336... | 13 | 10 | Derek | None | None | None | None |
29 | 886366144734445568 | NaN | NaN | 2017-07-15 23:25:31 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Roscoe. Another pupper fallen victim t... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/886366144... | 12 | 10 | Roscoe | None | None | pupper | None |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2326 | 666411507551481857 | NaN | NaN | 2015-11-17 00:24:19 +0000 | <a href="http://twitter.com/download/iphone" r... | This is quite the dog. Gets really excited whe... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666411507... | 2 | 10 | quite | None | None | None | None |
2327 | 666407126856765440 | NaN | NaN | 2015-11-17 00:06:54 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a southern Vesuvius bumblegruff. Can d... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666407126... | 7 | 10 | a | None | None | None | None |
2328 | 666396247373291520 | NaN | NaN | 2015-11-16 23:23:41 +0000 | <a href="http://twitter.com/download/iphone" r... | Oh goodness. A super rare northeast Qdoba kang... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666396247... | 9 | 10 | None | None | None | None | None |
2329 | 666373753744588802 | NaN | NaN | 2015-11-16 21:54:18 +0000 | <a href="http://twitter.com/download/iphone" r... | Those are sunglasses and a jean jacket. 11/10 ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666373753... | 11 | 10 | None | None | None | None | None |
2330 | 666362758909284353 | NaN | NaN | 2015-11-16 21:10:36 +0000 | <a href="http://twitter.com/download/iphone" r... | Unique dog here. Very small. Lives in containe... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666362758... | 6 | 10 | None | None | None | None | None |
2331 | 666353288456101888 | NaN | NaN | 2015-11-16 20:32:58 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a mixed Asiago from the Galápagos... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666353288... | 8 | 10 | None | None | None | None | None |
2332 | 666345417576210432 | NaN | NaN | 2015-11-16 20:01:42 +0000 | <a href="http://twitter.com/download/iphone" r... | Look at this jokester thinking seat belt laws ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666345417... | 10 | 10 | None | None | None | None | None |
2333 | 666337882303524864 | NaN | NaN | 2015-11-16 19:31:45 +0000 | <a href="http://twitter.com/download/iphone" r... | This is an extremely rare horned Parthenon. No... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666337882... | 9 | 10 | an | None | None | None | None |
2334 | 666293911632134144 | NaN | NaN | 2015-11-16 16:37:02 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a funny dog. Weird toes. Won't come do... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666293911... | 3 | 10 | a | None | None | None | None |
2335 | 666287406224695296 | NaN | NaN | 2015-11-16 16:11:11 +0000 | <a href="http://twitter.com/download/iphone" r... | This is an Albanian 3 1/2 legged Episcopalian... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666287406... | 1 | 2 | an | None | None | None | None |
2336 | 666273097616637952 | NaN | NaN | 2015-11-16 15:14:19 +0000 | <a href="http://twitter.com/download/iphone" r... | Can take selfies 11/10 https://t.co/ws2AMaNwPW | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666273097... | 11 | 10 | None | None | None | None | None |
2337 | 666268910803644416 | NaN | NaN | 2015-11-16 14:57:41 +0000 | <a href="http://twitter.com/download/iphone" r... | Very concerned about fellow dog trapped in com... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666268910... | 10 | 10 | None | None | None | None | None |
2338 | 666104133288665088 | NaN | NaN | 2015-11-16 04:02:55 +0000 | <a href="http://twitter.com/download/iphone" r... | Not familiar with this breed. No tail (weird).... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666104133... | 1 | 10 | None | None | None | None | None |
2339 | 666102155909144576 | NaN | NaN | 2015-11-16 03:55:04 +0000 | <a href="http://twitter.com/download/iphone" r... | Oh my. Here you are seeing an Adobe Setter giv... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666102155... | 11 | 10 | None | None | None | None | None |
2340 | 666099513787052032 | NaN | NaN | 2015-11-16 03:44:34 +0000 | <a href="http://twitter.com/download/iphone" r... | Can stand on stump for what seems like a while... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666099513... | 8 | 10 | None | None | None | None | None |
2341 | 666094000022159362 | NaN | NaN | 2015-11-16 03:22:39 +0000 | <a href="http://twitter.com/download/iphone" r... | This appears to be a Mongolian Presbyterian mi... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666094000... | 9 | 10 | None | None | None | None | None |
2342 | 666082916733198337 | NaN | NaN | 2015-11-16 02:38:37 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a well-established sunblockerspan... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666082916... | 6 | 10 | None | None | None | None | None |
2343 | 666073100786774016 | NaN | NaN | 2015-11-16 01:59:36 +0000 | <a href="http://twitter.com/download/iphone" r... | Let's hope this flight isn't Malaysian (lol). ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666073100... | 10 | 10 | None | None | None | None | None |
2344 | 666071193221509120 | NaN | NaN | 2015-11-16 01:52:02 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a northern speckled Rhododendron.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666071193... | 9 | 10 | None | None | None | None | None |
2345 | 666063827256086533 | NaN | NaN | 2015-11-16 01:22:45 +0000 | <a href="http://twitter.com/download/iphone" r... | This is the happiest dog you will ever see. Ve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666063827... | 10 | 10 | the | None | None | None | None |
2346 | 666058600524156928 | NaN | NaN | 2015-11-16 01:01:59 +0000 | <a href="http://twitter.com/download/iphone" r... | Here is the Rand Paul of retrievers folks! He'... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666058600... | 8 | 10 | the | None | None | None | None |
2347 | 666057090499244032 | NaN | NaN | 2015-11-16 00:55:59 +0000 | <a href="http://twitter.com/download/iphone" r... | My oh my. This is a rare blond Canadian terrie... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666057090... | 9 | 10 | a | None | None | None | None |
2348 | 666055525042405380 | NaN | NaN | 2015-11-16 00:49:46 +0000 | <a href="http://twitter.com/download/iphone" r... | Here is a Siberian heavily armored polar bear ... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666055525... | 10 | 10 | a | None | None | None | None |
2349 | 666051853826850816 | NaN | NaN | 2015-11-16 00:35:11 +0000 | <a href="http://twitter.com/download/iphone" r... | This is an odd dog. Hard on the outside but lo... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666051853... | 2 | 10 | an | None | None | None | None |
2350 | 666050758794694657 | NaN | NaN | 2015-11-16 00:30:50 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a truly beautiful English Wilson Staff... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666050758... | 10 | 10 | a | None | None | None | None |
2351 | 666049248165822465 | NaN | NaN | 2015-11-16 00:24:50 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a 1949 1st generation vulpix. Enj... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666049248... | 5 | 10 | None | None | None | None | None |
2352 | 666044226329800704 | NaN | NaN | 2015-11-16 00:04:52 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a purebred Piers Morgan. Loves to Netf... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666044226... | 6 | 10 | a | None | None | None | None |
2353 | 666033412701032449 | NaN | NaN | 2015-11-15 23:21:54 +0000 | <a href="http://twitter.com/download/iphone" r... | Here is a very happy pup. Big fan of well-main... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666033412... | 9 | 10 | a | None | None | None | None |
2354 | 666029285002620928 | NaN | NaN | 2015-11-15 23:05:30 +0000 | <a href="http://twitter.com/download/iphone" r... | This is a western brown Mitsubishi terrier. Up... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666029285... | 7 | 10 | a | None | None | None | None |
2355 | 666020888022790149 | NaN | NaN | 2015-11-15 22:32:08 +0000 | <a href="http://twitter.com/download/iphone" r... | Here we have a Japanese Irish Setter. Lost eye... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/666020888... | 8 | 10 | None | None | None | None | None |
2356 rows × 17 columns
# Programmatic assessment: column dtypes and non-null counts of the archive.
dfTwitter_Archive.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 17 columns): tweet_id 2356 non-null int64 in_reply_to_status_id 78 non-null float64 in_reply_to_user_id 78 non-null float64 timestamp 2356 non-null object source 2356 non-null object text 2356 non-null object retweeted_status_id 181 non-null float64 retweeted_status_user_id 181 non-null float64 retweeted_status_timestamp 181 non-null object expanded_urls 2297 non-null object rating_numerator 2356 non-null int64 rating_denominator 2356 non-null int64 name 2356 non-null object doggo 2356 non-null object floofer 2356 non-null object pupper 2356 non-null object puppo 2356 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 313.0+ KB
# Observation: rating_denominator is not always 10; other values occur too.
dfTwitter_Archive['rating_denominator'].value_counts()
10 2333 11 3 50 3 80 2 20 2 2 1 16 1 40 1 70 1 15 1 90 1 110 1 120 1 130 1 150 1 170 1 7 1 0 1 Name: rating_denominator, dtype: int64
# Observation: the name column contains words such as 'a', 'an' and 'the',
# which are not real dog names. Show the 20 most frequent values.
dfTwitter_Archive['name'].value_counts().head(20)
None 745 a 55 Charlie 12 Oliver 11 Lucy 11 Cooper 11 Lola 10 Penny 10 Tucker 10 Winston 9 Bo 9 Sadie 8 the 8 an 7 Toby 7 Bailey 7 Daisy 7 Buddy 7 Dave 6 Scout 6 Name: name, dtype: int64
# Dog stages: many tweets have no stage at all, and a few carry two at once.
# For each row, count how many of the four stage columns hold the 'None'
# placeholder, then tally those counts (4 = no stage, 3 = one stage,
# 2 = two stages).
# The stage columns are selected by name rather than by position so the check
# cannot silently drift onto other columns if the frame's layout changes.
(
    dfTwitter_Archive[['doggo', 'floofer', 'pupper', 'puppo']]
    .eq('None')
    .sum(axis=1)
    .value_counts()
)
4 1976 3 366 2 14 dtype: int64
# Many dog names are missing: count the rows whose name is the 'None' placeholder.
dfTwitter_Archive['name'].eq('None').sum()
745
# Count tweet_id values that repeat an earlier row (duplicates).
dfTwitter_Archive['tweet_id'].duplicated().sum()
0
# Visual assessment: display the image-prediction table.
dfImage_Predictions
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
5 | 666050758794694657 | https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg | 1 | Bernese_mountain_dog | 0.651137 | True | English_springer | 0.263788 | True | Greater_Swiss_Mountain_dog | 0.016199 | True |
6 | 666051853826850816 | https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg | 1 | box_turtle | 0.933012 | False | mud_turtle | 0.045885 | False | terrapin | 0.017885 | False |
7 | 666055525042405380 | https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg | 1 | chow | 0.692517 | True | Tibetan_mastiff | 0.058279 | True | fur_coat | 0.054449 | False |
8 | 666057090499244032 | https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg | 1 | shopping_cart | 0.962465 | False | shopping_basket | 0.014594 | False | golden_retriever | 0.007959 | True |
9 | 666058600524156928 | https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg | 1 | miniature_poodle | 0.201493 | True | komondor | 0.192305 | True | soft-coated_wheaten_terrier | 0.082086 | True |
10 | 666063827256086533 | https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg | 1 | golden_retriever | 0.775930 | True | Tibetan_mastiff | 0.093718 | True | Labrador_retriever | 0.072427 | True |
11 | 666071193221509120 | https://pbs.twimg.com/media/CT5cN_3WEAAlOoZ.jpg | 1 | Gordon_setter | 0.503672 | True | Yorkshire_terrier | 0.174201 | True | Pekinese | 0.109454 | True |
12 | 666073100786774016 | https://pbs.twimg.com/media/CT5d9DZXAAALcwe.jpg | 1 | Walker_hound | 0.260857 | True | English_foxhound | 0.175382 | True | Ibizan_hound | 0.097471 | True |
13 | 666082916733198337 | https://pbs.twimg.com/media/CT5m4VGWEAAtKc8.jpg | 1 | pug | 0.489814 | True | bull_mastiff | 0.404722 | True | French_bulldog | 0.048960 | True |
14 | 666094000022159362 | https://pbs.twimg.com/media/CT5w9gUW4AAsBNN.jpg | 1 | bloodhound | 0.195217 | True | German_shepherd | 0.078260 | True | malinois | 0.075628 | True |
15 | 666099513787052032 | https://pbs.twimg.com/media/CT51-JJUEAA6hV8.jpg | 1 | Lhasa | 0.582330 | True | Shih-Tzu | 0.166192 | True | Dandie_Dinmont | 0.089688 | True |
16 | 666102155909144576 | https://pbs.twimg.com/media/CT54YGiWUAEZnoK.jpg | 1 | English_setter | 0.298617 | True | Newfoundland | 0.149842 | True | borzoi | 0.133649 | True |
17 | 666104133288665088 | https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg | 1 | hen | 0.965932 | False | cock | 0.033919 | False | partridge | 0.000052 | False |
18 | 666268910803644416 | https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg | 1 | desktop_computer | 0.086502 | False | desk | 0.085547 | False | bookcase | 0.079480 | False |
19 | 666273097616637952 | https://pbs.twimg.com/media/CT8T1mtUwAA3aqm.jpg | 1 | Italian_greyhound | 0.176053 | True | toy_terrier | 0.111884 | True | basenji | 0.111152 | True |
20 | 666287406224695296 | https://pbs.twimg.com/media/CT8g3BpUEAAuFjg.jpg | 1 | Maltese_dog | 0.857531 | True | toy_poodle | 0.063064 | True | miniature_poodle | 0.025581 | True |
21 | 666293911632134144 | https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg | 1 | three-toed_sloth | 0.914671 | False | otter | 0.015250 | False | great_grey_owl | 0.013207 | False |
22 | 666337882303524864 | https://pbs.twimg.com/media/CT9OwFIWEAMuRje.jpg | 1 | ox | 0.416669 | False | Newfoundland | 0.278407 | True | groenendael | 0.102643 | True |
23 | 666345417576210432 | https://pbs.twimg.com/media/CT9Vn7PWoAA_ZCM.jpg | 1 | golden_retriever | 0.858744 | True | Chesapeake_Bay_retriever | 0.054787 | True | Labrador_retriever | 0.014241 | True |
24 | 666353288456101888 | https://pbs.twimg.com/media/CT9cx0tUEAAhNN_.jpg | 1 | malamute | 0.336874 | True | Siberian_husky | 0.147655 | True | Eskimo_dog | 0.093412 | True |
25 | 666362758909284353 | https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg | 1 | guinea_pig | 0.996496 | False | skunk | 0.002402 | False | hamster | 0.000461 | False |
26 | 666373753744588802 | https://pbs.twimg.com/media/CT9vZEYWUAAlZ05.jpg | 1 | soft-coated_wheaten_terrier | 0.326467 | True | Afghan_hound | 0.259551 | True | briard | 0.206803 | True |
27 | 666396247373291520 | https://pbs.twimg.com/media/CT-D2ZHWIAA3gK1.jpg | 1 | Chihuahua | 0.978108 | True | toy_terrier | 0.009397 | True | papillon | 0.004577 | True |
28 | 666407126856765440 | https://pbs.twimg.com/media/CT-NvwmW4AAugGZ.jpg | 1 | black-and-tan_coonhound | 0.529139 | True | bloodhound | 0.244220 | True | flat-coated_retriever | 0.173810 | True |
29 | 666411507551481857 | https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg | 1 | coho | 0.404640 | False | barracouta | 0.271485 | False | gar | 0.189945 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2045 | 886366144734445568 | https://pbs.twimg.com/media/DE0BTnQUwAApKEH.jpg | 1 | French_bulldog | 0.999201 | True | Chihuahua | 0.000361 | True | Boston_bull | 0.000076 | True |
2046 | 886680336477933568 | https://pbs.twimg.com/media/DE4fEDzWAAAyHMM.jpg | 1 | convertible | 0.738995 | False | sports_car | 0.139952 | False | car_wheel | 0.044173 | False |
2047 | 886736880519319552 | https://pbs.twimg.com/media/DE5Se8FXcAAJFx4.jpg | 1 | kuvasz | 0.309706 | True | Great_Pyrenees | 0.186136 | True | Dandie_Dinmont | 0.086346 | True |
2048 | 886983233522544640 | https://pbs.twimg.com/media/DE8yicJW0AAAvBJ.jpg | 2 | Chihuahua | 0.793469 | True | toy_terrier | 0.143528 | True | can_opener | 0.032253 | False |
2049 | 887101392804085760 | https://pbs.twimg.com/media/DE-eAq6UwAA-jaE.jpg | 1 | Samoyed | 0.733942 | True | Eskimo_dog | 0.035029 | True | Staffordshire_bullterrier | 0.029705 | True |
2050 | 887343217045368832 | https://pbs.twimg.com/ext_tw_video_thumb/88734... | 1 | Mexican_hairless | 0.330741 | True | sea_lion | 0.275645 | False | Weimaraner | 0.134203 | True |
2051 | 887473957103951883 | https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg | 2 | Pembroke | 0.809197 | True | Rhodesian_ridgeback | 0.054950 | True | beagle | 0.038915 | True |
2052 | 887517139158093824 | https://pbs.twimg.com/ext_tw_video_thumb/88751... | 1 | limousine | 0.130432 | False | tow_truck | 0.029175 | False | shopping_cart | 0.026321 | False |
2053 | 887705289381826560 | https://pbs.twimg.com/media/DFHDQBbXgAEqY7t.jpg | 1 | basset | 0.821664 | True | redbone | 0.087582 | True | Weimaraner | 0.026236 | True |
2054 | 888078434458587136 | https://pbs.twimg.com/media/DFMWn56WsAAkA7B.jpg | 1 | French_bulldog | 0.995026 | True | pug | 0.000932 | True | bull_mastiff | 0.000903 | True |
2055 | 888202515573088257 | https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg | 2 | Pembroke | 0.809197 | True | Rhodesian_ridgeback | 0.054950 | True | beagle | 0.038915 | True |
2056 | 888554962724278272 | https://pbs.twimg.com/media/DFTH_O-UQAACu20.jpg | 3 | Siberian_husky | 0.700377 | True | Eskimo_dog | 0.166511 | True | malamute | 0.111411 | True |
2057 | 888804989199671297 | https://pbs.twimg.com/media/DFWra-3VYAA2piG.jpg | 1 | golden_retriever | 0.469760 | True | Labrador_retriever | 0.184172 | True | English_setter | 0.073482 | True |
2058 | 888917238123831296 | https://pbs.twimg.com/media/DFYRgsOUQAARGhO.jpg | 1 | golden_retriever | 0.714719 | True | Tibetan_mastiff | 0.120184 | True | Labrador_retriever | 0.105506 | True |
2059 | 889278841981685760 | https://pbs.twimg.com/ext_tw_video_thumb/88927... | 1 | whippet | 0.626152 | True | borzoi | 0.194742 | True | Saluki | 0.027351 | True |
2060 | 889531135344209921 | https://pbs.twimg.com/media/DFg_2PVW0AEHN3p.jpg | 1 | golden_retriever | 0.953442 | True | Labrador_retriever | 0.013834 | True | redbone | 0.007958 | True |
2061 | 889638837579907072 | https://pbs.twimg.com/media/DFihzFfXsAYGDPR.jpg | 1 | French_bulldog | 0.991650 | True | boxer | 0.002129 | True | Staffordshire_bullterrier | 0.001498 | True |
2062 | 889665388333682689 | https://pbs.twimg.com/media/DFi579UWsAAatzw.jpg | 1 | Pembroke | 0.966327 | True | Cardigan | 0.027356 | True | basenji | 0.004633 | True |
2063 | 889880896479866881 | https://pbs.twimg.com/media/DFl99B1WsAITKsg.jpg | 1 | French_bulldog | 0.377417 | True | Labrador_retriever | 0.151317 | True | muzzle | 0.082981 | False |
2064 | 890006608113172480 | https://pbs.twimg.com/media/DFnwSY4WAAAMliS.jpg | 1 | Samoyed | 0.957979 | True | Pomeranian | 0.013884 | True | chow | 0.008167 | True |
2065 | 890240255349198849 | https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg | 1 | Pembroke | 0.511319 | True | Cardigan | 0.451038 | True | Chihuahua | 0.029248 | True |
2066 | 890609185150312448 | https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg | 1 | Irish_terrier | 0.487574 | True | Irish_setter | 0.193054 | True | Chesapeake_Bay_retriever | 0.118184 | True |
2067 | 890729181411237888 | https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg | 2 | Pomeranian | 0.566142 | True | Eskimo_dog | 0.178406 | True | Pembroke | 0.076507 | True |
2068 | 890971913173991426 | https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg | 1 | Appenzeller | 0.341703 | True | Border_collie | 0.199287 | True | ice_lolly | 0.193548 | False |
2069 | 891087950875897856 | https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg | 1 | Chesapeake_Bay_retriever | 0.425595 | True | Irish_terrier | 0.116317 | True | Indian_elephant | 0.076902 | False |
2070 | 891327558926688256 | https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg | 2 | basset | 0.555712 | True | English_springer | 0.225770 | True | German_short-haired_pointer | 0.175219 | True |
2071 | 891689557279858688 | https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg | 1 | paper_towel | 0.170278 | False | Labrador_retriever | 0.168086 | True | spatula | 0.040836 | False |
2072 | 891815181378084864 | https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg | 1 | Chihuahua | 0.716012 | True | malamute | 0.078253 | True | kelpie | 0.031379 | True |
2073 | 892177421306343426 | https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg | 1 | Chihuahua | 0.323581 | True | Pekinese | 0.090647 | True | papillon | 0.068957 | True |
2074 | 892420643555336193 | https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg | 1 | orange | 0.097049 | False | bagel | 0.085851 | False | banana | 0.076110 | False |
2075 rows × 12 columns
# Programmatic assessment: column dtypes and non-null counts of the predictions.
dfImage_Predictions.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): tweet_id 2075 non-null int64 jpg_url 2075 non-null object img_num 2075 non-null int64 p1 2075 non-null object p1_conf 2075 non-null float64 p1_dog 2075 non-null bool p2 2075 non-null object p2_conf 2075 non-null float64 p2_dog 2075 non-null bool p3 2075 non-null object p3_conf 2075 non-null float64 p3_dog 2075 non-null bool dtypes: bool(3), float64(3), int64(2), object(4) memory usage: 152.1+ KB
# Count rows whose jpg_url repeats an earlier row, i.e. the same image listed more than once.
dfImage_Predictions['jpg_url'].duplicated().sum()
66
# Display the tweet_id / retweet_count / favorite_count table read from tweet_json.txt.
dfJson_Tweeter
tweet_id | retweet_count | favorite_count | |
---|---|---|---|
0 | 892420643555336193 | 8842 | 39492 |
1 | 892177421306343426 | 6480 | 33786 |
2 | 891815181378084864 | 4301 | 25445 |
3 | 891689557279858688 | 8925 | 42863 |
4 | 891327558926688256 | 9721 | 41016 |
5 | 891087950875897856 | 3240 | 20548 |
6 | 890971913173991426 | 2142 | 12053 |
7 | 890729181411237888 | 19548 | 66596 |
8 | 890609185150312448 | 4403 | 28187 |
9 | 890240255349198849 | 7684 | 32467 |
10 | 890006608113172480 | 7584 | 31127 |
11 | 889880896479866881 | 5116 | 28208 |
12 | 889665388333682689 | 8502 | 38745 |
13 | 889638837579907072 | 4705 | 27633 |
14 | 889531135344209921 | 2309 | 15329 |
15 | 889278841981685760 | 5635 | 25712 |
16 | 888917238123831296 | 4681 | 29555 |
17 | 888804989199671297 | 4535 | 26021 |
18 | 888554962724278272 | 3722 | 20267 |
19 | 888078434458587136 | 3637 | 22144 |
20 | 887705289381826560 | 5584 | 30690 |
21 | 887517139158093824 | 12053 | 46940 |
22 | 887473957103951883 | 18813 | 70007 |
23 | 887343217045368832 | 10713 | 34223 |
24 | 887101392804085760 | 6147 | 31045 |
25 | 886983233522544640 | 8045 | 35786 |
26 | 886736880519319552 | 3420 | 12286 |
27 | 886680336477933568 | 4597 | 22802 |
28 | 886366144734445568 | 3297 | 21488 |
29 | 886267009285017600 | 4 | 117 |
... | ... | ... | ... |
2322 | 666411507551481857 | 337 | 457 |
2323 | 666407126856765440 | 43 | 113 |
2324 | 666396247373291520 | 91 | 171 |
2325 | 666373753744588802 | 99 | 194 |
2326 | 666362758909284353 | 590 | 801 |
2327 | 666353288456101888 | 76 | 228 |
2328 | 666345417576210432 | 146 | 308 |
2329 | 666337882303524864 | 96 | 203 |
2330 | 666293911632134144 | 365 | 519 |
2331 | 666287406224695296 | 71 | 152 |
2332 | 666273097616637952 | 81 | 183 |
2333 | 666268910803644416 | 37 | 108 |
2334 | 666104133288665088 | 6835 | 14703 |
2335 | 666102155909144576 | 15 | 81 |
2336 | 666099513787052032 | 73 | 160 |
2337 | 666094000022159362 | 78 | 168 |
2338 | 666082916733198337 | 47 | 121 |
2339 | 666073100786774016 | 173 | 334 |
2340 | 666071193221509120 | 67 | 154 |
2341 | 666063827256086533 | 230 | 494 |
2342 | 666058600524156928 | 61 | 117 |
2343 | 666057090499244032 | 146 | 304 |
2344 | 666055525042405380 | 261 | 449 |
2345 | 666051853826850816 | 877 | 1250 |
2346 | 666050758794694657 | 60 | 136 |
2347 | 666049248165822465 | 41 | 111 |
2348 | 666044226329800704 | 147 | 309 |
2349 | 666033412701032449 | 47 | 128 |
2350 | 666029285002620928 | 48 | 132 |
2351 | 666020888022790149 | 530 | 2528 |
2352 rows × 3 columns
# Print column dtypes, non-null counts and memory usage of the retweet/favorite table.
dfJson_Tweeter.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2352 entries, 0 to 2351 Data columns (total 3 columns): tweet_id 2352 non-null int64 retweet_count 2352 non-null int64 favorite_count 2352 non-null int64 dtypes: int64(3) memory usage: 73.5 KB
# Count rows whose tweet_id repeats an earlier row; 0 means every tweet appears once.
dfJson_Tweeter['tweet_id'].duplicated().sum()
0
# Preparation: take independent working copies so cleaning never mutates the raw frames.
(
    dfTwitter_Archive_Clean,
    dfImage_Predictions_Clean,
    dfJson_Tweeter_Clean,
) = (
    raw_frame.copy()
    for raw_frame in (dfTwitter_Archive, dfImage_Predictions, dfJson_Tweeter)
)
对转发、回复的数据执行删除操作
# Keep only original tweets: a row is dropped when it carries a retweeted
# status id (retweet) or an in-reply-to user id (reply).
is_retweet = dfTwitter_Archive_Clean['retweeted_status_id'].notnull()
is_reply = dfTwitter_Archive_Clean['in_reply_to_user_id'].notnull()
dfTwitter_Archive_Clean = dfTwitter_Archive_Clean[~(is_retweet | is_reply)]
# Test: the retweet/reply columns should now report 0 non-null values.
dfTwitter_Archive_Clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2097 entries, 0 to 2355 Data columns (total 17 columns): tweet_id 2097 non-null int64 in_reply_to_status_id 0 non-null float64 in_reply_to_user_id 0 non-null float64 timestamp 2097 non-null object source 2097 non-null object text 2097 non-null object retweeted_status_id 0 non-null float64 retweeted_status_user_id 0 non-null float64 retweeted_status_timestamp 0 non-null object expanded_urls 2094 non-null object rating_numerator 2097 non-null int64 rating_denominator 2097 non-null int64 name 2097 non-null object doggo 2097 non-null object floofer 2097 non-null object pupper 2097 non-null object puppo 2097 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 294.9+ KB
# Remove the reply/retweet bookkeeping columns that are no longer needed.
obsolete_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
]
dfTwitter_Archive_Clean = dfTwitter_Archive_Clean.drop(obsolete_columns, axis=1)
dfTwitter_Archive_Clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2097 entries, 0 to 2355 Data columns (total 12 columns): tweet_id 2097 non-null int64 timestamp 2097 non-null object source 2097 non-null object text 2097 non-null object expanded_urls 2094 non-null object rating_numerator 2097 non-null int64 rating_denominator 2097 non-null int64 name 2097 non-null object doggo 2097 non-null object floofer 2097 non-null object pupper 2097 non-null object puppo 2097 non-null object dtypes: int64(3), object(9) memory usage: 213.0+ KB
将expanded_urls列缺失的数据删除
# Discard tweets that have no expanded_urls value.
dfTwitter_Archive_Clean = dfTwitter_Archive_Clean.dropna(subset=['expanded_urls'])
# Test: expanded_urls should have as many non-null values as there are rows.
dfTwitter_Archive_Clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2094 entries, 0 to 2355 Data columns (total 12 columns): tweet_id 2094 non-null int64 timestamp 2094 non-null object source 2094 non-null object text 2094 non-null object expanded_urls 2094 non-null object rating_numerator 2094 non-null int64 rating_denominator 2094 non-null int64 name 2094 non-null object doggo 2094 non-null object floofer 2094 non-null object pupper 2094 non-null object puppo 2094 non-null object dtypes: int64(3), object(9) memory usage: 212.7+ KB
# Inspect the original text of one random tweet whose last four columns
# (the dog-stage columns) all hold the placeholder string 'None'.
no_stage_rows = (dfTwitter_Archive.iloc[:, -4:] == 'None').all(axis=1)
dfTwitter_Archive[no_stage_rows].sample(1)['text']
1649 Meet Sebastian. He's a womanizer. Romantic af.... Name: text, dtype: object
# Collapse the four one-per-stage columns into a single 'stage' column.
# Each source column holds either its own stage name or the placeholder 'None'.
# Tweets that mention several stages are joined with a comma (e.g. 'doggo,pupper')
# so the individual stages stay recoverable instead of being fused into one token.
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
stage = dfTwitter_Archive_Clean[stage_columns].apply(
    lambda row: ','.join(value for value in row if value != 'None'),
    axis=1,
)
# Tweets without any stage end up as '' above; store them as missing values.
dfTwitter_Archive_Clean['stage'] = stage.replace('', np.nan)
# Drop the four original columns now that 'stage' carries the information.
dfTwitter_Archive_Clean = dfTwitter_Archive_Clean.drop(stage_columns, axis=1)
dfTwitter_Archive_Clean[dfTwitter_Archive_Clean['stage'].notnull()]
tweet_id | timestamp | source | text | expanded_urls | rating_numerator | rating_denominator | name | stage | |
---|---|---|---|---|---|---|---|---|---|
9 | 890240255349198849 | 2017-07-26 15:59:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Cassie. She is a college pup. Studying... | https://twitter.com/dog_rates/status/890240255... | 14 | 10 | Cassie | doggo |
12 | 889665388333682689 | 2017-07-25 01:55:32 +0000 | <a href="http://twitter.com/download/iphone" r... | Here's a puppo that seems to be on the fence a... | https://twitter.com/dog_rates/status/889665388... | 13 | 10 | None | puppo |
14 | 889531135344209921 | 2017-07-24 17:02:04 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Stuart. He's sporting his favorite fan... | https://twitter.com/dog_rates/status/889531135... | 13 | 10 | Stuart | puppo |
29 | 886366144734445568 | 2017-07-15 23:25:31 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Roscoe. Another pupper fallen victim t... | https://twitter.com/dog_rates/status/886366144... | 12 | 10 | Roscoe | pupper |
43 | 884162670584377345 | 2017-07-09 21:29:42 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Yogi. He doesn't have any important dog m... | https://twitter.com/dog_rates/status/884162670... | 12 | 10 | Yogi | doggo |
46 | 883360690899218434 | 2017-07-07 16:22:55 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Grizzwald. He may be the floofiest floofe... | https://twitter.com/dog_rates/status/883360690... | 13 | 10 | Grizzwald | floofer |
49 | 882762694511734784 | 2017-07-06 00:46:41 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Gus. He's quite the cheeky pupper. Alr... | https://twitter.com/dog_rates/status/882762694... | 12 | 10 | Gus | pupper |
56 | 881536004380872706 | 2017-07-02 15:32:16 +0000 | <a href="http://twitter.com/download/iphone" r... | Here is a pupper approaching maximum borkdrive... | https://twitter.com/dog_rates/status/881536004... | 14 | 10 | a | pupper |
71 | 878776093423087618 | 2017-06-25 00:45:22 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Snoopy. He's a proud #PrideMonthPuppo.... | https://twitter.com/dog_rates/status/878776093... | 13 | 10 | Snoopy | puppo |
82 | 876838120628539392 | 2017-06-19 16:24:33 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Ginger. She's having a ruff Monday. To... | https://twitter.com/dog_rates/status/876838120... | 12 | 10 | Ginger | pupper |
92 | 874296783580663808 | 2017-06-12 16:06:11 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Jed. He may be the fanciest pupper in ... | https://twitter.com/dog_rates/status/874296783... | 13 | 10 | Jed | pupper |
94 | 874012996292530176 | 2017-06-11 21:18:31 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Sebastian. He can't see all the colors... | https://twitter.com/dog_rates/status/874012996... | 13 | 10 | Sebastian | puppo |
98 | 873213775632977920 | 2017-06-09 16:22:42 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Sierra. She's one precious pupper. Abs... | https://www.gofundme.com/help-my-baby-sierra-g... | 12 | 10 | Sierra | pupper |
99 | 872967104147763200 | 2017-06-09 00:02:31 +0000 | <a href="http://twitter.com/download/iphone" r... | Here's a very large dog. He has a date later. ... | https://twitter.com/dog_rates/status/872967104... | 12 | 10 | None | doggo |
107 | 871762521631449091 | 2017-06-05 16:15:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Rover. As part of pupper protocol he h... | https://twitter.com/dog_rates/status/871762521... | 12 | 10 | Rover | pupper |
108 | 871515927908634625 | 2017-06-04 23:56:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Napolean. He's a Raggedy East Nicaragu... | https://twitter.com/dog_rates/status/871515927... | 12 | 10 | Napolean | doggo |
110 | 871102520638267392 | 2017-06-03 20:33:19 +0000 | <a href="http://twitter.com/download/iphone" r... | Never doubt a doggo 14/10 https://t.co/AbBLh2FZCH | https://twitter.com/animalcog/status/871075758... | 14 | 10 | None | doggo |
121 | 869596645499047938 | 2017-05-30 16:49:31 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Scout. He just graduated. Officially a... | https://twitter.com/dog_rates/status/869596645... | 12 | 10 | Scout | doggo |
129 | 867421006826221569 | 2017-05-24 16:44:18 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Shikha. She just watched you drop a sk... | https://twitter.com/dog_rates/status/867421006... | 12 | 10 | Shikha | puppo |
135 | 866450705531457537 | 2017-05-22 00:28:40 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Jamesy. He gives a kiss to every other... | https://twitter.com/dog_rates/status/866450705... | 13 | 10 | Jamesy | pupper |
168 | 859607811541651456 | 2017-05-03 03:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | Sorry for the lack of posts today. I came home... | https://twitter.com/dog_rates/status/859607811... | 13 | 10 | None | puppo |
172 | 858843525470990336 | 2017-05-01 00:40:27 +0000 | <a href="http://twitter.com/download/iphone" r... | I have stumbled puppon a doggo painting party.... | https://twitter.com/dog_rates/status/858843525... | 13 | 10 | None | doggo |
191 | 855851453814013952 | 2017-04-22 18:31:02 +0000 | <a href="http://twitter.com/download/iphone" r... | Here's a puppo participating in the #ScienceMa... | https://twitter.com/dog_rates/status/855851453... | 13 | 10 | None | doggopuppo |
199 | 854120357044912130 | 2017-04-17 23:52:16 +0000 | <a href="http://twitter.com/download/iphone" r... | Sometimes you guys remind me just how impactfu... | https://twitter.com/dog_rates/status/854120357... | 14 | 10 | None | pupper |
200 | 854010172552949760 | 2017-04-17 16:34:26 +0000 | <a href="http://twitter.com/download/iphone" r... | At first I thought this was a shy doggo, but i... | https://twitter.com/dog_rates/status/854010172... | 11 | 10 | None | doggofloofer |
220 | 850019790995546112 | 2017-04-06 16:18:05 +0000 | <a href="http://twitter.com/download/iphone" r... | Say hello to Boomer. He's a sandy pupper. Havi... | https://twitter.com/dog_rates/status/850019790... | 12 | 10 | Boomer | pupper |
240 | 846514051647705089 | 2017-03-28 00:07:32 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Barney. He's an elder doggo. Hitches a... | https://twitter.com/dog_rates/status/846514051... | 13 | 10 | Barney | doggo |
248 | 845397057150107648 | 2017-03-24 22:08:59 +0000 | <a href="http://twitter.com/download/iphone" r... | Say hello to Mimosa. She's an emotional suppor... | https://www.gofundme.com/help-save-a-pup,https... | 13 | 10 | Mimosa | doggo |
249 | 845306882940190720 | 2017-03-24 16:10:40 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Pickles. She's a silly pupper. Thinks ... | https://twitter.com/dog_rates/status/845306882... | 12 | 10 | Pickles | pupper |
293 | 837820167694528512 | 2017-03-04 00:21:08 +0000 | <a href="http://twitter.com/download/iphone" r... | Here's a pupper before and after being asked "... | https://twitter.com/dog_rates/status/837820167... | 12 | 10 | None | pupper |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1875 | 675113801096802304 | 2015-12-11 00:44:07 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Zuzu. He just graduated college. Astute p... | https://twitter.com/dog_rates/status/675113801... | 10 | 10 | Zuzu | pupper |
1880 | 675006312288268288 | 2015-12-10 17:37:00 +0000 | <a href="http://twitter.com/download/iphone" r... | Say hello to Mollie. This pic was taken after ... | https://twitter.com/dog_rates/status/675006312... | 10 | 10 | Mollie | pupper |
1889 | 674774481756377088 | 2015-12-10 02:15:47 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Superpup. His head isn't proportional ... | https://twitter.com/dog_rates/status/674774481... | 11 | 10 | Superpup | pupper |
1897 | 674737130913071104 | 2015-12-09 23:47:22 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Rufio. He is unaware of the pink legless ... | https://twitter.com/dog_rates/status/674737130... | 10 | 10 | Rufio | pupper |
1903 | 674638615994089473 | 2015-12-09 17:15:54 +0000 | <a href="http://twitter.com/download/iphone" r... | This pupper is fed up with being tickled. 12/1... | https://twitter.com/dog_rates/status/674638615... | 12 | 10 | None | pupper |
1907 | 674447403907457024 | 2015-12-09 04:36:06 +0000 | <a href="http://twitter.com/download/iphone" r... | This pupper just wants a belly rub. This puppe... | https://twitter.com/dog_rates/status/674447403... | 10 | 10 | None | pupper |
1915 | 674318007229923329 | 2015-12-08 20:01:55 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Lennon. He's in quite the predicament.... | https://twitter.com/dog_rates/status/674318007... | 8 | 10 | Lennon | pupper |
1921 | 674262580978937856 | 2015-12-08 16:21:41 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Gus. He's super stoked about being an ... | https://twitter.com/dog_rates/status/674262580... | 9 | 10 | Gus | pupper |
1930 | 674038233588723717 | 2015-12-08 01:30:12 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Kaiya. She's an aspiring shoe model. 1... | https://twitter.com/dog_rates/status/674038233... | 12 | 10 | Kaiya | pupper |
1936 | 673956914389192708 | 2015-12-07 20:07:04 +0000 | <a href="http://twitter.com/download/iphone" r... | This is one esteemed pupper. Just graduated co... | https://twitter.com/dog_rates/status/673956914... | 10 | 10 | one | pupper |
1937 | 673919437611909120 | 2015-12-07 17:38:09 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Obie. He is on guard watching for evil... | https://twitter.com/dog_rates/status/673919437... | 11 | 10 | Obie | pupper |
1945 | 673707060090052608 | 2015-12-07 03:34:14 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Raymond. He's absolutely terrified of ... | https://twitter.com/dog_rates/status/673707060... | 10 | 10 | Raymond | pupper |
1948 | 673697980713705472 | 2015-12-07 02:58:09 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Pickles. She's a tiny pointy pupper. A... | https://twitter.com/dog_rates/status/673697980... | 8 | 10 | Pickles | pupper |
1954 | 673656262056419329 | 2015-12-07 00:12:23 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Albert AKA King Banana Peel. He's a ki... | https://twitter.com/dog_rates/status/673656262... | 10 | 10 | Albert | pupper |
1956 | 673612854080196609 | 2015-12-06 21:19:54 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Jeffri. He's a speckled ice pupper. Ve... | https://twitter.com/dog_rates/status/673612854... | 7 | 10 | Jeffri | pupper |
1960 | 673363615379013632 | 2015-12-06 04:49:31 +0000 | <a href="http://twitter.com/download/iphone" r... | This little pupper can't wait for Christmas. H... | https://twitter.com/dog_rates/status/673363615... | 11 | 10 | None | pupper |
1967 | 673342308415348736 | 2015-12-06 03:24:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Django. He's a skilled assassin pupper... | https://twitter.com/dog_rates/status/673342308... | 10 | 10 | Django | pupper |
1970 | 673295268553605120 | 2015-12-06 00:17:55 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Eve. She's a raging alcoholic 8/10 (would... | https://twitter.com/dog_rates/status/673295268... | 8 | 10 | Eve | pupper |
1974 | 673148804208660480 | 2015-12-05 14:35:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Fletcher. He's had a ruff night. No mo... | https://twitter.com/dog_rates/status/673148804... | 8 | 10 | Fletcher | pupper |
1977 | 672988786805112832 | 2015-12-05 04:00:04 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Schnozz. He's had a blurred tail since... | https://twitter.com/dog_rates/status/672988786... | 10 | 10 | Schnozz | pupper |
1980 | 672975131468300288 | 2015-12-05 03:05:49 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Chuckles. He is one skeptical pupper. ... | https://twitter.com/dog_rates/status/672975131... | 10 | 10 | Chuckles | pupper |
1981 | 672970152493887488 | 2015-12-05 02:46:02 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Chet. He's having a hard time. Really ... | https://twitter.com/dog_rates/status/672970152... | 7 | 10 | Chet | pupper |
1985 | 672898206762672129 | 2015-12-04 22:00:08 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Cheryl AKA Queen Pupper of the Skies. ... | https://twitter.com/dog_rates/status/672898206... | 11 | 10 | Cheryl | pupper |
1991 | 672622327801233409 | 2015-12-04 03:43:54 +0000 | <a href="http://twitter.com/download/iphone" r... | This lil pupper is sad because we haven't foun... | https://twitter.com/dog_rates/status/672622327... | 12 | 10 | None | pupper |
1992 | 672614745925664768 | 2015-12-04 03:13:46 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Norman. Doesn't bark much. Very docile... | https://twitter.com/dog_rates/status/672614745... | 6 | 10 | Norman | pupper |
1995 | 672594978741354496 | 2015-12-04 01:55:13 +0000 | <a href="http://twitter.com/download/iphone" r... | Meet Scott. Just trying to catch his train to ... | https://twitter.com/dog_rates/status/672594978... | 9 | 10 | Scott | pupper |
2002 | 672481316919734272 | 2015-12-03 18:23:34 +0000 | <a href="http://twitter.com/download/iphone" r... | Say hello to Jazz. She should be on the cover ... | https://twitter.com/dog_rates/status/672481316... | 12 | 10 | Jazz | pupper |
2009 | 672254177670729728 | 2015-12-03 03:21:00 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Rolf. He's having the time of his life... | https://twitter.com/dog_rates/status/672254177... | 11 | 10 | Rolf | pupper |
2015 | 672205392827572224 | 2015-12-03 00:07:09 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Opal. He's a Royal John Coctostan. Rea... | https://twitter.com/dog_rates/status/672205392... | 9 | 10 | Opal | pupper |
2017 | 672160042234327040 | 2015-12-02 21:06:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Bubba. He's a Titted Peebles Aorta. Ev... | https://twitter.com/dog_rates/status/672160042... | 8 | 10 | Bubba | pupper |
335 rows × 9 columns
# Test: tally how many tweets carry each stage value.
dfTwitter_Archive_Clean['stage'].value_counts()
pupper 220 doggo 72 puppo 23 floofer 9 doggopupper 9 doggofloofer 1 doggopuppo 1 Name: stage, dtype: int64
# Re-extract the dog's name from the tweet text: the capitalised word that
# directly follows one of the introductory phrases below.
# The phrases must be an alternation group "(?:a|b|c)"; writing them inside
# square brackets makes a character class that matches any single listed
# character, which picks up arbitrary capitalised words as names.
name_pattern = (
    r"(?:This is|Here is|Here's|named|Meet|Say hello to|Here we have)"
    r"\s([A-Z][a-z]+)"
)
# expand=False returns a Series, which is what a single-column assignment needs.
dfTwitter_Archive_Clean['name'] = dfTwitter_Archive_Clean['text'].str.extract(
    name_pattern, expand=False
)
# Test: list the extracted names by frequency.
dfTwitter_Archive_Clean['name'].value_counts()
Charlie 11 Lucy 11 Oliver 11 Cooper 10 Tucker 9 Penny 9 Lola 8 Winston 8 Christmas 8 Sadie 8 Toby 8 Bo 7 Daisy 7 Jax 6 Oscar 6 Bailey 6 Koda 6 Bella 6 Stanley 6 Rusty 5 Leo 5 Bentley 5 Louis 5 Chester 5 Dave 5 Zoey 5 Boomer 5 Milo 5 Buddy 5 Scout 5 .. Hercules 1 Piers 1 Emma 1 Pixar 1 Carbon 1 Venezuelan 1 Fabio 1 Striker 1 Skittles 1 Geno 1 Teddy 1 Michelangelope 1 Forrest 1 Harper 1 Gromit 1 Parthenon 1 Am 1 Cermet 1 Valentine 1 Apollo 1 Skye 1 Margo 1 Tuscaloosa 1 Bowie 1 Simba 1 Ralpher 1 Stark 1 Mitsubishi 1 Karll 1 Bobb 1 Name: name, Length: 1063, dtype: int64
从原始数据中重新提取。对于一条推特中存在两条评分记录的情况,取前一组评分。
# Print column dtypes and non-null counts of the cleaned archive before re-extracting ratings.
dfTwitter_Archive_Clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 2094 entries, 0 to 2355 Data columns (total 9 columns): tweet_id 2094 non-null int64 timestamp 2094 non-null object source 2094 non-null object text 2094 non-null object expanded_urls 2094 non-null object rating_numerator 2094 non-null int64 rating_denominator 2094 non-null int64 name 1566 non-null object stage 335 non-null object dtypes: int64(3), object(6) memory usage: 163.6+ KB
# Re-extract the rating from the tweet text as two capture groups
# (numerator, denominator). str.extract returns the first match, so a tweet
# with several ratings keeps the first one. The numerator may be a decimal
# (e.g. 13.5); the strict pattern only accepts denominators ending in 0.
text = dfTwitter_Archive_Clean['text']
rating_parts = text.str.extract(r'([0-9]+\.?[0-9]*)/([0-9]+0)', expand=True)
# Special case: a tweet whose only "x/y" fraction does not end in 0 (the known
# one reads 24/7) fails the strict pattern. Fall back to any "x/y" for those
# rows instead of patching a single row by position.
loose_parts = text.str.extract(r'([0-9]+\.?[0-9]*)/([0-9]+)', expand=True)
rating_parts = rating_parts.fillna(loose_parts)
# Store both parts as float so decimal numerators survive.
dfTwitter_Archive_Clean['rating_numerator'] = rating_parts[0].astype(float)
dfTwitter_Archive_Clean['rating_denominator'] = rating_parts[1].astype(float)
# Test: list the denominators by frequency.
dfTwitter_Archive_Clean['rating_denominator'].value_counts()
10.0 2080 50.0 3 80.0 2 150.0 1 110.0 1 90.0 1 70.0 1 170.0 1 120.0 1 40.0 1 20.0 1 7.0 1 Name: rating_denominator, dtype: int64
# Test: list the extracted numerators by frequency to spot outliers and decimal ratings.
dfTwitter_Archive_Clean['rating_numerator'].value_counts()
12.00 485 10.00 435 11.00 413 13.00 287 9.00 153 8.00 98 7.00 51 14.00 39 5.00 33 6.00 32 3.00 19 4.00 16 2.00 9 1.00 4 13.50 1 0.00 1 24.00 1 84.00 1 420.00 1 1776.00 1 80.00 1 60.00 1 44.00 1 144.00 1 88.00 1 11.26 1 11.27 1 121.00 1 9.75 1 99.00 1 204.00 1 45.00 1 165.00 1 50.00 1 Name: rating_numerator, dtype: int64
使用正则表达式从 source 列的 HTML 标签中提取来源名称(例如 Twitter for iPhone)
# Keep only the text between '>' and '<' of the source anchor tag,
# e.g. "Twitter for iPhone".
source_label = dfTwitter_Archive_Clean['source'].str.extract(r'>(.+)<', expand=False)
dfTwitter_Archive_Clean['source'] = source_label
# Test: preview the first rows.
dfTwitter_Archive_Clean.head()
tweet_id | timestamp | source | text | expanded_urls | rating_numerator | rating_denominator | name | stage | |
---|---|---|---|---|---|---|---|---|---|
0 | 892420643555336193 | 2017-08-01 16:23:56 +0000 | Twitter for iPhone | This is Phineas. He's a mystical boy. Only eve... | https://twitter.com/dog_rates/status/892420643... | 13.0 | 10.0 | Phineas | NaN |
1 | 892177421306343426 | 2017-08-01 00:17:27 +0000 | Twitter for iPhone | This is Tilly. She's just checking pup on you.... | https://twitter.com/dog_rates/status/892177421... | 13.0 | 10.0 | Tilly | NaN |
2 | 891815181378084864 | 2017-07-31 00:18:03 +0000 | Twitter for iPhone | This is Archie. He is a rare Norwegian Pouncin... | https://twitter.com/dog_rates/status/891815181... | 12.0 | 10.0 | Archie | NaN |
3 | 891689557279858688 | 2017-07-30 15:58:51 +0000 | Twitter for iPhone | This is Darla. She commenced a snooze mid meal... | https://twitter.com/dog_rates/status/891689557... | 13.0 | 10.0 | Darla | NaN |
4 | 891327558926688256 | 2017-07-29 16:00:24 +0000 | Twitter for iPhone | This is Franklin. He would like you to stop ca... | https://twitter.com/dog_rates/status/891327558... | 12.0 | 10.0 | Franklin | NaN |
删除这些重复值
# Keep the first row for every jpg_url and discard later repeats of the same image.
dfImage_Predictions_Clean = dfImage_Predictions_Clean.drop_duplicates(subset='jpg_url')
# Test: no repeated jpg_url should remain.
dfImage_Predictions_Clean['jpg_url'].duplicated().sum()
0
字母全部使用小写。分隔则全部使用下划线。
# Normalise the predicted labels: lower-case everything and use '_' as the
# only word separator.
breed_columns = ['p1', 'p2', 'p3']
lowered = dfImage_Predictions_Clean[breed_columns].apply(
    lambda column: column.str.lower()
)
# regex=True is required: without it DataFrame.replace only matches cells whose
# whole value equals the pattern, so separators inside a name are left alone.
dfImage_Predictions_Clean[breed_columns] = lowered.replace(r'[ -]', '_', regex=True)
# Test: preview the first rows.
dfImage_Predictions_Clean.head(5)
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | shetland_sheepdog | 0.061428 | True |
1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | rhodesian_ridgeback | 0.072010 | True |
2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | german_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | rottweiler | 0.243682 | True | doberman | 0.154629 | True |
使用merge合并三个Dataframe
# Merge the three DataFrames on tweet_id: an inner join keeps only archive
# tweets that have an image prediction; the left join then attaches the
# retweet/favorite counts.
dfArchiveWithImages = dfTwitter_Archive_Clean.merge(
    dfImage_Predictions_Clean, how='inner', on='tweet_id'
)
dfClean = dfArchiveWithImages.merge(dfJson_Tweeter_Clean, how='left', on='tweet_id')
dfClean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1971 entries, 0 to 1970 Data columns (total 22 columns): tweet_id 1971 non-null int64 timestamp 1971 non-null object source 1971 non-null object text 1971 non-null object expanded_urls 1971 non-null object rating_numerator 1971 non-null float64 rating_denominator 1971 non-null float64 name 1520 non-null object stage 303 non-null object jpg_url 1971 non-null object img_num 1971 non-null int64 p1 1971 non-null object p1_conf 1971 non-null float64 p1_dog 1971 non-null bool p2 1971 non-null object p2_conf 1971 non-null float64 p2_dog 1971 non-null bool p3 1971 non-null object p3_conf 1971 non-null float64 p3_dog 1971 non-null bool retweet_count 1971 non-null int64 favorite_count 1971 non-null int64 dtypes: bool(3), float64(5), int64(4), object(10) memory usage: 313.7+ KB
# Preview the last five rows of the merged table.
dfClean.tail()
tweet_id | timestamp | source | text | expanded_urls | rating_numerator | rating_denominator | name | stage | jpg_url | ... | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | retweet_count | favorite_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1966 | 666049248165822465 | 2015-11-16 00:24:50 +0000 | Twitter for iPhone | Here we have a 1949 1st generation vulpix. Enj... | https://twitter.com/dog_rates/status/666049248... | 5.0 | 10.0 | Fox | NaN | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | ... | 0.560311 | True | rottweiler | 0.243682 | True | doberman | 0.154629 | True | 41 | 111 |
1967 | 666044226329800704 | 2015-11-16 00:04:52 +0000 | Twitter for iPhone | This is a purebred Piers Morgan. Loves to Netf... | https://twitter.com/dog_rates/status/666044226... | 6.0 | 10.0 | Piers | NaN | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | ... | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True | 147 | 309 |
1968 | 666033412701032449 | 2015-11-15 23:21:54 +0000 | Twitter for iPhone | Here is a very happy pup. Big fan of well-main... | https://twitter.com/dog_rates/status/666033412... | 9.0 | 10.0 | NaN | NaN | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | ... | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True | 47 | 128 |
1969 | 666029285002620928 | 2015-11-15 23:05:30 +0000 | Twitter for iPhone | This is a western brown Mitsubishi terrier. Up... | https://twitter.com/dog_rates/status/666029285... | 7.0 | 10.0 | Mitsubishi | NaN | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | ... | 0.506826 | True | miniature_pinscher | 0.074192 | True | rhodesian_ridgeback | 0.072010 | True | 48 | 132 |
1970 | 666020888022790149 | 2015-11-15 22:32:08 +0000 | Twitter for iPhone | Here we have a Japanese Irish Setter. Lost eye... | https://twitter.com/dog_rates/status/666020888... | 8.0 | 10.0 | Japanese | NaN | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | ... | 0.465074 | True | collie | 0.156665 | True | shetland_sheepdog | 0.061428 | True | 530 | 2528 |
5 rows × 22 columns
# Save the merged table to disk.
dfClean.to_csv('twitter_archive_master.csv', index=False)
# Rank names by how many tweets use them, most frequent first.
name_counts = dfClean.groupby('name', as_index=False)['tweet_id'].count()
name_counts = name_counts.sort_values(by='tweet_id', ascending=False)
dfNameRanking = pd.DataFrame(name_counts).rename(columns={'tweet_id': 'counts'})
dfNameRanking = dfNameRanking.reset_index(drop=True)
dfNameRanking.head(10)
name | counts | |
---|---|---|
0 | Oliver | 11 |
1 | Charlie | 11 |
2 | Cooper | 10 |
3 | Lucy | 10 |
4 | Penny | 9 |
5 | Tucker | 9 |
6 | Toby | 8 |
7 | Winston | 8 |
8 | Sadie | 8 |
9 | Christmas | 8 |
# Pie chart of name frequencies. Only the two most frequent names get a label;
# every other wedge gets an empty label to keep the chart readable.
lbName = np.full(len(dfNameRanking), "", dtype=object)
# Read the top names from the ranking itself so the labels stay correct when
# the ranking changes (and fewer than two rows does not raise).
top_names = dfNameRanking['name'].head(2).tolist()
lbName[:len(top_names)] = top_names
dfNameRanking.counts.plot(kind='pie', labels=lbName);
# Pick one breed per tweet: the first of the predictions p1, p2, p3 (in that
# order) whose *_dog flag is True; tweets with no dog prediction get NaN.
# Series.where keeps a value where the flag is True and otherwise falls
# through to the next candidate, replacing the row-by-row DataFrame.append loop.
variety = dfClean['p1'].where(
    dfClean['p1_dog'],
    dfClean['p2'].where(
        dfClean['p2_dog'],
        dfClean['p3'].where(dfClean['p3_dog']),
    ),
)
# tweet_id is taken over as-is, so it keeps its integer dtype.
dfVariety = pd.DataFrame(
    {'tweet_id': dfClean['tweet_id'], 'variety': variety},
    columns=['tweet_id', 'variety'],
)
# Drop tweets for which no prediction was a dog.
dfVariety = dfVariety.dropna(subset=['variety'])
# Attach the retweet and favorite counts.
dfVariety = pd.merge(
    dfVariety,
    dfClean[['tweet_id', 'retweet_count', 'favorite_count']],
    how='left',
    on='tweet_id',
)
dfVariety.head()
tweet_id | variety | retweet_count | favorite_count | |
---|---|---|---|---|
0 | 892177421306343426 | chihuahua | 6480 | 33786 |
1 | 891815181378084864 | chihuahua | 4301 | 25445 |
2 | 891689557279858688 | labrador_retriever | 8925 | 42863 |
3 | 891327558926688256 | basset | 9721 | 41016 |
4 | 891087950875897856 | chesapeake_bay_retriever | 3240 | 20548 |
# Rank breeds by engagement, where engagement = total retweets + total favorites.
engagement_columns = ['retweet_count', 'favorite_count']
dfRanking = dfVariety.groupby('variety')[engagement_columns].sum().dropna()
dfRanking = dfRanking.assign(
    total=dfRanking['retweet_count'] + dfRanking['favorite_count']
)
# Number of tweets per breed, shown next to the engagement totals.
dfCountTweet = dfVariety.groupby('variety').size().rename('counts')
# Keep the ten breeds with the highest combined engagement.
dfTempRanking = dfRanking.join(dfCountTweet)
dfTempRanking = dfTempRanking.sort_values(by='total', ascending=False).head(10)
dfTempRanking
retweet_count | favorite_count | total | counts | |
---|---|---|---|---|
variety | ||||
golden_retriever | 589274.0 | 1953598.0 | 2542872.0 | 156 |
labrador_retriever | 403152.0 | 1260635.0 | 1663787.0 | 106 |
pembroke | 290323.0 | 1035633.0 | 1325956.0 | 94 |
chihuahua | 252834.0 | 753813.0 | 1006647.0 | 90 |
samoyed | 202313.0 | 582082.0 | 784395.0 | 42 |
french_bulldog | 155290.0 | 568978.0 | 724268.0 | 31 |
chow | 133512.0 | 456699.0 | 590211.0 | 48 |
cocker_spaniel | 147681.0 | 413968.0 | 561649.0 | 30 |
pug | 118051.0 | 382463.0 | 500514.0 | 62 |
malamute | 108983.0 | 350710.0 | 459693.0 | 33 |
# Bar chart of combined engagement (retweets + favorites) for the top-10 breeds.
dfTempRanking['total'].plot(kind='bar',figsize=(8,5),color='#e597b2');
# Bar chart of the number of tweets for the same breeds.
# NOTE(review): presumably each plot lives in its own notebook cell; run back-to-back
# in a single cell the two bar series may land on one figure - confirm.
dfTempRanking['counts'].plot(kind='bar',figsize=(8,5),color='#00a3af');
# Explore correlation with a regression plot.
# Some tweets use a denominator other than 10 (e.g. several dogs in one
# picture), so ratings are compared as numerator / denominator.
rate_columns = ['rating_numerator', 'rating_denominator', 'retweet_count', 'favorite_count']
dfTempRate = dfClean[rate_columns].dropna()
dfTempRate['Rate'] = dfTempRate['rating_numerator'].div(dfTempRate['rating_denominator'])
# Remove outliers: keep only ratios below 2.
dfTempRate = dfTempRate[dfTempRate['Rate'] < 2]
# NOTE(review): newer seaborn releases renamed lmplot's `size` argument to
# `height`; confirm against the installed version before upgrading.
for engagement_column in ('favorite_count', 'retweet_count'):
    sns.lmplot(x='Rate', y=engagement_column, data=dfTempRate, size=7);