import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
#from scipy.stats import ks_2samp
#import glob
#import time
#import lxml
import altair as alt
#import langdetect as land
#from mpl_toolkits import mplot3d
import plotly
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import interact
import seaborn as sns
from sklearn.linear_model import LinearRegression
import warnings
#import statsmodels.api as sm
import statsmodels.formula.api as sm


tweet = pd.read_excel(os.path.join('Twitter+data+in+sheets.xlsx'), 'Tweet')
loca = pd.read_excel(os.path.join('Twitter+data+in+sheets.xlsx'), 'Location')
user = pd.read_excel(os.path.join('Twitter+data+in+sheets.xlsx'), 'User')


tweet = tweet.rename(str.strip,axis = 1)
loca = loca.rename(str.strip,axis = 1)
user = user.rename(str.strip,axis = 1)
merged = tweet.merge(user, left_on = 'UserID', right_on = 'UserID', how = 'left').merge(loca, left_on = 'LocationID',right_on = 'LocationID')
eng = merged[merged['Lang'] == 'en']
eng['length'] = eng['text'].str.len()
warnings.filterwarnings('ignore')
notloc = eng.drop(columns = ['Country','State','StateCode','City','LocationID']).dropna()
eng.head()

/var/folders/38/bcx9z7z945s379nmdtks67740000gn/T/ipykernel_11776/1440104292.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng['length'] = eng['text'].str.len()


notloc['time inteval'] = notloc["Hour"].apply(
    lambda x: "Early morning" if (x >=6 and x < 9) 
    else "Late morning" if (x >= 9 and x < 12) 
    else "Noon" if (x >= 12 and x < 15) 
    else "Afternoon" if (x >= 15 and x < 18) 
    else "Evening" if (x >= 18 and x < 21) 
    else "Early night" if (x >= 21 and x < 24)
    else 'Late night')

notloc['Reach'] = notloc['Reach'].apply(lambda x: 1 if x==0 else x)


with pd.ExcelWriter(os.path.join('Stata reg.xlsx'), engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
    s = notloc[['Weekday',
        'IsReshare','Reach','RetweetCount','Likes','Klout','Sentiment',
        'Gender','time inteval','length']]
    s.to_excel(writer, 'sheet1', index = False, header = True)


from IPython.display import IFrame
IFrame(os.path.join('Stats result.pdf'), width=600, height=300)


stata = pd.read_stata('stata_out.dta')
#stata['root_reach'] = stata['Reach'].apply(lambda x:x**(3/4))
counted = stata['timeinteval'].value_counts()
stata['time_weighted_count'] = stata['timeinteval'].apply(lambda x:1/counted[x])
stata.head()


brush = alt.selection(type='interval')
stata_s = stata.sample(2000, random_state = 2022522)
points = alt.Chart(stata_s).mark_point().encode(
    x='length:Q',
    y='log_reach:Q',
    color=alt.condition(brush, 'timeinteval:N', alt.value('lightgray'))
).add_selection(
    brush
).properties(
    width=650,
    height=560
)

bars = alt.Chart(stata_s).mark_bar().encode(
    y='timeinteval:N',
    color='timeinteval:N',
    x='sum(time_weighted_count):Q'
).transform_filter(
    brush
)

points & bars


#Early_morning----log_reach
#ignore Early_night, Weekdaydummy7, Genderdummy4
def inter(vari = 'Early_morning'):
    lis = ['length','Sentiment', 'Weekdaydummy1','Weekdaydummy2',
            'Weekdaydummy3','Weekdaydummy4','Weekdaydummy5','Weekdaydummy6',
             'Genderdummy1','Genderdummy2','Genderdummy3',
               'Late_night', 'Evening' ,'Late_morning','Afternoon','Noon','Early_morning']
    lis.remove(vari)
    plot_df = pd.DataFrame()
    lr = LinearRegression()
    lr1 = LinearRegression()
    lr2 = LinearRegression()
    X = stata[lis]
    lr.fit(X=X, y=stata[vari])
    re = lr.predict(X) - stata[vari]

    lr1.fit(X=X, y = stata['log_reach'])
    re1 = lr1.predict(X) - stata['log_reach']

    lr2.fit(X=pd.DataFrame(re), y=re1)
    plot_df['re'] = re
    plot_df['re1'] = re1
    sns.lmplot('re', 're1', data=plot_df)
    return 'coefficient = '+ str(lr2.coef_)
    #plot_df['category'] = 'e_m'
def only(vari = 'Early_morning'):
    lis = ['length','Sentiment', 'Weekdaydummy1','Weekdaydummy2',
            'Weekdaydummy3','Weekdaydummy4','Weekdaydummy5','Weekdaydummy6',
             'Genderdummy1','Genderdummy2','Genderdummy3',
               'Late_night', 'Evening' ,'Late_morning','Afternoon','Noon','Early_morning']
    lis.remove(vari)
    plot_df = pd.DataFrame()
    lr = LinearRegression()
    lr1 = LinearRegression()
    lr2 = LinearRegression()
    X = stata[lis]
    lr.fit(X=X, y=stata[vari])
    re = lr.predict(X) - stata[vari]

    lr1.fit(X=X, y = stata['log_reach'])
    re1 = lr1.predict(X) - stata['log_reach']

    lr2.fit(X=pd.DataFrame(re), y=re1)
    plot_df['re'] = re
    plot_df['re1'] = re1
    return plot_df


warnings.filterwarnings('ignore')
options = ['Late_night', 'Evening' ,'Late_morning','Afternoon','Noon','Early_morning']
interact(inter, x = options);
#['Late_night', 'Evening' ,'Late_morning','Afternoon','Noon','Early_morning']

interactive(children=(Text(value='Early_morning', description='vari'), Output()), _dom_classes=('widget-intera…


only

<function __main__.only(vari='Early_morning')>


warnings.filterwarnings('default')
plots = {option: only(option) for option in options}
fig, axes = plt.subplots(2, 3, figsize=(12,12))
m = 0
for k, v in plots.items():
    ax = axes[m//3, m % 3]
    sns.regplot(data=v, x='re', y='re1', ax=ax)
    lr = LinearRegression()
    lr.fit(X = v[['re']], y = v['re1'])
    ax.set_title('%s with coeffiecient %s' % (k, lr.coef_))
    m+=1
plt.tight_layout()


l = ['length','Sentiment', 'Weekdaydummy1','Weekdaydummy2',
            'Weekdaydummy3','Weekdaydummy4','Weekdaydummy5','Weekdaydummy6',
             'Genderdummy1','Genderdummy2','Genderdummy3',
               'Late_night', 'Evening' ,'Late_morning','Afternoon','Noon','Early_morning','#']
stata['hashtag'] = notloc.reset_index()['text'].str.contains('#').apply(lambda x:1 if x else 0)
#lr = LinearRegression()
#lr.fit(X=stata[l], y = stata['log_reach'])
#lr.coef_
reg = sm.ols(formula = 'log_reach ~ hashtag+length+Sentiment+Weekdaydummy1+Weekdaydummy2+Weekdaydummy3+Weekdaydummy4+Weekdaydummy5+Weekdaydummy6+Genderdummy1+Genderdummy2+Genderdummy3+Late_night+Evening+Late_morning+Afternoon+Noon+Early_morning',data = stata).fit()
reg.summary()

	TweetID	Weekday	Hour	Day	Lang	IsReshare	Reach	RetweetCount	Klout	Sentiment	text	LocationID	UserID	Gender	Country	State	StateCode	City	length
0	tw-682712873332805633	Thursday	17.0	31.0	en	0.0	44.0	0.0	35.0	0.0	We are hiring: Senior Software Engineer - Prot...	3751.0	tw-40932430	Female	United States	Washington	US-WA	Seattle	242.0
1	tw-682714048199311366	Thursday	17.0	31.0	en	0.0	953.0	0.0	47.0	0.0	Amazon is hiring! #Sr. #International Tax Mana...	3751.0	tw-3172686669	Unknown	United States	Washington	US-WA	Seattle	303.0
2	tw-682719022958292992	Thursday	17.0	31.0	en	1.0	86.0	51.0	16.0	2.0	RT @awscloud: AWS CloudFormation now supports ...	3751.0	tw-14458322	Unisex	United States	Washington	US-WA	Seattle	492.0
3	tw-682763193471643648	Thursday	20.0	31.0	en	0.0	45.0	0.0	36.0	0.0	We are hiring: Software Developer Engineer So ...	3751.0	tw-40932430	Female	United States	Washington	US-WA	Seattle	242.0
4	tw-682777775258865664	Thursday	21.0	31.0	en	1.0	610.0	1.0	40.0	0.0	RT @tdhopper: AWS Lambda opens so many possibi...	3751.0	tw-421336284	Male	United States	Washington	US-WA	Seattle	81.0

	Weekday	IsReshare	Reach	RetweetCount	Klout	Sentiment	Gender	timeinteval	length	...	Evening	Genderdummy1	Genderdummy2	Genderdummy3	Genderdummy4	log_reach	time_weighted_count
0	Thursday	0	44	0	35	0.0	Female	Afternoon	242	...	0	1	0	0	0	3.784190	0.000074
1	Thursday	0	953	0	47	0.0	Unknown	Afternoon	303	...	0	0	0	0	1	6.859615	0.000074
2	Thursday	1	86	51	16	2.0	Unisex	Afternoon	492	...	0	0	0	1	0	4.454347	0.000074
3	Thursday	0	45	0	36	0.0	Female	Evening	242	...	1	1	0	0	0	3.806663	0.000108
4	Thursday	1	610	1	40	0.0	Male	Early night	81	...	0	0	1	0	0	6.413459	0.000132

Dep. Variable:	log_reach	R-squared:	0.019
Model:	OLS	Adj. R-squared:	0.018
Method:	Least Squares	F-statistic:	98.89
Date:	Wed, 15 Jun 2022	Prob (F-statistic):	0.00
Time:	09:25:26	Log-Likelihood:	-1.9393e+05
No. Observations:	93563	AIC:	3.879e+05
Df Residuals:	93544	BIC:	3.881e+05
Df Model:	18
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	6.3260	0.032	195.478	0.000	6.263	6.389
hashtag	-0.0531	0.013	-4.121	0.000	-0.078	-0.028
length	-0.0004	7.38e-05	-5.418	0.000	-0.001	-0.000
Sentiment	-0.0079	0.006	-1.350	0.177	-0.019	0.004
Weekdaydummy1	0.0114	0.022	0.520	0.603	-0.032	0.054
Weekdaydummy2	0.0511	0.022	2.347	0.019	0.008	0.094
Weekdaydummy3	-0.0554	0.027	-2.018	0.044	-0.109	-0.002
Weekdaydummy4	-0.0164	0.028	-0.585	0.559	-0.071	0.039
Weekdaydummy5	0.0430	0.021	2.069	0.039	0.002	0.084
Weekdaydummy6	-0.0213	0.021	-1.029	0.303	-0.062	0.019
Genderdummy1	-0.6198	0.024	-25.752	0.000	-0.667	-0.573
Genderdummy2	-0.3965	0.015	-27.045	0.000	-0.425	-0.368
Genderdummy3	-0.2719	0.024	-11.450	0.000	-0.318	-0.225
Late_night	0.0685	0.027	2.566	0.010	0.016	0.121
Evening	0.2031	0.030	6.813	0.000	0.145	0.262
Late_morning	0.4255	0.027	16.032	0.000	0.374	0.478
Afternoon	0.4259	0.028	15.432	0.000	0.372	0.480
Noon	0.4221	0.027	15.677	0.000	0.369	0.475
Early_morning	0.3368	0.028	12.139	0.000	0.282	0.391

Omnibus:	3922.637	Durbin-Watson:	1.359
Prob(Omnibus):	0.000	Jarque-Bera (JB):	6924.732
Skew:	0.346	Prob(JB):	0.00
Kurtosis:	4.140	Cond. No.	2.14e+03

Part of Twitter Data Analysis¶

Table of Contents

Import data¶

data processing¶

Output Current Work File to Stata (I was just curious about to what extent Python and Stata could interact)¶

Frisch-Waugh Regress¶

Influence of adding hashtage \¶