Spaces:

ameythakur
/

Depression-Detection-Using-Tweets

Sleeping

App Files Files Community

Depression-Detection-Using-Tweets / source_code /notebooks /data_gathering_twitter_API.py

ameythakur

DEPRESSION-DETECTION

4d1cb0c verified 3 months ago

raw

history blame contribute delete

20.6 kB

	# -*- coding: utf-8 -*-
	"""Twitter_API.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1UAilj_PmxYbwHsc_s79d9UyBvawBVZAS

	# Tweet mining using Twitter API via Tweepy:

	In this notebook I use the Tweepy Python library to mine tweets using relevant hashtags. I was able to retrieve around 19,000 unique tweets via the Twitter API. At the end, all the datasets with different depressive hashtags are combined, cleaned and saved as depressive_tweets.csv.
	"""

	# Mount Google Drive at /content/drive; every CSV path used later in this
	# file lives under /content/drive/MyDrive/.
	from google.colab import drive
	drive.mount('/content/drive')

	"""## Tweets mining"""

	!pip install -qqq tweepy

	## Import required libraries
	import tweepy
	from tweepy.streaming import StreamListener
	from tweepy import OAuthHandler
	from tweepy import Stream
	import csv
	import pandas as pd

	## Access to Twitter API consumer_key and access_secret
	#import config.ipynb

	## Twitter API related information
	# Credentials come from a `config` object that must already be in scope.
	# NOTE(review): the only visible import of `config` is commented out just
	# above this block - confirm how it is provided before running.
	consumer_key, consumer_secret = config.API_KEY, config.API_KEY_SECRET
	access_key, access_secret = config.ACCESS_TOKEN, config.ACCESS_TOKEN_SECRET

	# Authenticate with the consumer pair, then attach the access token pair.
	auth = OAuthHandler(consumer_key, consumer_secret)
	auth.set_access_token(access_key, access_secret)

	# Client used by every mining function below; it sleeps when the API rate
	# limit is reached instead of failing.
	api = tweepy.API(
	    auth,
	    wait_on_rate_limit=True,
	    wait_on_rate_limit_notify=True,
	)

	## depress_tags = ["#depressed", "#anxiety", "#depression", "#suicide", "#mentalhealth",
	## "#loneliness", "#hopelessness", "#itsokaynottobeokay", "#sad"]

	"""## "#depressed""""

	## Create a function for tweets mining
	def tweets_mining1(search_query1, num_tweets1, since_id_num1,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query1: Query string passed to the search call as ``q``.
	        num_tweets1: Upper bound on the number of tweets taken from the cursor.
	        since_id_num1: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query1, lang="en",
	                                 since_id=since_id_num1,
	                                 tweet_mode='extended').items(num_tweets1))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words1 = "#depressed"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query1 = f"{search_words1} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining1(search_query1, 1000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_depressed_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depressed_1.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_depressed_1

	# Report how many distinct values (NaN counted once) each column holds.
	for col in df_depressed_1:
	    print("There are ", df_depressed_1[col].nunique(dropna=False), "unique values in ", col)

	"""### Anxiety and suicide """

	## Create a function for tweets mining
	def tweets_mining2(search_query2, num_tweets2, since_id_num2,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query2: Query string passed to the search call as ``q``.
	        num_tweets2: Upper bound on the number of tweets taken from the cursor.
	        since_id_num2: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query2, lang="en",
	                                 since_id=since_id_num2,
	                                 tweet_mode='extended').items(num_tweets2))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words2 = "#anxiety"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query2 = f"{search_words2} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining2(search_query2, 2000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_anxiety_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_anxiety_1.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_anxiety_1

	# Report how many distinct values (NaN counted once) each column holds.
	for col in df_anxiety_1:
	    print("There are ", df_anxiety_1[col].nunique(dropna=False), "unique values in ", col)

	"""## "#Suicide""""

	## Create a function for tweets mining
	def tweets_mining3(search_query3, num_tweets3, since_id_num3,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query3: Query string passed to the search call as ``q``.
	        num_tweets3: Upper bound on the number of tweets taken from the cursor.
	        since_id_num3: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query3, lang="en",
	                                 since_id=since_id_num3,
	                                 tweet_mode='extended').items(num_tweets3))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words3 = "#suicide"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query3 = f"{search_words3} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining3(search_query3, 10000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_suicide_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_suicide_1.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_suicide_1

	"""## "#hopelessness""""

	## Create a function for tweets mining
	def tweets_mining4(search_query4, num_tweets4, since_id_num4,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query4: Query string passed to the search call as ``q``.
	        num_tweets4: Upper bound on the number of tweets taken from the cursor.
	        since_id_num4: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query4, lang="en",
	                                 since_id=since_id_num4,
	                                 tweet_mode='extended').items(num_tweets4))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words4 = "#hopelessness"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query4 = f"{search_words4} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining4(search_query4, 10000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_hopeless_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_hopeless_1.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_hopeless_1

	"""## "#mentalhealth""""

	## Create a function for tweets mining
	def tweets_mining5(search_query5, num_tweets5, since_id_num5,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query5: Query string passed to the search call as ``q``.
	        num_tweets5: Upper bound on the number of tweets taken from the cursor.
	        since_id_num5: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query5, lang="en",
	                                 since_id=since_id_num5,
	                                 tweet_mode='extended').items(num_tweets5))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words5 = "#mentalhealth"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query5 = f"{search_words5} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining5(search_query5, 1000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_mentalhealth_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_mentalhealth_1.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_mentalhealth_1

	"""## "#loneliness""""

	## Create a function for tweets mining
	def tweets_mining6(search_query6, num_tweets6, since_id_num6,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query6: Query string passed to the search call as ``q``.
	        num_tweets6: Upper bound on the number of tweets taken from the cursor.
	        since_id_num6: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query6, lang="en",
	                                 since_id=since_id_num6,
	                                 tweet_mode='extended').items(num_tweets6))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words6 = "#loneliness"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query6 = f"{search_words6} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining6(search_query6, 10000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_loneliness_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_loneliness_1.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_loneliness_1

	"""## "#itsokaynottobeokay""""

	## Create a function for tweets mining
	def tweets_mining7(search_query7, num_tweets7, since_id_num7,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query7: Query string passed to the search call as ``q``.
	        num_tweets7: Upper bound on the number of tweets taken from the cursor.
	        since_id_num7: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query7, lang="en",
	                                 since_id=since_id_num7,
	                                 tweet_mode='extended').items(num_tweets7))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words7 = "#itsokaynottobeokay"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query7 = f"{search_words7} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining7(search_query7, 2000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_itsok_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_itsoknottobeok_1 copy.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_itsok_1

	"""## "#depression""""

	## Create a function for tweets mining
	def tweets_mining8(search_query8, num_tweets8, since_id_num8,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query8: Query string passed to the search call as ``q``.
	        num_tweets8: Upper bound on the number of tweets taken from the cursor.
	        since_id_num8: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query8, lang="en",
	                                 since_id=since_id_num8,
	                                 tweet_mode='extended').items(num_tweets8))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words8 = "#depression"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query8 = f"{search_words8} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining8(search_query8, 1000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_depression_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_depression_1.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_depression_1

	# Report how many distinct values (NaN counted once) each column holds.
	for col in df_depression_1:
	    print("There are ", df_depression_1[col].nunique(dropna=False), "unique values in ", col)

	"""## "#sad""""

	## Create a function for tweets mining
	def tweets_mining9(search_query9, num_tweets9, since_id_num9,
	                   output_path='/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv'):
	    """Search Twitter for English tweets and append one CSV row per tweet.

	    Args:
	        search_query9: Query string passed to the search call as ``q``.
	        num_tweets9: Upper bound on the number of tweets taken from the cursor.
	        since_id_num9: Forwarded to the search call as ``since_id``.
	        output_path: CSV file the rows are appended to; defaults to the path
	            this function has always written to.

	    Returns:
	        None. Each appended row is
	        ``[id, created_at, full_text, user.location, retweet_count, favorite_count]``.

	    Depends on the module-level ``api`` client and the ``tweepy`` import.
	    """
	    # The cursor has to be exhausted up front because the rows are written in
	    # the reverse of the order in which it yields them.
	    fetched = list(tweepy.Cursor(api.search, q=search_query9, lang="en",
	                                 since_id=since_id_num9,
	                                 tweet_mode='extended').items(num_tweets9))
	    if not fetched:
	        # Nothing to write: do not open (and thereby create) the file.
	        return

	    # Open the file once per batch rather than once per tweet.
	    with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
	        csv.writer(csv_file, delimiter=',').writerows(
	            [tweet.id, tweet.created_at, tweet.full_text, tweet.user.location,
	             tweet.retweet_count, tweet.favorite_count]
	            for tweet in reversed(fetched)
	        )

	search_words9 = "#sad"  # hashtag to search for
	# Leave out tweets that contain links, are retweets, or are replies.
	search_query9 = f"{search_words9} -filter:links AND -filter:retweets AND -filter:replies"

	# Resume from the tweet id stored in the first field of the CSV's last row.
	with open('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv', encoding='utf-8') as data:
	    latest_tweet = int(list(csv.reader(data))[-1][0])
	tweets_mining9(search_query9, 2000, latest_tweet)

	# Reload the whole file, supplying the column names explicitly.
	df_sad_1 = pd.read_csv(
	    "/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/tweets_sad_1.csv",
	    names=['tweet.id', "created_at", "text", "location", "retweet", "favorite"],
	)

	df_sad_1

	"""# Combining all the tweets"""

	import glob

	path = r'/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API' # use your path
	# glob does not guarantee any ordering, so sort the matches to make the row
	# order of the combined frame (and of the saved CSV) reproducible between runs.
	all_files = sorted(glob.glob(path + "/*.csv"))

	tweets = []

	for filename in all_files:
	    # Each file is read with explicit column names.
	    df = pd.read_csv(filename,
	                     names=['tweet.id', "created_at","text", "location", "retweet", "favorite"])
	    tweets.append(df)

	# Stack all per-file frames into one, discarding their individual indexes.
	tweets_df = pd.concat(tweets, ignore_index=True)
	#tweets_df.columns=['tweet.id', "created_at","text", "location", "retweet", "favorite"]
	tweets_df.head()

	tweets_df

	# The glob pattern above is non-recursive, so this file under output/ is not
	# picked up as an input when the cell is run again.
	tweets_df.to_csv('/content/drive/MyDrive/NLP/Depression_Detection/Data_fetch_API/output/depressive_tweets.csv')

	"""## Data cleaning

	Data cleaning is one of the essential steps, because without a proper cleaning procedure you will have errors in your analysis and eventually in your data-driven results. Here I eliminate duplicate tweets using the primary key ('tweet.id'), check for empty rows, and replace any “NaN” values.
	"""

	tweets_df.shape  # (rows, columns) of the combined frame

	# One-column table listing the dtype of every column.
	tweets_df.dtypes.to_frame().rename(columns={0: 'data_type'})

	# Report how many distinct values (NaN counted once) each column holds.
	for col in tweets_df:
	    print("There are ", tweets_df[col].nunique(dropna=False), "unique values in ", col)