# pre-process.py
import os

# Install third-party dependencies at runtime (useful when the Azure ML
# execution environment does not ship these packages pre-installed)
os.system('pip install nltk scikit-learn matplotlib seaborn wordcloud')

import matplotlib
matplotlib.use('Agg')  # headless backend: compute targets have no display attached
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import pandas as pd
from datetime import datetime
from azureml.core import Run, Dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
# Make sure the required NLTK data is available ('punkt_tab' is the
# tokenizer model that word_tokenize needs in recent NLTK releases)
nltk.data.path.append('/root/nltk_data')
nltk.download('stopwords', download_dir='/root/nltk_data')
nltk.download('punkt_tab', download_dir='/root/nltk_data')

stop_words = set(stopwords.words('english'))
def filter_tweets(df, min_likes=10, start_date='2023-04-01', end_date='2023-04-30'):
    """Keep tweets with at least `min_likes` likes inside the inclusive date range."""
    # Convert the timestamp column to datetime so it can be compared against dates
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    # Filter on like count and date range (pandas coerces the date strings)
    mask = (df['Likes'] >= min_likes) & (df['Timestamp'] >= start_date) & (df['Timestamp'] <= end_date)
    return df[mask]
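
# Illustrative call (values are hypothetical): keep popular tweets from the
# first week of April 2023.
#   filter_tweets(df, min_likes=50, start_date='2023-04-01', end_date='2023-04-07')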
def preprocess_tweet(tweet):
    """Strip English stop words from a tweet (matched case-insensitively)."""
    word_tokens = word_tokenize(tweet)
    # NLTK's stop-word list is lowercase, so compare on the lowercased token
    filtered_tweet = [w for w in word_tokens if w.lower() not in stop_words]
    return ' '.join(filtered_tweet)
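
# Quick sanity check (illustrative only; word_tokenize splits punctuation
# into separate tokens, so '#' survives stop-word removal):
#   preprocess_tweet('This is a great day for #AzureML')
#   -> 'great day # AzureML'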
def azureml_main(dataframe1=None, dataframe2=None):
    """Entry point called by the Azure ML designer 'Execute Python Script' module."""
    run = Run.get_context(allow_offline=True)
    # Workspace/dataset lookup is only needed when loading the data directly
    # instead of receiving it through dataframe1:
    # ws = run.experiment.workspace
    # dataset = Dataset.get_by_name(ws, name='twitter-dataset-345')
    df = dataframe1
    print("Column Names:", df.columns)
    # 'Preprocessed Text' is the input column; remove its stop words
    df['Cleaned_Tweets'] = df['Preprocessed Text'].apply(preprocess_tweet)
    # Filter tweets based on likes and timestamp
    df = filter_tweets(df)
    output_dir = 'outputs'
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    # Unique timestamp to keep artifact filenames from colliding across runs
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    # Plot 1: word cloud of the cleaned tweets
    all_words = ' '.join(df['Cleaned_Tweets'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Cleaned Tweets')
    wordcloud_path = os.path.join(output_dir, f'wordcloud_{timestamp}.png')
    plt.savefig(wordcloud_path)
    plt.close()  # release the figure so successive plots don't stack up
    # Log the word cloud image to the run
    run.log_image("Word Cloud", path=wordcloud_path)
    # Plot 2: distribution of tweet lengths (sns.distplot is deprecated and
    # removed in recent seaborn releases; histplot is its replacement)
    df['Tweet_Length'] = df['Cleaned_Tweets'].apply(len)
    plt.figure(figsize=(10, 5))
    sns.histplot(df['Tweet_Length'], kde=True)
    plt.title('Distribution of Tweet Lengths')
    plt.xlabel('Tweet Length')
    plt.ylabel('Frequency')
    tweet_length_distribution_path = os.path.join(output_dir, f'tweet_length_distribution_{timestamp}.png')
    plt.savefig(tweet_length_distribution_path)
    plt.close()
    # Log the tweet length distribution image to the run
    run.log_image("Tweet Length Distribution", path=tweet_length_distribution_path)
    return df,  # the designer expects a tuple of DataFrames as the return value
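
# Minimal local smoke test: a sketch, not part of the Azure ML pipeline.
# Assumptions: the toy columns below mirror what the designer passes in, the
# NLTK download paths above are writable, and the offline Run stub tolerates
# run.log_image (if it does not, comment out the log_image calls locally).
if __name__ == '__main__':
    sample = pd.DataFrame({
        'Preprocessed Text': [
            'This is a great day for AzureML',
            'The model is training on the new data today',
        ],
        'Likes': [25, 42],
        'Timestamp': ['2023-04-05', '2023-04-20'],
    })
    result, = azureml_main(sample)
    print(result[['Cleaned_Tweets', 'Tweet_Length']])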