diff --git a/environment.yml b/environment.yml index e26c2ab..b8adf8d 100644 --- a/environment.yml +++ b/environment.yml @@ -2,53 +2,53 @@ name: rsmetrics channels: - defaults dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=4.5=1_gnu - - ca-certificates=2022.3.18=h06a4308_0 - - certifi=2021.10.8=py39h06a4308_2 - - ld_impl_linux-64=2.35.1=h7274673_9 - - libffi=3.3=he6710b0_2 - - libgcc-ng=9.3.0=h5101ec6_17 - - libgomp=9.3.0=h5101ec6_17 - - libstdcxx-ng=9.3.0=hd4cf53a_17 - - ncurses=6.3=h7f8727e_2 - - openssl=1.1.1n=h7f8727e_0 - - pip=21.2.4=py39h06a4308_0 - - python=3.9.11=h12debd9_2 - - readline=8.1.2=h7f8727e_1 - - setuptools=58.0.4=py39h06a4308_0 - - sqlite=3.38.0=hc218d9a_0 - - tk=8.6.11=h1ccaba5_0 - - tzdata=2021e=hda174b7_0 - - wheel=0.37.1=pyhd3eb1b0_0 - - xz=5.2.5=h7b6447c_0 - - zlib=1.2.11=h7f8727e_4 + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - ca-certificates=2022.3.18 + - ld_impl_linux-64=2.35.1 + - libffi=3.3 + - libgcc-ng=9.3.0 + - libgomp=9.3.0 + - libstdcxx-ng=9.3.0 + - ncurses=6.3 + - openssl=1.1.1n + - pip=21.2.4 + - python=3.9.11 + - readline=8.1.2 + - setuptools=58.0.4 + - sqlite=3.38.0 + - tk=8.6.11 + - tzdata=2021e + - wheel=0.37.1 + - xz=5.2.5 + - zlib=1.2.11 - pip: - beautifulsoup4==4.10.0 - - certifi==2021.10.8 + - certifi==2022.12.7 - charset-normalizer==2.0.12 - click==8.1.3 - - Flask==2.1.2 + - flask==2.1.2 + - flask-pymongo==2.3.0 - idna==3.3 - importlib-metadata==4.11.4 - itsdangerous==2.1.2 - - Jinja2==3.1.2 + - jinja2==3.1.2 - joblib==1.2.0 - - MarkupSafe==2.1.1 + - markupsafe==2.1.1 - natsort==8.1.0 - numpy==1.22.3 - pandas==1.4.2 + - pyarrow==10.0.1 - pymongo==4.1.0 + - pymongoarrow==0.6.2 - python-dateutil==2.8.2 - python-dotenv==0.20.0 - pytz==2022.1 - - PyYAML==6.0 + - pyyaml==6.0 - requests==2.27.1 - scipy==1.8.0 - six==1.16.0 - soupsieve==2.3.2 - urllib3==1.26.9 - - Werkzeug==2.1.2 + - werkzeug==2.1.2 - zipp==3.8.0 - - flask-pymongo==2.3.0 - - pymongoarrow==0.6.2 diff --git a/metric_descriptions/accuracy.yml 
b/metric_descriptions/accuracy.yml new file mode 100644 index 0000000..5b92f19 --- /dev/null +++ b/metric_descriptions/accuracy.yml @@ -0,0 +1,42 @@ +name: Accuracy + +summary: > + Measures Recommendations' accuracy based on users' access to the services. A value of 1, indicates that the RS model got all the predictions right, and a value of 0 indicates that the RS model did not make a single correct prediction + +description: > + The accuracy (\(A\)) of the recommendations is based on users' access to the services. A value of 1, indicates that the RS model got all the predictions right, and a value of 0 indicates that the RS model did not make a single correct prediction. Generally, the Accuracy mathematical expression is defined as: + $$A=\frac{Number\;of\;correct\;predictions}{Total\;number\;of\;predictions}$$In RS Metrics the computation is determined by the following formula: + $$Accuracy=\frac{Number\;of\;correctly\;recommended\;services}{Total\;number\;of\;services}$$where correctness is defined as if the service is both accessed by the user and also it is recommended by the RS + +output: + type: float + min: 0 + max: 1 + comment: A value of 1, indicates that the RS model got all the predictions right, and a value of 0 indicates that the RS model did not make a single correct prediction. 
+ +prerequisites: + - recommendations without anonymous users + - all available users (with their accessed services) + - all available services + +process: + - step: Clean up + details: > + Recommendations clean up; entries removal where users or services are not found in "users" or "services" accordingly + - step: Vector creation of the Accessed Services + details: > + For each user create a vector at the size of the number of the services, and assign a binary value for each service with a value of 1 if it is found in the user's accessed services, or 0 if it is not + - step: Vector creation of the Recommended Services + details: > + For each user create a vector at the size of the number of the services, and assign a binary value for each service with a value of 1 if it is recommended to the user, or 0 if it is not + - step: Accuracy score computation + details: > + For each user compute the average value of the difference vector; a vector which states True if service is found in both accessed and recommended vectors or False if it is not + - step: Mean value of Accuracy score + details: > + Computation of the overall value by calculating the mean value of each user's accuracy score + +# This is optional for visual stylization of the metric when displayed on the report +style: + icon: pe-7s-arc + color: bg-night-sky diff --git a/metrics.py b/metrics.py index 8dd1f1f..b730a4b 100644 --- a/metrics.py +++ b/metrics.py @@ -609,25 +609,44 @@ def top5_services_ordered(object, k=5, base='https://marketplace.eosc-portal.eu' return topk_services -@statistic('A dictionary of the number of recommendations per day') -def recommendations_per_day(object): +@statistic('A dictionary of the number of recommended items per day') +def recommended_items_per_day(object): """ - It returns a statistical report in dictionary format. Specifically, the key - is set for each particular day found and its value contains the respective - number of recommendations committed. 
The dictionary includes all in-between - days (obviously, with the count set to zero). Recommendations are already - filtered by those where the user or service does not exist in users' or services' catalogs. + It returns a a timeseries of recommended item counts per day. Each timeseries item has two fields: date and value """ # count recommendations for each day found in entries res=object.recommendations.groupby(by=object.recommendations['Timestamp'].dt.date).count().iloc[:,0] - # fill the in between days with zero recommendations - res=res.asfreq('D', fill_value=0) + # create a Series with period's start and end times and value of 0 + init=pd.Series([0,0],index=[pd.to_datetime(start(object)).date(), pd.to_datetime(end(object)).date()]) + + # remove duplicate entries for corner cases where start and end time match + init.drop_duplicates(keep='first', inplace=True) + + # append above two indexes and values (i.e. 0) to the Series + # with axis=1, same indexes are being merged + # since dataframe is created, get the first column + res=pd.concat([res,init],ignore_index=False, axis=1).iloc[:, 0] + # convert Nan values created by the concatenation to 0 + # and change data type back to int + res=res.fillna(0).astype(int) + + # fill the in between days with zero user_actions + res=res.asfreq('D', fill_value=0) + # convert datetimeindex to string res.index=res.index.format() - return res.to_dict() + # convert series to dataframe with extra column having the dates + res = res.to_frame().reset_index() + + # rename columns to date, value + res.rename(columns={ res.columns[0]: "date", res.columns[1]: "value" }, inplace = True) + + # return a list of objects with date and value fields + return res.to_dict(orient='records') + @statistic('A dictionary of the number of user actions per day') def user_actions_per_day(object): @@ -644,10 +663,93 @@ def user_actions_per_day(object): # count user_actions for each day found in entries 
res=object.user_actions.groupby(by=object.user_actions['Timestamp'].dt.date).count().iloc[:,0] + # create a Series with period's start and end times and value of 0 + init=pd.Series([0,0],index=[pd.to_datetime(start(object)).date(), pd.to_datetime(end(object)).date()]) + + # remove duplicate entries for corner cases where start and end time match + init.drop_duplicates(keep='first', inplace=True) + + # append above two indexes and values (i.e. 0) to the Series + # with axis=1, same indexes are being merged + # since dataframe is created, get the first column + res=pd.concat([res,init],ignore_index=False, axis=1).iloc[:, 0] + + # convert Nan values created by the concatenation to 0 + # and change data type back to int + res=res.fillna(0).astype(int) + # fill the in between days with zero user_actions res=res.asfreq('D', fill_value=0) - + # convert datetimeindex to string res.index=res.index.format() - return res.to_dict() + # convert series to dataframe with extra column having the dates + res = res.to_frame().reset_index() + + # rename columns to date, value + res.rename(columns={ res.columns[0]: "date", res.columns[1]: "value" }, inplace = True) + + # return a list of objects with date and value fields + return res.to_dict(orient='records') + +@metric('The mean value of the accuracy score found for each user defined by the fraction of the number of the correct predictions by the total number of predictions') +def accuracy(object): + """ + Calculate the accuracy score found for each user and retrieve the mean value. + The score is calculated by dividing the number of the correct predictions + by the total number of predictions.
+ """ + # a list of unique services' ids found in Datastore + services_list=object.services['Service'].unique().tolist() + # the length of the above value + len_services=services(object) + + def score(x): + """ + Inner function called at each row of the final dataframe + in order to calculate the accuracy score for each row (=user) + """ + # 'Services' header indicates the accessed services' list, + # while the 'Service' header indicates the recommended services' list + # if accessed or recommended services' list is empty + # it does not calculate any further computations + # else for each service found in services_list, + # put 1 or 0 if it is also found in the accessed or + # recommended services respectively + if not x['Services']: + true_values=np.array([0]*len_services) + else: + true_values=np.array(list(map(lambda s: 1 if s in x['Services'] else 0,services_list))) + if not x['Service']: + pred_values=np.array([0]*len_services) + else: + pred_values=np.array(list(map(lambda s: 1 if s in x['Service'] else 0,services_list))) + + # calculate the accuracy score by computing the average of the returned array + # The returned array is a True/False array when the respective element of true_values + # is equal or not to the respective element of pred_values + x['Services']=np.average(true_values==pred_values) + # return the row, where the 'Services' column has the accuracy score now + return x + + # a matrix of User ids and the respective accessed services' ids + access_df=object.users[['User','Services']] + + # a matrix of User ids and the respective recommended services' ids + rec_df=(object.recommendations[['User','Service']].groupby(['User']) + .agg({'Service': lambda x: x.unique().tolist()}) + .reset_index()) + + # performs a left join on User id, which means that nan values + # are set for cases where no recommendations were made + data=pd.merge(access_df, rec_df, on='User', how='left') + # convert nan values to zeros, in order to be handled easily by the inner 
function + data.fillna(0, inplace = True) + # apply the score function row-wise + data=data.apply(score, axis=1) + + # return the mean value of all users' accuracy score + # up to 4 digits precision + return round(data['Services'].mean(),4) + diff --git a/preprocessor.py b/preprocessor.py index e1bcea3..3882b1a 100755 --- a/preprocessor.py +++ b/preprocessor.py @@ -212,6 +212,9 @@ def __init__(self, source_page_id, target_page_id, order): _query=query.copy() _query['date'] = _query.pop('timestamp') for rec in recdb["recommendation"].find(_query).sort("user_id"): + # if dataset contains null references to user_ids replace them with the value -1 + if not rec["user_id"]: + rec["user_id"] = -1 recs.append({'user_id':int(rec['user_id']), 'resource_ids': list(map(lambda x: x['service_id'],rec['recommendation'])), 'resource_scores': list(map(lambda x: x['score'],rec['recommendation'])), @@ -224,8 +227,10 @@ def __init__(self, source_page_id, target_page_id, order): # store data to Mongo DB rsmetrics_db["user_actions"].delete_many({"provider":provider['name'], "ingestion":'batch'}) -rsmetrics_db["user_actions"].insert_many(luas) +if len(luas) > 0: + rsmetrics_db["user_actions"].insert_many(luas) rsmetrics_db["recommendations"].delete_many({"provider":provider['name'], "ingestion":'batch'}) -rsmetrics_db["recommendations"].insert_many(recs) +if len(recs) > 0: + rsmetrics_db["recommendations"].insert_many(recs) diff --git a/requirements.txt b/requirements.txt index b49446c..b7a16fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ beautifulsoup4==4.10.0 -certifi==2021.10.8 +certifi==2022.12.7 charset-normalizer==2.0.12 click==8.1.3 Flask==2.1.2 diff --git a/webservice/app.py b/webservice/app.py index 4b65a6f..ad0a458 100644 --- a/webservice/app.py +++ b/webservice/app.py @@ -102,7 +102,7 @@ def html_metrics(provider_name): result[stat_name] = get_statistic(provider_name, stat_name).get_json() metrics_needed = ['user_coverage', 'catalog_coverage', - 
'diversity', 'diversity_gini', 'novelty'] + 'diversity', 'diversity_gini', 'novelty', 'accuracy'] for metric_name in metrics_needed: result[metric_name] = get_metric(provider_name, metric_name).get_json() @@ -142,6 +142,27 @@ def html_kpis(provider_name): return render_template('./kpis.html', data=result) +@app.route("/ui/reports//graphs", strict_slashes=False) +def html_graphs(provider_name): + '''Serve html page about graphs per provider''' + reports = db_get_provider_names() + if not provider_name in reports: + abort(404) + + result = {} + + stats_needed = ['start', 'end'] + for stat_name in stats_needed: + result[stat_name] = get_statistic(provider_name, stat_name).get_json() + + result['timestamp'] = get_api_index(provider_name).get_json()['timestamp'] + result['sidebar_info'] = app.sidebar_info + result['report'] = provider_name + result['reports'] = reports + result['metric_active'] = None + + return render_template('./graphs.html', data=result) + @app.route("/ui/descriptions/metrics/", strict_slashes=False) def html_metric_description(metric_name): diff --git a/webservice/templates/graphs.html b/webservice/templates/graphs.html new file mode 100644 index 0000000..6d2b8df --- /dev/null +++ b/webservice/templates/graphs.html @@ -0,0 +1,353 @@ + + + + + + + + + Graphs + + + + + + + + + + + +
+
+ +
+
+ +
+
+
+ + + +
+
+
+
Report:
+
+
+
+
+
+ + + + + +
+
+
+
+
+
+ + +
+
Graphs +
Graphic visualisations of various metrics +
+
+
+
+ {% set start_data = data.start.value.split('.')[0].split(' ') %} + {% set end_data = data.end.value.split('.')[0].split(' ') %} + Start Date: +
{{start_data[0]}}
+ End Date: +
{{end_data[0]}}
+
+
+
+
+
+
+
+
User actions per day
+
+
+ +
+
+ +
+
+
+
Recommended items per day
+
+
+ +
+
+
+ + + +
+ + +
+
+
+ + + + + diff --git a/webservice/templates/kpis.html b/webservice/templates/kpis.html index fbd2633..4d082f2 100644 --- a/webservice/templates/kpis.html +++ b/webservice/templates/kpis.html @@ -160,6 +160,12 @@ KPIs +
  • + + + Graphs + +
  • Metrics Documentation
  • {%for key, item in data.sidebar_info.metric_descriptions.items() | sort %}
  • @@ -387,18 +393,23 @@
    /{{item.orders.of_total}}
  • +
  • + + + Graphs + +
  • Metrics Documentation
  • {%for key, item in data.sidebar_info.metric_descriptions.items() | sort %}
  • @@ -297,18 +303,22 @@
    Process diff --git a/webservice/templates/rsmetrics.html b/webservice/templates/rsmetrics.html index 5b9c6d2..1bbe663 100644 --- a/webservice/templates/rsmetrics.html +++ b/webservice/templates/rsmetrics.html @@ -163,6 +163,12 @@ KPIs
  • +
  • + + + Graphs + +
  • Metrics Documentation
  • {%for key, item in data.sidebar_info.metric_descriptions.items() | sort %}
  • @@ -478,8 +484,6 @@

    Catalog Coverage - -
    @@ -530,11 +534,6 @@

    Diversity (Gini Index)
     
    - - - - -
    @@ -619,7 +618,48 @@

    Novelty

    +
    +
    +
    +
    +
    +
    + + +
    +
    +
    +
    +

    Accuracy + + + + + +

    +
    +
    +
    + +
    +
    +
    + {{data.accuracy.doc}} +
    +
    +
    +
    {{data.accuracy.value}} +
    +
    +
    + +
    +
    + +
    +
     
    @@ -629,22 +669,29 @@

    Novelty