diff --git a/Yapiki_publ/Public_solution_KI_YP.ipynb b/Yapiki_publ/Public_solution_KI_YP.ipynb
new file mode 100644
index 0000000..196a42e
--- /dev/null
+++ b/Yapiki_publ/Public_solution_KI_YP.ipynb
@@ -0,0 +1,3330 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "id": "78b262fa"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import typing\n",
+ "import torch\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML\n",
+ "from lightautoml.tasks import Task\n",
+ "\n",
+ "import phik\n",
+ "from phik.report import plot_correlation_matrix\n",
+ "from phik import report"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "id": "5114ddf7"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((279792, 77), (2974, 76))"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_data = pd.read_csv('data/train.csv')\n",
+ "test_data = pd.read_csv('data/test.csv')\n",
+ "train_data.shape, test_data.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exploratory data analysis (EDA)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Отчёт pandas_profiling с анализом данных (внимание: файл большого размера): https://drive.google.com/file/d/1xQl3LvpX9J0G6gJoaBjzRcBFKZi6QZXz/view?usp=sharing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for col in train_data.select_dtypes(include=np.number).columns:\n",
+ " train_data[col] = pd.to_numeric(train_data[col], downcast = 'unsigned')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " lat | \n",
+ " lng | \n",
+ " osm_amenity_points_in_0.001 | \n",
+ " osm_amenity_points_in_0.005 | \n",
+ " osm_amenity_points_in_0.0075 | \n",
+ " osm_amenity_points_in_0.01 | \n",
+ " osm_building_points_in_0.001 | \n",
+ " osm_building_points_in_0.005 | \n",
+ " osm_building_points_in_0.0075 | \n",
+ " osm_building_points_in_0.01 | \n",
+ " ... | \n",
+ " reform_count_of_houses_500 | \n",
+ " reform_house_population_1000 | \n",
+ " reform_house_population_500 | \n",
+ " reform_mean_floor_count_1000 | \n",
+ " reform_mean_floor_count_500 | \n",
+ " reform_mean_year_building_1000 | \n",
+ " reform_mean_year_building_500 | \n",
+ " total_square | \n",
+ " realty_type | \n",
+ " price_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " ... | \n",
+ " 279792.000000 | \n",
+ " 265196.000000 | \n",
+ " 252558.000000 | \n",
+ " 263084.000000 | \n",
+ " 249624.000000 | \n",
+ " 263553.000000 | \n",
+ " 250155.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ " 279792.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 54.364078 | \n",
+ " 47.763540 | \n",
+ " 2.709084 | \n",
+ " 40.605146 | \n",
+ " 81.596171 | \n",
+ " 133.285458 | \n",
+ " 0.037442 | \n",
+ " 0.885701 | \n",
+ " 2.046467 | \n",
+ " 3.748163 | \n",
+ " ... | \n",
+ " 30.110661 | \n",
+ " 2042.541716 | \n",
+ " 644.610557 | \n",
+ " 7.051233 | \n",
+ " 7.360464 | \n",
+ " 1967.532599 | \n",
+ " 1967.988580 | \n",
+ " 507.833604 | \n",
+ " 54.974088 | \n",
+ " 0.016058 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 4.245713 | \n",
+ " 17.044625 | \n",
+ " 4.202451 | \n",
+ " 53.293388 | \n",
+ " 105.193169 | \n",
+ " 172.290136 | \n",
+ " 0.391014 | \n",
+ " 6.858338 | \n",
+ " 14.801566 | \n",
+ " 25.679859 | \n",
+ " ... | \n",
+ " 27.686234 | \n",
+ " 1359.884747 | \n",
+ " 445.699329 | \n",
+ " 3.542084 | \n",
+ " 4.231369 | \n",
+ " 45.807699 | \n",
+ " 54.110015 | \n",
+ " 1704.251771 | \n",
+ " 47.856417 | \n",
+ " 0.125700 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 42.651897 | \n",
+ " 19.892178 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 5.100000 | \n",
+ " 10.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 53.226600 | \n",
+ " 37.582988 | \n",
+ " 0.000000 | \n",
+ " 7.000000 | \n",
+ " 16.000000 | \n",
+ " 28.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 10.000000 | \n",
+ " 932.000000 | \n",
+ " 290.000000 | \n",
+ " 4.591837 | \n",
+ " 4.619959 | \n",
+ " 1960.070000 | \n",
+ " 1959.890097 | \n",
+ " 65.900000 | \n",
+ " 10.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 55.679090 | \n",
+ " 39.702435 | \n",
+ " 1.000000 | \n",
+ " 22.000000 | \n",
+ " 46.000000 | \n",
+ " 77.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 25.000000 | \n",
+ " 1949.000000 | \n",
+ " 602.000000 | \n",
+ " 6.368932 | \n",
+ " 6.395349 | \n",
+ " 1970.890411 | \n",
+ " 1971.647059 | \n",
+ " 128.737034 | \n",
+ " 10.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 56.306976 | \n",
+ " 55.957523 | \n",
+ " 4.000000 | \n",
+ " 51.000000 | \n",
+ " 101.000000 | \n",
+ " 164.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 2.000000 | \n",
+ " ... | \n",
+ " 43.000000 | \n",
+ " 2978.000000 | \n",
+ " 936.000000 | \n",
+ " 8.698925 | \n",
+ " 9.100000 | \n",
+ " 1983.701754 | \n",
+ " 1986.950000 | \n",
+ " 336.000000 | \n",
+ " 110.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 69.500740 | \n",
+ " 151.777000 | \n",
+ " 46.000000 | \n",
+ " 468.000000 | \n",
+ " 851.000000 | \n",
+ " 1392.000000 | \n",
+ " 30.000000 | \n",
+ " 586.000000 | \n",
+ " 949.000000 | \n",
+ " 1162.000000 | \n",
+ " ... | \n",
+ " 289.000000 | \n",
+ " 18392.000000 | \n",
+ " 6105.000000 | \n",
+ " 53.717949 | \n",
+ " 221.666667 | \n",
+ " 2019.000000 | \n",
+ " 2020.000000 | \n",
+ " 40000.000000 | \n",
+ " 110.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 70 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " lat lng osm_amenity_points_in_0.001 \\\n",
+ "count 279792.000000 279792.000000 279792.000000 \n",
+ "mean 54.364078 47.763540 2.709084 \n",
+ "std 4.245713 17.044625 4.202451 \n",
+ "min 42.651897 19.892178 0.000000 \n",
+ "25% 53.226600 37.582988 0.000000 \n",
+ "50% 55.679090 39.702435 1.000000 \n",
+ "75% 56.306976 55.957523 4.000000 \n",
+ "max 69.500740 151.777000 46.000000 \n",
+ "\n",
+ " osm_amenity_points_in_0.005 osm_amenity_points_in_0.0075 \\\n",
+ "count 279792.000000 279792.000000 \n",
+ "mean 40.605146 81.596171 \n",
+ "std 53.293388 105.193169 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 7.000000 16.000000 \n",
+ "50% 22.000000 46.000000 \n",
+ "75% 51.000000 101.000000 \n",
+ "max 468.000000 851.000000 \n",
+ "\n",
+ " osm_amenity_points_in_0.01 osm_building_points_in_0.001 \\\n",
+ "count 279792.000000 279792.000000 \n",
+ "mean 133.285458 0.037442 \n",
+ "std 172.290136 0.391014 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 28.000000 0.000000 \n",
+ "50% 77.000000 0.000000 \n",
+ "75% 164.000000 0.000000 \n",
+ "max 1392.000000 30.000000 \n",
+ "\n",
+ " osm_building_points_in_0.005 osm_building_points_in_0.0075 \\\n",
+ "count 279792.000000 279792.000000 \n",
+ "mean 0.885701 2.046467 \n",
+ "std 6.858338 14.801566 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 0.000000 \n",
+ "50% 0.000000 0.000000 \n",
+ "75% 0.000000 1.000000 \n",
+ "max 586.000000 949.000000 \n",
+ "\n",
+ " osm_building_points_in_0.01 ... reform_count_of_houses_500 \\\n",
+ "count 279792.000000 ... 279792.000000 \n",
+ "mean 3.748163 ... 30.110661 \n",
+ "std 25.679859 ... 27.686234 \n",
+ "min 0.000000 ... 0.000000 \n",
+ "25% 0.000000 ... 10.000000 \n",
+ "50% 0.000000 ... 25.000000 \n",
+ "75% 2.000000 ... 43.000000 \n",
+ "max 1162.000000 ... 289.000000 \n",
+ "\n",
+ " reform_house_population_1000 reform_house_population_500 \\\n",
+ "count 265196.000000 252558.000000 \n",
+ "mean 2042.541716 644.610557 \n",
+ "std 1359.884747 445.699329 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 932.000000 290.000000 \n",
+ "50% 1949.000000 602.000000 \n",
+ "75% 2978.000000 936.000000 \n",
+ "max 18392.000000 6105.000000 \n",
+ "\n",
+ " reform_mean_floor_count_1000 reform_mean_floor_count_500 \\\n",
+ "count 263084.000000 249624.000000 \n",
+ "mean 7.051233 7.360464 \n",
+ "std 3.542084 4.231369 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 4.591837 4.619959 \n",
+ "50% 6.368932 6.395349 \n",
+ "75% 8.698925 9.100000 \n",
+ "max 53.717949 221.666667 \n",
+ "\n",
+ " reform_mean_year_building_1000 reform_mean_year_building_500 \\\n",
+ "count 263553.000000 250155.000000 \n",
+ "mean 1967.532599 1967.988580 \n",
+ "std 45.807699 54.110015 \n",
+ "min 1.000000 1.000000 \n",
+ "25% 1960.070000 1959.890097 \n",
+ "50% 1970.890411 1971.647059 \n",
+ "75% 1983.701754 1986.950000 \n",
+ "max 2019.000000 2020.000000 \n",
+ "\n",
+ " total_square realty_type price_type \n",
+ "count 279792.000000 279792.000000 279792.000000 \n",
+ "mean 507.833604 54.974088 0.016058 \n",
+ "std 1704.251771 47.856417 0.125700 \n",
+ "min 5.100000 10.000000 0.000000 \n",
+ "25% 65.900000 10.000000 0.000000 \n",
+ "50% 128.737034 10.000000 0.000000 \n",
+ "75% 336.000000 110.000000 0.000000 \n",
+ "max 40000.000000 110.000000 1.000000 \n",
+ "\n",
+ "[8 rows x 70 columns]"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_data.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def check_features(df):\n",
+ " return pd.DataFrame({'unique_values': df.nunique(),'type': df.dtypes,'pct_missing': df.isna().sum()/len(df) * 100}).sort_values(by = 'pct_missing', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " floor | \n",
+ " reform_mean_floor_count_500 | \n",
+ " reform_mean_year_building_500 | \n",
+ " reform_house_population_500 | \n",
+ " reform_mean_floor_count_1000 | \n",
+ " reform_mean_year_building_1000 | \n",
+ " reform_house_population_1000 | \n",
+ " street | \n",
+ " osm_city_nearest_population | \n",
+ " region | \n",
+ " ... | \n",
+ " osm_finance_points_in_0.005 | \n",
+ " osm_finance_points_in_0.001 | \n",
+ " osm_culture_points_in_0.01 | \n",
+ " osm_culture_points_in_0.0075 | \n",
+ " osm_culture_points_in_0.005 | \n",
+ " osm_culture_points_in_0.001 | \n",
+ " osm_crossing_points_in_0.01 | \n",
+ " osm_crossing_points_in_0.0075 | \n",
+ " osm_crossing_points_in_0.005 | \n",
+ " price_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " unique_values | \n",
+ " 206 | \n",
+ " 17121 | \n",
+ " 43143 | \n",
+ " 2366 | \n",
+ " 49017 | \n",
+ " 76044 | \n",
+ " 6206 | \n",
+ " 28841 | \n",
+ " 169 | \n",
+ " 49 | \n",
+ " ... | \n",
+ " 29 | \n",
+ " 7 | \n",
+ " 216 | \n",
+ " 159 | \n",
+ " 111 | \n",
+ " 16 | \n",
+ " 268 | \n",
+ " 191 | \n",
+ " 108 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " type | \n",
+ " object | \n",
+ " float64 | \n",
+ " float64 | \n",
+ " float64 | \n",
+ " float64 | \n",
+ " float64 | \n",
+ " float64 | \n",
+ " object | \n",
+ " float64 | \n",
+ " object | \n",
+ " ... | \n",
+ " uint8 | \n",
+ " uint8 | \n",
+ " uint16 | \n",
+ " uint16 | \n",
+ " uint8 | \n",
+ " uint8 | \n",
+ " uint16 | \n",
+ " uint8 | \n",
+ " uint8 | \n",
+ " uint8 | \n",
+ "
\n",
+ " \n",
+ " pct_missing | \n",
+ " 62.9886 | \n",
+ " 10.7823 | \n",
+ " 10.5925 | \n",
+ " 9.73366 | \n",
+ " 5.97158 | \n",
+ " 5.80395 | \n",
+ " 5.21673 | \n",
+ " 0.573998 | \n",
+ " 0.0196575 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
3 rows × 77 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " floor reform_mean_floor_count_500 \\\n",
+ "unique_values 206 17121 \n",
+ "type object float64 \n",
+ "pct_missing 62.9886 10.7823 \n",
+ "\n",
+ " reform_mean_year_building_500 reform_house_population_500 \\\n",
+ "unique_values 43143 2366 \n",
+ "type float64 float64 \n",
+ "pct_missing 10.5925 9.73366 \n",
+ "\n",
+ " reform_mean_floor_count_1000 reform_mean_year_building_1000 \\\n",
+ "unique_values 49017 76044 \n",
+ "type float64 float64 \n",
+ "pct_missing 5.97158 5.80395 \n",
+ "\n",
+ " reform_house_population_1000 street \\\n",
+ "unique_values 6206 28841 \n",
+ "type float64 object \n",
+ "pct_missing 5.21673 0.573998 \n",
+ "\n",
+ " osm_city_nearest_population region ... \\\n",
+ "unique_values 169 49 ... \n",
+ "type float64 object ... \n",
+ "pct_missing 0.0196575 0 ... \n",
+ "\n",
+ " osm_finance_points_in_0.005 osm_finance_points_in_0.001 \\\n",
+ "unique_values 29 7 \n",
+ "type uint8 uint8 \n",
+ "pct_missing 0 0 \n",
+ "\n",
+ " osm_culture_points_in_0.01 osm_culture_points_in_0.0075 \\\n",
+ "unique_values 216 159 \n",
+ "type uint16 uint16 \n",
+ "pct_missing 0 0 \n",
+ "\n",
+ " osm_culture_points_in_0.005 osm_culture_points_in_0.001 \\\n",
+ "unique_values 111 16 \n",
+ "type uint8 uint8 \n",
+ "pct_missing 0 0 \n",
+ "\n",
+ " osm_crossing_points_in_0.01 osm_crossing_points_in_0.0075 \\\n",
+ "unique_values 268 191 \n",
+ "type uint16 uint8 \n",
+ "pct_missing 0 0 \n",
+ "\n",
+ " osm_crossing_points_in_0.005 price_type \n",
+ "unique_values 108 2 \n",
+ "type uint8 uint8 \n",
+ "pct_missing 0 0 \n",
+ "\n",
+ "[3 rows x 77 columns]"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "check_features(train_data).T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([nan, 3.0, 4.0, -1.0, 1.0, 14.0, 2.0, 8.0, -2.0, 6.0, 10.0, 5.0,\n",
+ " 16.0, 19.0, 7.0, 9.0, 58.0, 24.0, 18.0, 26.0, 17.0, 48.0, 11.0,\n",
+ " -3.0, 15.0, 22.0, 60.0, 12.0, 21.0, 35.0, 28.0, 38.0, 39.0, 13.0,\n",
+ " 81.0, 44.0, 82.0, 25.0, 45.0, 47.0, 23.0, 37.0, 29.0, 113.0, 78.0,\n",
+ " 42.0, 69.0, 27.0, 46.0, 53.0, 80.0, 70.0, 76.0, 64.0, 30.0, 73.0,\n",
+ " 77.0, 52.0, 67.0, 65.0, 20.0, 40.0, 49.0, 75.0, 93.0, 94.0, 91.0,\n",
+ " 72.0, 79.0, 84.0, 92.0, 33.0, 66.0, 90.0, 31.0, 36.0, 61.0, 71.0,\n",
+ " 68.0, 51.0, 97.0, 43.0, 95.0, 85.0, 50.0, 0.0, 62.0, 54.0, 74.0,\n",
+ " 57.0, 41.0, 34.0, 59.0, 56.0, 123.0, 55.0, 83.0, '27.0', '1.0',\n",
+ " '5.0', '-1.0', '67.0', '2.0', '0.0', '4.0', '6.0', '3.0', '15.0',\n",
+ " '10.0', '11.0', '30.0', '12.0', '-2.0', '14.0', '36.0', '8.0',\n",
+ " '50.0', '17.0', '19.0', '37.0', '68.0', '7.0', '42.0', '9.0',\n",
+ " '16.0', '20.0', '53.0', '91.0', '84.0', '38.0', '21.0', '48.0',\n",
+ " '22.0', '23.0', '1', '18.0', 'подвал, 1', '2', 'подвал',\n",
+ " 'цоколь, 1', '1,2,антресоль', 'цоколь', '4', '5', 'тех.этаж (6)',\n",
+ " '3', 'Подвал', 'Цоколь', '10', 'фактически на уровне 1 этажа', '6',\n",
+ " '1,2,3', '1, подвал', '1,2,3,4', '1,2', '1,2,3,4,5', '5, мансарда',\n",
+ " '1-й, подвал', '12', '15', '13', '1, подвал, антресоль', 'мезонин',\n",
+ " 'подвал, 1-3', '8', '7', '1 (Цокольный этаж)',\n",
+ " '3, Мансарда (4 эт)', 'подвал,1', '1, антресоль', '1-3',\n",
+ " 'мансарда (4эт)', '1, 2.', '9', 'подвал , 1 ', '1, 2',\n",
+ " 'подвал, 1,2,3', '1 + подвал (без отделки)', 'мансарда', '2,3',\n",
+ " '4, 5', '1-й, 2-й', '18', '1 этаж, подвал', '1, цоколь',\n",
+ " 'подвал, 1-7, техэтаж', '3 (антресоль)', '1, 2, 3',\n",
+ " 'Цоколь, 1,2(мансарда)', 'подвал, 3. 4 этаж', 'подвал, 1-4 этаж',\n",
+ " 'подва, 1.2 этаж', '2, 3', '-1', '1.2', '11', '36', '7,8',\n",
+ " '1 этаж', '1-й', '3 этаж', '4 этаж', '5 этаж', 'подвал,1,2,3,4,5',\n",
+ " '29', 'подвал, цоколь, 1 этаж', '3, мансарда'], dtype=object)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_data['floor'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
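+ "# Manually recode the floor column into a binary flag: 1 if the floor equals 1 (the strings '1' and '1.0' are first replaced with 1), otherwise 0\n",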
+ "for df in [train_data, test_data]:\n",
+ " df.replace('1', 1, inplace=True)\n",
+ " df.replace('1.0', 1, inplace=True)\n",
+ "\n",
+ "train_data['floor'] = train_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)\n",
+ "test_data['floor'] = test_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Manually dropped the features that are repeated for several radii, keeping only the \"in 0.01\" variant for osm and the \"1000\" variant for reform, so that the correlation matrix fits on the screen\n",
+ "columns_for_pearson = [\n",
+ " 'per_square_meter_price', \n",
+ " 'city', 'floor',\n",
+ " 'osm_amenity_points_in_0.01', \n",
+ " 'osm_building_points_in_0.01', \n",
+ " 'osm_catering_points_in_0.01', \n",
+ " 'osm_city_closest_dist', \n",
+ " 'osm_city_nearest_name', \n",
+ " 'osm_city_nearest_population',\n",
+ " 'osm_crossing_closest_dist', \n",
+ " 'osm_crossing_points_in_0.01', \n",
+ " 'osm_culture_points_in_0.01',\n",
+ " 'osm_healthcare_points_in_0.01', \n",
+ " 'osm_historic_points_in_0.01', \n",
+ " 'osm_hotels_points_in_0.01',\n",
+ " 'osm_leisure_points_in_0.01', \n",
+ " 'osm_offices_points_in_0.01', \n",
+ " 'osm_shops_points_in_0.01', \n",
+ " 'osm_subway_closest_dist',\n",
+ " 'osm_train_stop_closest_dist', \n",
+ " 'osm_train_stop_points_in_0.01', \n",
+ " 'osm_transport_stop_closest_dist',\n",
+ " 'osm_transport_stop_points_in_0.01', \n",
+ " 'reform_count_of_houses_1000', \n",
+ " 'reform_house_population_1000',\n",
+ " 'reform_mean_floor_count_1000', \n",
+ " 'reform_mean_year_building_1000', \n",
+ " 'region', \n",
+ " 'total_square', \n",
+ " 'street', \n",
+ " 'date', \n",
+ " 'realty_type', \n",
+ " 'price_type']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " feature | \n",
+ " pearson | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " per_square_meter_price | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " osm_city_nearest_population | \n",
+ " 0.55 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " osm_amenity_points_in_0.01 | \n",
+ " 0.48 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " osm_healthcare_points_in_0.01 | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " osm_catering_points_in_0.01 | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " osm_leisure_points_in_0.01 | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " osm_shops_points_in_0.01 | \n",
+ " 0.44 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " osm_transport_stop_points_in_0.01 | \n",
+ " 0.43 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " osm_crossing_points_in_0.01 | \n",
+ " 0.43 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " osm_offices_points_in_0.01 | \n",
+ " 0.42 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " feature pearson\n",
+ "0 per_square_meter_price 1.00\n",
+ "6 osm_city_nearest_population 0.55\n",
+ "2 osm_amenity_points_in_0.01 0.48\n",
+ "10 osm_healthcare_points_in_0.01 0.46\n",
+ "4 osm_catering_points_in_0.01 0.46\n",
+ "13 osm_leisure_points_in_0.01 0.46\n",
+ "15 osm_shops_points_in_0.01 0.44\n",
+ "20 osm_transport_stop_points_in_0.01 0.43\n",
+ "8 osm_crossing_points_in_0.01 0.43\n",
+ "14 osm_offices_points_in_0.01 0.42"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Find the features with the highest Pearson correlation coefficient with the target variable.\n",
+ "# The cutoff is an absolute value of 0.4\n",
+ "pearson = train_data[columns_for_pearson].corr().round(2)\n",
+ "pearson_max_corr = (\n",
+ " pearson['per_square_meter_price'].to_frame().reset_index()\n",
+ " .rename(columns={'per_square_meter_price':'pearson', 'index':'feature'})\n",
+ " .sort_values(by='pearson', ascending=False)\n",
+ " .query('pearson > 0.4 or pearson < -0.4')\n",
+ " )\n",
+ "pearson_max_corr"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# fig, ax = plt.subplots(figsize=(20, 20))\n",
+ "# sns.heatmap(train_data[columns_for_pearson].corr().round(2), annot=True, square=True, cmap='mako')\n",
+ "# ax.set_title(label = 'МАТРИЦА КОРРЕЛЯЦИИ ПРИЗНАКОВ $r$', fontdict={'fontsize': 15, 'fontweight': 'bold'})\n",
+ "# plt.show();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Высокая взаимная корреляция (> 0.8) у следующих переменных:\n",
+ "- amenity и catering, healthcare, office, shop\n",
+ "- catering и shop\n",
+ "- office и shop, catering\n",
+ "- healthcare и catering, office, shop\n",
+ "- transport и crossing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# city and street (encoded) are excluded from this list\n",
+ "\n",
+ "columns_for_phik = [\n",
+ " 'per_square_meter_price',\n",
+ " 'floor',\n",
+ " 'osm_amenity_points_in_0.01',\n",
+ " 'osm_building_points_in_0.01',\n",
+ " 'osm_catering_points_in_0.01',\n",
+ " 'osm_city_closest_dist',\n",
+ " 'osm_city_nearest_name',\n",
+ " 'osm_city_nearest_population',\n",
+ " 'osm_crossing_closest_dist',\n",
+ " 'osm_crossing_points_in_0.01',\n",
+ " 'osm_culture_points_in_0.01',\n",
+ " 'osm_healthcare_points_in_0.01',\n",
+ " 'osm_historic_points_in_0.01',\n",
+ " 'osm_hotels_points_in_0.01',\n",
+ " 'osm_leisure_points_in_0.01',\n",
+ " 'osm_offices_points_in_0.01',\n",
+ " 'osm_shops_points_in_0.01',\n",
+ " 'osm_subway_closest_dist',\n",
+ " 'osm_train_stop_closest_dist',\n",
+ " 'osm_train_stop_points_in_0.01',\n",
+ " 'osm_transport_stop_closest_dist',\n",
+ " 'osm_transport_stop_points_in_0.01',\n",
+ " 'reform_count_of_houses_1000',\n",
+ " 'reform_house_population_1000',\n",
+ " 'reform_mean_floor_count_1000',\n",
+ " 'reform_mean_year_building_1000',\n",
+ " 'region',\n",
+ " 'total_square',\n",
+ " 'date',\n",
+ " 'realty_type',\n",
+ " 'price_type']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " per_square_meter_price | \n",
+ " floor | \n",
+ " osm_amenity_points_in_0.01 | \n",
+ " osm_building_points_in_0.01 | \n",
+ " osm_catering_points_in_0.01 | \n",
+ " osm_city_closest_dist | \n",
+ " osm_city_nearest_name | \n",
+ " osm_city_nearest_population | \n",
+ " osm_crossing_closest_dist | \n",
+ " osm_crossing_points_in_0.01 | \n",
+ " ... | \n",
+ " osm_transport_stop_points_in_0.01 | \n",
+ " reform_count_of_houses_1000 | \n",
+ " reform_house_population_1000 | \n",
+ " reform_mean_floor_count_1000 | \n",
+ " reform_mean_year_building_1000 | \n",
+ " region | \n",
+ " total_square | \n",
+ " date | \n",
+ " realty_type | \n",
+ " price_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " per_square_meter_price | \n",
+ " 1.00 | \n",
+ " 0.18 | \n",
+ " 0.48 | \n",
+ " 0.02 | \n",
+ " 0.48 | \n",
+ " 0.05 | \n",
+ " 0.57 | \n",
+ " 0.55 | \n",
+ " 0.00 | \n",
+ " 0.57 | \n",
+ " ... | \n",
+ " 0.46 | \n",
+ " 0.16 | \n",
+ " 0.19 | \n",
+ " 0.23 | \n",
+ " 0.03 | \n",
+ " 0.52 | \n",
+ " 0.02 | \n",
+ " 0.11 | \n",
+ " 0.18 | \n",
+ " 0.05 | \n",
+ "
\n",
+ " \n",
+ " floor | \n",
+ " 0.18 | \n",
+ " 1.00 | \n",
+ " 0.07 | \n",
+ " 0.01 | \n",
+ " 0.06 | \n",
+ " 0.08 | \n",
+ " 0.28 | \n",
+ " 0.15 | \n",
+ " 0.01 | \n",
+ " 0.14 | \n",
+ " ... | \n",
+ " 0.12 | \n",
+ " 0.03 | \n",
+ " 0.10 | \n",
+ " 0.15 | \n",
+ " 0.00 | \n",
+ " 0.23 | \n",
+ " 0.01 | \n",
+ " 0.22 | \n",
+ " 0.04 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " osm_amenity_points_in_0.01 | \n",
+ " 0.48 | \n",
+ " 0.07 | \n",
+ " 1.00 | \n",
+ " 0.02 | \n",
+ " 0.96 | \n",
+ " 0.07 | \n",
+ " 0.56 | \n",
+ " 0.49 | \n",
+ " 0.00 | \n",
+ " 0.82 | \n",
+ " ... | \n",
+ " 0.69 | \n",
+ " 0.63 | \n",
+ " 0.62 | \n",
+ " 0.25 | \n",
+ " 0.04 | \n",
+ " 0.45 | \n",
+ " 0.05 | \n",
+ " 0.07 | \n",
+ " 0.18 | \n",
+ " 0.22 | \n",
+ "
\n",
+ " \n",
+ " osm_building_points_in_0.01 | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 1.00 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " 0.28 | \n",
+ " 0.17 | \n",
+ " 0.00 | \n",
+ " 0.11 | \n",
+ " ... | \n",
+ " 0.04 | \n",
+ " 0.03 | \n",
+ " 0.03 | \n",
+ " 0.03 | \n",
+ " 0.00 | \n",
+ " 0.22 | \n",
+ " 0.00 | \n",
+ " 0.01 | \n",
+ " 0.03 | \n",
+ " 0.01 | \n",
+ "
\n",
+ " \n",
+ " osm_catering_points_in_0.01 | \n",
+ " 0.48 | \n",
+ " 0.06 | \n",
+ " 0.96 | \n",
+ " 0.02 | \n",
+ " 1.00 | \n",
+ " 0.05 | \n",
+ " 0.51 | \n",
+ " 0.47 | \n",
+ " 0.00 | \n",
+ " 0.80 | \n",
+ " ... | \n",
+ " 0.63 | \n",
+ " 0.57 | \n",
+ " 0.60 | \n",
+ " 0.23 | \n",
+ " 0.03 | \n",
+ " 0.40 | \n",
+ " 0.05 | \n",
+ " 0.07 | \n",
+ " 0.19 | \n",
+ " 0.20 | \n",
+ "
\n",
+ " \n",
+ " osm_city_closest_dist | \n",
+ " 0.05 | \n",
+ " 0.08 | \n",
+ " 0.07 | \n",
+ " 0.00 | \n",
+ " 0.05 | \n",
+ " 1.00 | \n",
+ " 0.80 | \n",
+ " 0.11 | \n",
+ " 0.73 | \n",
+ " 0.22 | \n",
+ " ... | \n",
+ " 0.15 | \n",
+ " 0.09 | \n",
+ " 0.11 | \n",
+ " 0.13 | \n",
+ " 0.00 | \n",
+ " 0.40 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.06 | \n",
+ " 0.01 | \n",
+ "
\n",
+ " \n",
+ " osm_city_nearest_name | \n",
+ " 0.57 | \n",
+ " 0.28 | \n",
+ " 0.56 | \n",
+ " 0.28 | \n",
+ " 0.51 | \n",
+ " 0.80 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 0.42 | \n",
+ " 0.64 | \n",
+ " ... | \n",
+ " 0.62 | \n",
+ " 0.55 | \n",
+ " 0.48 | \n",
+ " 0.59 | \n",
+ " 0.44 | \n",
+ " 1.00 | \n",
+ " 0.12 | \n",
+ " 0.22 | \n",
+ " 0.37 | \n",
+ " 0.30 | \n",
+ "
\n",
+ " \n",
+ " osm_city_nearest_population | \n",
+ " 0.55 | \n",
+ " 0.15 | \n",
+ " 0.49 | \n",
+ " 0.17 | \n",
+ " 0.47 | \n",
+ " 0.11 | \n",
+ " 1.00 | \n",
+ " 1.00 | \n",
+ " 0.00 | \n",
+ " 0.60 | \n",
+ " ... | \n",
+ " 0.49 | \n",
+ " 0.20 | \n",
+ " 0.24 | \n",
+ " 0.31 | \n",
+ " 0.05 | \n",
+ " 0.97 | \n",
+ " 0.07 | \n",
+ " 0.14 | \n",
+ " 0.14 | \n",
+ " 0.07 | \n",
+ "
\n",
+ " \n",
+ " osm_crossing_closest_dist | \n",
+ " 0.00 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.73 | \n",
+ " 0.42 | \n",
+ " 0.00 | \n",
+ " 1.00 | \n",
+ " 0.00 | \n",
+ " ... | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " 0.07 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " osm_crossing_points_in_0.01 | \n",
+ " 0.57 | \n",
+ " 0.14 | \n",
+ " 0.82 | \n",
+ " 0.11 | \n",
+ " 0.80 | \n",
+ " 0.22 | \n",
+ " 0.64 | \n",
+ " 0.60 | \n",
+ " 0.00 | \n",
+ " 1.00 | \n",
+ " ... | \n",
+ " 0.77 | \n",
+ " 0.67 | \n",
+ " 0.59 | \n",
+ " 0.41 | \n",
+ " 0.17 | \n",
+ " 0.55 | \n",
+ " 0.18 | \n",
+ " 0.17 | \n",
+ " 0.31 | \n",
+ " 0.09 | \n",
+ "
\n",
+ " \n",
+ " osm_culture_points_in_0.01 | \n",
+ " 0.31 | \n",
+ " 0.04 | \n",
+ " 0.61 | \n",
+ " 0.00 | \n",
+ " 0.60 | \n",
+ " 0.01 | \n",
+ " 0.46 | \n",
+ " 0.22 | \n",
+ " 0.00 | \n",
+ " 0.62 | \n",
+ " ... | \n",
+ " 0.34 | \n",
+ " 0.37 | \n",
+ " 0.15 | \n",
+ " 0.12 | \n",
+ " 0.00 | \n",
+ " 0.39 | \n",
+ " 0.04 | \n",
+ " 0.03 | \n",
+ " 0.07 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " osm_healthcare_points_in_0.01 | \n",
+ " 0.46 | \n",
+ " 0.10 | \n",
+ " 0.85 | \n",
+ " 0.02 | \n",
+ " 0.80 | \n",
+ " 0.07 | \n",
+ " 0.56 | \n",
+ " 0.49 | \n",
+ " 0.00 | \n",
+ " 0.81 | \n",
+ " ... | \n",
+ " 0.68 | \n",
+ " 0.53 | \n",
+ " 0.42 | \n",
+ " 0.23 | \n",
+ " 0.05 | \n",
+ " 0.46 | \n",
+ " 0.06 | \n",
+ " 0.07 | \n",
+ " 0.13 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " osm_historic_points_in_0.01 | \n",
+ " 0.38 | \n",
+ " 0.05 | \n",
+ " 0.75 | \n",
+ " 0.01 | \n",
+ " 0.73 | \n",
+ " 0.04 | \n",
+ " 0.45 | \n",
+ " 0.35 | \n",
+ " 0.00 | \n",
+ " 0.76 | \n",
+ " ... | \n",
+ " 0.52 | \n",
+ " 0.52 | \n",
+ " 0.19 | \n",
+ " 0.19 | \n",
+ " 0.04 | \n",
+ " 0.33 | \n",
+ " 0.08 | \n",
+ " 0.05 | \n",
+ " 0.14 | \n",
+ " 0.05 | \n",
+ "
\n",
+ " \n",
+ " osm_hotels_points_in_0.01 | \n",
+ " 0.37 | \n",
+ " 0.06 | \n",
+ " 0.88 | \n",
+ " 0.00 | \n",
+ " 0.88 | \n",
+ " 0.03 | \n",
+ " 0.44 | \n",
+ " 0.42 | \n",
+ " 0.00 | \n",
+ " 0.71 | \n",
+ " ... | \n",
+ " 0.50 | \n",
+ " 0.52 | \n",
+ " 0.56 | \n",
+ " 0.17 | \n",
+ " 0.01 | \n",
+ " 0.36 | \n",
+ " 0.05 | \n",
+ " 0.05 | \n",
+ " 0.08 | \n",
+ " 0.21 | \n",
+ "
\n",
+ " \n",
+ " osm_leisure_points_in_0.01 | \n",
+ " 0.47 | \n",
+ " 0.10 | \n",
+ " 0.72 | \n",
+ " 0.03 | \n",
+ " 0.68 | \n",
+ " 0.07 | \n",
+ " 0.57 | \n",
+ " 0.50 | \n",
+ " 0.00 | \n",
+ " 0.71 | \n",
+ " ... | \n",
+ " 0.58 | \n",
+ " 0.40 | \n",
+ " 0.39 | \n",
+ " 0.27 | \n",
+ " 0.05 | \n",
+ " 0.48 | \n",
+ " 0.06 | \n",
+ " 0.08 | \n",
+ " 0.14 | \n",
+ " 0.09 | \n",
+ "
\n",
+ " \n",
+ " osm_offices_points_in_0.01 | \n",
+ " 0.47 | \n",
+ " 0.08 | \n",
+ " 0.89 | \n",
+ " 0.02 | \n",
+ " 0.87 | \n",
+ " 0.05 | \n",
+ " 0.57 | \n",
+ " 0.52 | \n",
+ " 0.00 | \n",
+ " 0.76 | \n",
+ " ... | \n",
+ " 0.61 | \n",
+ " 0.56 | \n",
+ " 0.59 | \n",
+ " 0.27 | \n",
+ " 0.04 | \n",
+ " 0.47 | \n",
+ " 0.05 | \n",
+ " 0.07 | \n",
+ " 0.19 | \n",
+ " 0.20 | \n",
+ "
\n",
+ " \n",
+ " osm_shops_points_in_0.01 | \n",
+ " 0.47 | \n",
+ " 0.07 | \n",
+ " 0.96 | \n",
+ " 0.02 | \n",
+ " 0.92 | \n",
+ " 0.07 | \n",
+ " 0.57 | \n",
+ " 0.49 | \n",
+ " 0.00 | \n",
+ " 0.79 | \n",
+ " ... | \n",
+ " 0.67 | \n",
+ " 0.64 | \n",
+ " 0.65 | \n",
+ " 0.25 | \n",
+ " 0.06 | \n",
+ " 0.47 | \n",
+ " 0.03 | \n",
+ " 0.07 | \n",
+ " 0.17 | \n",
+ " 0.20 | \n",
+ "
\n",
+ " \n",
+ " osm_subway_closest_dist | \n",
+ " 0.18 | \n",
+ " 0.09 | \n",
+ " 0.14 | \n",
+ " 0.12 | \n",
+ " 0.12 | \n",
+ " 0.41 | \n",
+ " 0.99 | \n",
+ " 0.31 | \n",
+ " 0.32 | \n",
+ " 0.20 | \n",
+ " ... | \n",
+ " 0.18 | \n",
+ " 0.12 | \n",
+ " 0.09 | \n",
+ " 0.15 | \n",
+ " 0.04 | \n",
+ " 0.91 | \n",
+ " 0.03 | \n",
+ " 0.06 | \n",
+ " 0.09 | \n",
+ " 0.04 | \n",
+ "
\n",
+ " \n",
+ " osm_train_stop_closest_dist | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.03 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.73 | \n",
+ " 0.88 | \n",
+ " 0.03 | \n",
+ " 0.79 | \n",
+ " 0.09 | \n",
+ " ... | \n",
+ " 0.06 | \n",
+ " 0.03 | \n",
+ " 0.05 | \n",
+ " 0.07 | \n",
+ " 0.00 | \n",
+ " 0.43 | \n",
+ " 0.00 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " osm_train_stop_points_in_0.01 | \n",
+ " 0.27 | \n",
+ " 0.02 | \n",
+ " 0.33 | \n",
+ " 0.00 | \n",
+ " 0.34 | \n",
+ " 0.01 | \n",
+ " 0.32 | \n",
+ " 0.29 | \n",
+ " 0.00 | \n",
+ " 0.46 | \n",
+ " ... | \n",
+ " 0.25 | \n",
+ " 0.12 | \n",
+ " 0.13 | \n",
+ " 0.12 | \n",
+ " 0.05 | \n",
+ " 0.26 | \n",
+ " 0.04 | \n",
+ " 0.04 | \n",
+ " 0.11 | \n",
+ " 0.03 | \n",
+ "
\n",
+ " \n",
+ " osm_transport_stop_closest_dist | \n",
+ " 0.00 | \n",
+ " 0.01 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.73 | \n",
+ " 0.25 | \n",
+ " 0.01 | \n",
+ " 0.88 | \n",
+ " 0.00 | \n",
+ " ... | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " 0.14 | \n",
+ " 0.00 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " osm_transport_stop_points_in_0.01 | \n",
+ " 0.46 | \n",
+ " 0.12 | \n",
+ " 0.69 | \n",
+ " 0.04 | \n",
+ " 0.63 | \n",
+ " 0.15 | \n",
+ " 0.62 | \n",
+ " 0.49 | \n",
+ " 0.02 | \n",
+ " 0.77 | \n",
+ " ... | \n",
+ " 1.00 | \n",
+ " 0.52 | \n",
+ " 0.45 | \n",
+ " 0.33 | \n",
+ " 0.05 | \n",
+ " 0.52 | \n",
+ " 0.05 | \n",
+ " 0.08 | \n",
+ " 0.20 | \n",
+ " 0.07 | \n",
+ "
\n",
+ " \n",
+ " reform_count_of_houses_1000 | \n",
+ " 0.16 | \n",
+ " 0.03 | \n",
+ " 0.63 | \n",
+ " 0.03 | \n",
+ " 0.57 | \n",
+ " 0.09 | \n",
+ " 0.55 | \n",
+ " 0.20 | \n",
+ " 0.01 | \n",
+ " 0.67 | \n",
+ " ... | \n",
+ " 0.52 | \n",
+ " 1.00 | \n",
+ " 0.55 | \n",
+ " 0.38 | \n",
+ " 0.06 | \n",
+ " 0.43 | \n",
+ " 0.05 | \n",
+ " 0.03 | \n",
+ " 0.13 | \n",
+ " 0.13 | \n",
+ "
\n",
+ " \n",
+ " reform_house_population_1000 | \n",
+ " 0.19 | \n",
+ " 0.10 | \n",
+ " 0.62 | \n",
+ " 0.03 | \n",
+ " 0.60 | \n",
+ " 0.11 | \n",
+ " 0.48 | \n",
+ " 0.24 | \n",
+ " 0.00 | \n",
+ " 0.59 | \n",
+ " ... | \n",
+ " 0.45 | \n",
+ " 0.55 | \n",
+ " 1.00 | \n",
+ " 0.31 | \n",
+ " 0.04 | \n",
+ " 0.36 | \n",
+ " 0.02 | \n",
+ " 0.04 | \n",
+ " 0.20 | \n",
+ " 0.16 | \n",
+ "
\n",
+ " \n",
+ " reform_mean_floor_count_1000 | \n",
+ " 0.23 | \n",
+ " 0.15 | \n",
+ " 0.25 | \n",
+ " 0.03 | \n",
+ " 0.23 | \n",
+ " 0.13 | \n",
+ " 0.59 | \n",
+ " 0.31 | \n",
+ " 0.01 | \n",
+ " 0.41 | \n",
+ " ... | \n",
+ " 0.33 | \n",
+ " 0.38 | \n",
+ " 0.31 | \n",
+ " 1.00 | \n",
+ " 0.08 | \n",
+ " 0.47 | \n",
+ " 0.02 | \n",
+ " 0.06 | \n",
+ " 0.14 | \n",
+ " 0.05 | \n",
+ "
\n",
+ " \n",
+ " reform_mean_year_building_1000 | \n",
+ " 0.03 | \n",
+ " 0.00 | \n",
+ " 0.04 | \n",
+ " 0.00 | \n",
+ " 0.03 | \n",
+ " 0.00 | \n",
+ " 0.44 | \n",
+ " 0.05 | \n",
+ " 0.00 | \n",
+ " 0.17 | \n",
+ " ... | \n",
+ " 0.05 | \n",
+ " 0.06 | \n",
+ " 0.04 | \n",
+ " 0.08 | \n",
+ " 1.00 | \n",
+ " 0.37 | \n",
+ " 0.00 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ "
\n",
+ " \n",
+ " region | \n",
+ " 0.52 | \n",
+ " 0.23 | \n",
+ " 0.45 | \n",
+ " 0.22 | \n",
+ " 0.40 | \n",
+ " 0.40 | \n",
+ " 1.00 | \n",
+ " 0.97 | \n",
+ " 0.07 | \n",
+ " 0.55 | \n",
+ " ... | \n",
+ " 0.52 | \n",
+ " 0.43 | \n",
+ " 0.36 | \n",
+ " 0.47 | \n",
+ " 0.37 | \n",
+ " 1.00 | \n",
+ " 0.08 | \n",
+ " 0.17 | \n",
+ " 0.28 | \n",
+ " 0.27 | \n",
+ "
\n",
+ " \n",
+ " total_square | \n",
+ " 0.02 | \n",
+ " 0.01 | \n",
+ " 0.05 | \n",
+ " 0.00 | \n",
+ " 0.05 | \n",
+ " 0.00 | \n",
+ " 0.12 | \n",
+ " 0.07 | \n",
+ " 0.00 | \n",
+ " 0.18 | \n",
+ " ... | \n",
+ " 0.05 | \n",
+ " 0.05 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 0.00 | \n",
+ " 0.08 | \n",
+ " 1.00 | \n",
+ " 0.02 | \n",
+ " 0.04 | \n",
+ " 0.01 | \n",
+ "
\n",
+ " \n",
+ " date | \n",
+ " 0.11 | \n",
+ " 0.22 | \n",
+ " 0.07 | \n",
+ " 0.01 | \n",
+ " 0.07 | \n",
+ " 0.02 | \n",
+ " 0.22 | \n",
+ " 0.14 | \n",
+ " 0.00 | \n",
+ " 0.17 | \n",
+ " ... | \n",
+ " 0.08 | \n",
+ " 0.03 | \n",
+ " 0.04 | \n",
+ " 0.06 | \n",
+ " 0.02 | \n",
+ " 0.17 | \n",
+ " 0.02 | \n",
+ " 1.00 | \n",
+ " 0.10 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " realty_type | \n",
+ " 0.18 | \n",
+ " 0.04 | \n",
+ " 0.18 | \n",
+ " 0.03 | \n",
+ " 0.19 | \n",
+ " 0.06 | \n",
+ " 0.37 | \n",
+ " 0.14 | \n",
+ " 0.01 | \n",
+ " 0.31 | \n",
+ " ... | \n",
+ " 0.20 | \n",
+ " 0.13 | \n",
+ " 0.20 | \n",
+ " 0.14 | \n",
+ " 0.02 | \n",
+ " 0.28 | \n",
+ " 0.04 | \n",
+ " 0.10 | \n",
+ " 1.00 | \n",
+ " 0.02 | \n",
+ "
\n",
+ " \n",
+ " price_type | \n",
+ " 0.05 | \n",
+ " 0.00 | \n",
+ " 0.22 | \n",
+ " 0.01 | \n",
+ " 0.20 | \n",
+ " 0.01 | \n",
+ " 0.30 | \n",
+ " 0.07 | \n",
+ " 0.00 | \n",
+ " 0.09 | \n",
+ " ... | \n",
+ " 0.07 | \n",
+ " 0.13 | \n",
+ " 0.16 | \n",
+ " 0.05 | \n",
+ " 0.00 | \n",
+ " 0.27 | \n",
+ " 0.01 | \n",
+ " 0.02 | \n",
+ " 0.02 | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
31 rows × 31 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " per_square_meter_price floor \\\n",
+ "per_square_meter_price 1.00 0.18 \n",
+ "floor 0.18 1.00 \n",
+ "osm_amenity_points_in_0.01 0.48 0.07 \n",
+ "osm_building_points_in_0.01 0.02 0.01 \n",
+ "osm_catering_points_in_0.01 0.48 0.06 \n",
+ "osm_city_closest_dist 0.05 0.08 \n",
+ "osm_city_nearest_name 0.57 0.28 \n",
+ "osm_city_nearest_population 0.55 0.15 \n",
+ "osm_crossing_closest_dist 0.00 0.01 \n",
+ "osm_crossing_points_in_0.01 0.57 0.14 \n",
+ "osm_culture_points_in_0.01 0.31 0.04 \n",
+ "osm_healthcare_points_in_0.01 0.46 0.10 \n",
+ "osm_historic_points_in_0.01 0.38 0.05 \n",
+ "osm_hotels_points_in_0.01 0.37 0.06 \n",
+ "osm_leisure_points_in_0.01 0.47 0.10 \n",
+ "osm_offices_points_in_0.01 0.47 0.08 \n",
+ "osm_shops_points_in_0.01 0.47 0.07 \n",
+ "osm_subway_closest_dist 0.18 0.09 \n",
+ "osm_train_stop_closest_dist 0.02 0.02 \n",
+ "osm_train_stop_points_in_0.01 0.27 0.02 \n",
+ "osm_transport_stop_closest_dist 0.00 0.01 \n",
+ "osm_transport_stop_points_in_0.01 0.46 0.12 \n",
+ "reform_count_of_houses_1000 0.16 0.03 \n",
+ "reform_house_population_1000 0.19 0.10 \n",
+ "reform_mean_floor_count_1000 0.23 0.15 \n",
+ "reform_mean_year_building_1000 0.03 0.00 \n",
+ "region 0.52 0.23 \n",
+ "total_square 0.02 0.01 \n",
+ "date 0.11 0.22 \n",
+ "realty_type 0.18 0.04 \n",
+ "price_type 0.05 0.00 \n",
+ "\n",
+ " osm_amenity_points_in_0.01 \\\n",
+ "per_square_meter_price 0.48 \n",
+ "floor 0.07 \n",
+ "osm_amenity_points_in_0.01 1.00 \n",
+ "osm_building_points_in_0.01 0.02 \n",
+ "osm_catering_points_in_0.01 0.96 \n",
+ "osm_city_closest_dist 0.07 \n",
+ "osm_city_nearest_name 0.56 \n",
+ "osm_city_nearest_population 0.49 \n",
+ "osm_crossing_closest_dist 0.00 \n",
+ "osm_crossing_points_in_0.01 0.82 \n",
+ "osm_culture_points_in_0.01 0.61 \n",
+ "osm_healthcare_points_in_0.01 0.85 \n",
+ "osm_historic_points_in_0.01 0.75 \n",
+ "osm_hotels_points_in_0.01 0.88 \n",
+ "osm_leisure_points_in_0.01 0.72 \n",
+ "osm_offices_points_in_0.01 0.89 \n",
+ "osm_shops_points_in_0.01 0.96 \n",
+ "osm_subway_closest_dist 0.14 \n",
+ "osm_train_stop_closest_dist 0.03 \n",
+ "osm_train_stop_points_in_0.01 0.33 \n",
+ "osm_transport_stop_closest_dist 0.00 \n",
+ "osm_transport_stop_points_in_0.01 0.69 \n",
+ "reform_count_of_houses_1000 0.63 \n",
+ "reform_house_population_1000 0.62 \n",
+ "reform_mean_floor_count_1000 0.25 \n",
+ "reform_mean_year_building_1000 0.04 \n",
+ "region 0.45 \n",
+ "total_square 0.05 \n",
+ "date 0.07 \n",
+ "realty_type 0.18 \n",
+ "price_type 0.22 \n",
+ "\n",
+ " osm_building_points_in_0.01 \\\n",
+ "per_square_meter_price 0.02 \n",
+ "floor 0.01 \n",
+ "osm_amenity_points_in_0.01 0.02 \n",
+ "osm_building_points_in_0.01 1.00 \n",
+ "osm_catering_points_in_0.01 0.02 \n",
+ "osm_city_closest_dist 0.00 \n",
+ "osm_city_nearest_name 0.28 \n",
+ "osm_city_nearest_population 0.17 \n",
+ "osm_crossing_closest_dist 0.00 \n",
+ "osm_crossing_points_in_0.01 0.11 \n",
+ "osm_culture_points_in_0.01 0.00 \n",
+ "osm_healthcare_points_in_0.01 0.02 \n",
+ "osm_historic_points_in_0.01 0.01 \n",
+ "osm_hotels_points_in_0.01 0.00 \n",
+ "osm_leisure_points_in_0.01 0.03 \n",
+ "osm_offices_points_in_0.01 0.02 \n",
+ "osm_shops_points_in_0.01 0.02 \n",
+ "osm_subway_closest_dist 0.12 \n",
+ "osm_train_stop_closest_dist 0.00 \n",
+ "osm_train_stop_points_in_0.01 0.00 \n",
+ "osm_transport_stop_closest_dist 0.00 \n",
+ "osm_transport_stop_points_in_0.01 0.04 \n",
+ "reform_count_of_houses_1000 0.03 \n",
+ "reform_house_population_1000 0.03 \n",
+ "reform_mean_floor_count_1000 0.03 \n",
+ "reform_mean_year_building_1000 0.00 \n",
+ "region 0.22 \n",
+ "total_square 0.00 \n",
+ "date 0.01 \n",
+ "realty_type 0.03 \n",
+ "price_type 0.01 \n",
+ "\n",
+ " osm_catering_points_in_0.01 \\\n",
+ "per_square_meter_price 0.48 \n",
+ "floor 0.06 \n",
+ "osm_amenity_points_in_0.01 0.96 \n",
+ "osm_building_points_in_0.01 0.02 \n",
+ "osm_catering_points_in_0.01 1.00 \n",
+ "osm_city_closest_dist 0.05 \n",
+ "osm_city_nearest_name 0.51 \n",
+ "osm_city_nearest_population 0.47 \n",
+ "osm_crossing_closest_dist 0.00 \n",
+ "osm_crossing_points_in_0.01 0.80 \n",
+ "osm_culture_points_in_0.01 0.60 \n",
+ "osm_healthcare_points_in_0.01 0.80 \n",
+ "osm_historic_points_in_0.01 0.73 \n",
+ "osm_hotels_points_in_0.01 0.88 \n",
+ "osm_leisure_points_in_0.01 0.68 \n",
+ "osm_offices_points_in_0.01 0.87 \n",
+ "osm_shops_points_in_0.01 0.92 \n",
+ "osm_subway_closest_dist 0.12 \n",
+ "osm_train_stop_closest_dist 0.02 \n",
+ "osm_train_stop_points_in_0.01 0.34 \n",
+ "osm_transport_stop_closest_dist 0.00 \n",
+ "osm_transport_stop_points_in_0.01 0.63 \n",
+ "reform_count_of_houses_1000 0.57 \n",
+ "reform_house_population_1000 0.60 \n",
+ "reform_mean_floor_count_1000 0.23 \n",
+ "reform_mean_year_building_1000 0.03 \n",
+ "region 0.40 \n",
+ "total_square 0.05 \n",
+ "date 0.07 \n",
+ "realty_type 0.19 \n",
+ "price_type 0.20 \n",
+ "\n",
+ " osm_city_closest_dist \\\n",
+ "per_square_meter_price 0.05 \n",
+ "floor 0.08 \n",
+ "osm_amenity_points_in_0.01 0.07 \n",
+ "osm_building_points_in_0.01 0.00 \n",
+ "osm_catering_points_in_0.01 0.05 \n",
+ "osm_city_closest_dist 1.00 \n",
+ "osm_city_nearest_name 0.80 \n",
+ "osm_city_nearest_population 0.11 \n",
+ "osm_crossing_closest_dist 0.73 \n",
+ "osm_crossing_points_in_0.01 0.22 \n",
+ "osm_culture_points_in_0.01 0.01 \n",
+ "osm_healthcare_points_in_0.01 0.07 \n",
+ "osm_historic_points_in_0.01 0.04 \n",
+ "osm_hotels_points_in_0.01 0.03 \n",
+ "osm_leisure_points_in_0.01 0.07 \n",
+ "osm_offices_points_in_0.01 0.05 \n",
+ "osm_shops_points_in_0.01 0.07 \n",
+ "osm_subway_closest_dist 0.41 \n",
+ "osm_train_stop_closest_dist 0.73 \n",
+ "osm_train_stop_points_in_0.01 0.01 \n",
+ "osm_transport_stop_closest_dist 0.73 \n",
+ "osm_transport_stop_points_in_0.01 0.15 \n",
+ "reform_count_of_houses_1000 0.09 \n",
+ "reform_house_population_1000 0.11 \n",
+ "reform_mean_floor_count_1000 0.13 \n",
+ "reform_mean_year_building_1000 0.00 \n",
+ "region 0.40 \n",
+ "total_square 0.00 \n",
+ "date 0.02 \n",
+ "realty_type 0.06 \n",
+ "price_type 0.01 \n",
+ "\n",
+ " osm_city_nearest_name \\\n",
+ "per_square_meter_price 0.57 \n",
+ "floor 0.28 \n",
+ "osm_amenity_points_in_0.01 0.56 \n",
+ "osm_building_points_in_0.01 0.28 \n",
+ "osm_catering_points_in_0.01 0.51 \n",
+ "osm_city_closest_dist 0.80 \n",
+ "osm_city_nearest_name 1.00 \n",
+ "osm_city_nearest_population 1.00 \n",
+ "osm_crossing_closest_dist 0.42 \n",
+ "osm_crossing_points_in_0.01 0.64 \n",
+ "osm_culture_points_in_0.01 0.46 \n",
+ "osm_healthcare_points_in_0.01 0.56 \n",
+ "osm_historic_points_in_0.01 0.45 \n",
+ "osm_hotels_points_in_0.01 0.44 \n",
+ "osm_leisure_points_in_0.01 0.57 \n",
+ "osm_offices_points_in_0.01 0.57 \n",
+ "osm_shops_points_in_0.01 0.57 \n",
+ "osm_subway_closest_dist 0.99 \n",
+ "osm_train_stop_closest_dist 0.88 \n",
+ "osm_train_stop_points_in_0.01 0.32 \n",
+ "osm_transport_stop_closest_dist 0.25 \n",
+ "osm_transport_stop_points_in_0.01 0.62 \n",
+ "reform_count_of_houses_1000 0.55 \n",
+ "reform_house_population_1000 0.48 \n",
+ "reform_mean_floor_count_1000 0.59 \n",
+ "reform_mean_year_building_1000 0.44 \n",
+ "region 1.00 \n",
+ "total_square 0.12 \n",
+ "date 0.22 \n",
+ "realty_type 0.37 \n",
+ "price_type 0.30 \n",
+ "\n",
+ " osm_city_nearest_population \\\n",
+ "per_square_meter_price 0.55 \n",
+ "floor 0.15 \n",
+ "osm_amenity_points_in_0.01 0.49 \n",
+ "osm_building_points_in_0.01 0.17 \n",
+ "osm_catering_points_in_0.01 0.47 \n",
+ "osm_city_closest_dist 0.11 \n",
+ "osm_city_nearest_name 1.00 \n",
+ "osm_city_nearest_population 1.00 \n",
+ "osm_crossing_closest_dist 0.00 \n",
+ "osm_crossing_points_in_0.01 0.60 \n",
+ "osm_culture_points_in_0.01 0.22 \n",
+ "osm_healthcare_points_in_0.01 0.49 \n",
+ "osm_historic_points_in_0.01 0.35 \n",
+ "osm_hotels_points_in_0.01 0.42 \n",
+ "osm_leisure_points_in_0.01 0.50 \n",
+ "osm_offices_points_in_0.01 0.52 \n",
+ "osm_shops_points_in_0.01 0.49 \n",
+ "osm_subway_closest_dist 0.31 \n",
+ "osm_train_stop_closest_dist 0.03 \n",
+ "osm_train_stop_points_in_0.01 0.29 \n",
+ "osm_transport_stop_closest_dist 0.01 \n",
+ "osm_transport_stop_points_in_0.01 0.49 \n",
+ "reform_count_of_houses_1000 0.20 \n",
+ "reform_house_population_1000 0.24 \n",
+ "reform_mean_floor_count_1000 0.31 \n",
+ "reform_mean_year_building_1000 0.05 \n",
+ "region 0.97 \n",
+ "total_square 0.07 \n",
+ "date 0.14 \n",
+ "realty_type 0.14 \n",
+ "price_type 0.07 \n",
+ "\n",
+ " osm_crossing_closest_dist \\\n",
+ "per_square_meter_price 0.00 \n",
+ "floor 0.01 \n",
+ "osm_amenity_points_in_0.01 0.00 \n",
+ "osm_building_points_in_0.01 0.00 \n",
+ "osm_catering_points_in_0.01 0.00 \n",
+ "osm_city_closest_dist 0.73 \n",
+ "osm_city_nearest_name 0.42 \n",
+ "osm_city_nearest_population 0.00 \n",
+ "osm_crossing_closest_dist 1.00 \n",
+ "osm_crossing_points_in_0.01 0.00 \n",
+ "osm_culture_points_in_0.01 0.00 \n",
+ "osm_healthcare_points_in_0.01 0.00 \n",
+ "osm_historic_points_in_0.01 0.00 \n",
+ "osm_hotels_points_in_0.01 0.00 \n",
+ "osm_leisure_points_in_0.01 0.00 \n",
+ "osm_offices_points_in_0.01 0.00 \n",
+ "osm_shops_points_in_0.01 0.00 \n",
+ "osm_subway_closest_dist 0.32 \n",
+ "osm_train_stop_closest_dist 0.79 \n",
+ "osm_train_stop_points_in_0.01 0.00 \n",
+ "osm_transport_stop_closest_dist 0.88 \n",
+ "osm_transport_stop_points_in_0.01 0.02 \n",
+ "reform_count_of_houses_1000 0.01 \n",
+ "reform_house_population_1000 0.00 \n",
+ "reform_mean_floor_count_1000 0.01 \n",
+ "reform_mean_year_building_1000 0.00 \n",
+ "region 0.07 \n",
+ "total_square 0.00 \n",
+ "date 0.00 \n",
+ "realty_type 0.01 \n",
+ "price_type 0.00 \n",
+ "\n",
+ " osm_crossing_points_in_0.01 ... \\\n",
+ "per_square_meter_price 0.57 ... \n",
+ "floor 0.14 ... \n",
+ "osm_amenity_points_in_0.01 0.82 ... \n",
+ "osm_building_points_in_0.01 0.11 ... \n",
+ "osm_catering_points_in_0.01 0.80 ... \n",
+ "osm_city_closest_dist 0.22 ... \n",
+ "osm_city_nearest_name 0.64 ... \n",
+ "osm_city_nearest_population 0.60 ... \n",
+ "osm_crossing_closest_dist 0.00 ... \n",
+ "osm_crossing_points_in_0.01 1.00 ... \n",
+ "osm_culture_points_in_0.01 0.62 ... \n",
+ "osm_healthcare_points_in_0.01 0.81 ... \n",
+ "osm_historic_points_in_0.01 0.76 ... \n",
+ "osm_hotels_points_in_0.01 0.71 ... \n",
+ "osm_leisure_points_in_0.01 0.71 ... \n",
+ "osm_offices_points_in_0.01 0.76 ... \n",
+ "osm_shops_points_in_0.01 0.79 ... \n",
+ "osm_subway_closest_dist 0.20 ... \n",
+ "osm_train_stop_closest_dist 0.09 ... \n",
+ "osm_train_stop_points_in_0.01 0.46 ... \n",
+ "osm_transport_stop_closest_dist 0.00 ... \n",
+ "osm_transport_stop_points_in_0.01 0.77 ... \n",
+ "reform_count_of_houses_1000 0.67 ... \n",
+ "reform_house_population_1000 0.59 ... \n",
+ "reform_mean_floor_count_1000 0.41 ... \n",
+ "reform_mean_year_building_1000 0.17 ... \n",
+ "region 0.55 ... \n",
+ "total_square 0.18 ... \n",
+ "date 0.17 ... \n",
+ "realty_type 0.31 ... \n",
+ "price_type 0.09 ... \n",
+ "\n",
+ " osm_transport_stop_points_in_0.01 \\\n",
+ "per_square_meter_price 0.46 \n",
+ "floor 0.12 \n",
+ "osm_amenity_points_in_0.01 0.69 \n",
+ "osm_building_points_in_0.01 0.04 \n",
+ "osm_catering_points_in_0.01 0.63 \n",
+ "osm_city_closest_dist 0.15 \n",
+ "osm_city_nearest_name 0.62 \n",
+ "osm_city_nearest_population 0.49 \n",
+ "osm_crossing_closest_dist 0.02 \n",
+ "osm_crossing_points_in_0.01 0.77 \n",
+ "osm_culture_points_in_0.01 0.34 \n",
+ "osm_healthcare_points_in_0.01 0.68 \n",
+ "osm_historic_points_in_0.01 0.52 \n",
+ "osm_hotels_points_in_0.01 0.50 \n",
+ "osm_leisure_points_in_0.01 0.58 \n",
+ "osm_offices_points_in_0.01 0.61 \n",
+ "osm_shops_points_in_0.01 0.67 \n",
+ "osm_subway_closest_dist 0.18 \n",
+ "osm_train_stop_closest_dist 0.06 \n",
+ "osm_train_stop_points_in_0.01 0.25 \n",
+ "osm_transport_stop_closest_dist 0.02 \n",
+ "osm_transport_stop_points_in_0.01 1.00 \n",
+ "reform_count_of_houses_1000 0.52 \n",
+ "reform_house_population_1000 0.45 \n",
+ "reform_mean_floor_count_1000 0.33 \n",
+ "reform_mean_year_building_1000 0.05 \n",
+ "region 0.52 \n",
+ "total_square 0.05 \n",
+ "date 0.08 \n",
+ "realty_type 0.20 \n",
+ "price_type 0.07 \n",
+ "\n",
+ " reform_count_of_houses_1000 \\\n",
+ "per_square_meter_price 0.16 \n",
+ "floor 0.03 \n",
+ "osm_amenity_points_in_0.01 0.63 \n",
+ "osm_building_points_in_0.01 0.03 \n",
+ "osm_catering_points_in_0.01 0.57 \n",
+ "osm_city_closest_dist 0.09 \n",
+ "osm_city_nearest_name 0.55 \n",
+ "osm_city_nearest_population 0.20 \n",
+ "osm_crossing_closest_dist 0.01 \n",
+ "osm_crossing_points_in_0.01 0.67 \n",
+ "osm_culture_points_in_0.01 0.37 \n",
+ "osm_healthcare_points_in_0.01 0.53 \n",
+ "osm_historic_points_in_0.01 0.52 \n",
+ "osm_hotels_points_in_0.01 0.52 \n",
+ "osm_leisure_points_in_0.01 0.40 \n",
+ "osm_offices_points_in_0.01 0.56 \n",
+ "osm_shops_points_in_0.01 0.64 \n",
+ "osm_subway_closest_dist 0.12 \n",
+ "osm_train_stop_closest_dist 0.03 \n",
+ "osm_train_stop_points_in_0.01 0.12 \n",
+ "osm_transport_stop_closest_dist 0.01 \n",
+ "osm_transport_stop_points_in_0.01 0.52 \n",
+ "reform_count_of_houses_1000 1.00 \n",
+ "reform_house_population_1000 0.55 \n",
+ "reform_mean_floor_count_1000 0.38 \n",
+ "reform_mean_year_building_1000 0.06 \n",
+ "region 0.43 \n",
+ "total_square 0.05 \n",
+ "date 0.03 \n",
+ "realty_type 0.13 \n",
+ "price_type 0.13 \n",
+ "\n",
+ " reform_house_population_1000 \\\n",
+ "per_square_meter_price 0.19 \n",
+ "floor 0.10 \n",
+ "osm_amenity_points_in_0.01 0.62 \n",
+ "osm_building_points_in_0.01 0.03 \n",
+ "osm_catering_points_in_0.01 0.60 \n",
+ "osm_city_closest_dist 0.11 \n",
+ "osm_city_nearest_name 0.48 \n",
+ "osm_city_nearest_population 0.24 \n",
+ "osm_crossing_closest_dist 0.00 \n",
+ "osm_crossing_points_in_0.01 0.59 \n",
+ "osm_culture_points_in_0.01 0.15 \n",
+ "osm_healthcare_points_in_0.01 0.42 \n",
+ "osm_historic_points_in_0.01 0.19 \n",
+ "osm_hotels_points_in_0.01 0.56 \n",
+ "osm_leisure_points_in_0.01 0.39 \n",
+ "osm_offices_points_in_0.01 0.59 \n",
+ "osm_shops_points_in_0.01 0.65 \n",
+ "osm_subway_closest_dist 0.09 \n",
+ "osm_train_stop_closest_dist 0.05 \n",
+ "osm_train_stop_points_in_0.01 0.13 \n",
+ "osm_transport_stop_closest_dist 0.01 \n",
+ "osm_transport_stop_points_in_0.01 0.45 \n",
+ "reform_count_of_houses_1000 0.55 \n",
+ "reform_house_population_1000 1.00 \n",
+ "reform_mean_floor_count_1000 0.31 \n",
+ "reform_mean_year_building_1000 0.04 \n",
+ "region 0.36 \n",
+ "total_square 0.02 \n",
+ "date 0.04 \n",
+ "realty_type 0.20 \n",
+ "price_type 0.16 \n",
+ "\n",
+ " reform_mean_floor_count_1000 \\\n",
+ "per_square_meter_price 0.23 \n",
+ "floor 0.15 \n",
+ "osm_amenity_points_in_0.01 0.25 \n",
+ "osm_building_points_in_0.01 0.03 \n",
+ "osm_catering_points_in_0.01 0.23 \n",
+ "osm_city_closest_dist 0.13 \n",
+ "osm_city_nearest_name 0.59 \n",
+ "osm_city_nearest_population 0.31 \n",
+ "osm_crossing_closest_dist 0.01 \n",
+ "osm_crossing_points_in_0.01 0.41 \n",
+ "osm_culture_points_in_0.01 0.12 \n",
+ "osm_healthcare_points_in_0.01 0.23 \n",
+ "osm_historic_points_in_0.01 0.19 \n",
+ "osm_hotels_points_in_0.01 0.17 \n",
+ "osm_leisure_points_in_0.01 0.27 \n",
+ "osm_offices_points_in_0.01 0.27 \n",
+ "osm_shops_points_in_0.01 0.25 \n",
+ "osm_subway_closest_dist 0.15 \n",
+ "osm_train_stop_closest_dist 0.07 \n",
+ "osm_train_stop_points_in_0.01 0.12 \n",
+ "osm_transport_stop_closest_dist 0.02 \n",
+ "osm_transport_stop_points_in_0.01 0.33 \n",
+ "reform_count_of_houses_1000 0.38 \n",
+ "reform_house_population_1000 0.31 \n",
+ "reform_mean_floor_count_1000 1.00 \n",
+ "reform_mean_year_building_1000 0.08 \n",
+ "region 0.47 \n",
+ "total_square 0.02 \n",
+ "date 0.06 \n",
+ "realty_type 0.14 \n",
+ "price_type 0.05 \n",
+ "\n",
+ " reform_mean_year_building_1000 region \\\n",
+ "per_square_meter_price 0.03 0.52 \n",
+ "floor 0.00 0.23 \n",
+ "osm_amenity_points_in_0.01 0.04 0.45 \n",
+ "osm_building_points_in_0.01 0.00 0.22 \n",
+ "osm_catering_points_in_0.01 0.03 0.40 \n",
+ "osm_city_closest_dist 0.00 0.40 \n",
+ "osm_city_nearest_name 0.44 1.00 \n",
+ "osm_city_nearest_population 0.05 0.97 \n",
+ "osm_crossing_closest_dist 0.00 0.07 \n",
+ "osm_crossing_points_in_0.01 0.17 0.55 \n",
+ "osm_culture_points_in_0.01 0.00 0.39 \n",
+ "osm_healthcare_points_in_0.01 0.05 0.46 \n",
+ "osm_historic_points_in_0.01 0.04 0.33 \n",
+ "osm_hotels_points_in_0.01 0.01 0.36 \n",
+ "osm_leisure_points_in_0.01 0.05 0.48 \n",
+ "osm_offices_points_in_0.01 0.04 0.47 \n",
+ "osm_shops_points_in_0.01 0.06 0.47 \n",
+ "osm_subway_closest_dist 0.04 0.91 \n",
+ "osm_train_stop_closest_dist 0.00 0.43 \n",
+ "osm_train_stop_points_in_0.01 0.05 0.26 \n",
+ "osm_transport_stop_closest_dist 0.00 0.14 \n",
+ "osm_transport_stop_points_in_0.01 0.05 0.52 \n",
+ "reform_count_of_houses_1000 0.06 0.43 \n",
+ "reform_house_population_1000 0.04 0.36 \n",
+ "reform_mean_floor_count_1000 0.08 0.47 \n",
+ "reform_mean_year_building_1000 1.00 0.37 \n",
+ "region 0.37 1.00 \n",
+ "total_square 0.00 0.08 \n",
+ "date 0.02 0.17 \n",
+ "realty_type 0.02 0.28 \n",
+ "price_type 0.00 0.27 \n",
+ "\n",
+ " total_square date realty_type price_type \n",
+ "per_square_meter_price 0.02 0.11 0.18 0.05 \n",
+ "floor 0.01 0.22 0.04 0.00 \n",
+ "osm_amenity_points_in_0.01 0.05 0.07 0.18 0.22 \n",
+ "osm_building_points_in_0.01 0.00 0.01 0.03 0.01 \n",
+ "osm_catering_points_in_0.01 0.05 0.07 0.19 0.20 \n",
+ "osm_city_closest_dist 0.00 0.02 0.06 0.01 \n",
+ "osm_city_nearest_name 0.12 0.22 0.37 0.30 \n",
+ "osm_city_nearest_population 0.07 0.14 0.14 0.07 \n",
+ "osm_crossing_closest_dist 0.00 0.00 0.01 0.00 \n",
+ "osm_crossing_points_in_0.01 0.18 0.17 0.31 0.09 \n",
+ "osm_culture_points_in_0.01 0.04 0.03 0.07 0.04 \n",
+ "osm_healthcare_points_in_0.01 0.06 0.07 0.13 0.04 \n",
+ "osm_historic_points_in_0.01 0.08 0.05 0.14 0.05 \n",
+ "osm_hotels_points_in_0.01 0.05 0.05 0.08 0.21 \n",
+ "osm_leisure_points_in_0.01 0.06 0.08 0.14 0.09 \n",
+ "osm_offices_points_in_0.01 0.05 0.07 0.19 0.20 \n",
+ "osm_shops_points_in_0.01 0.03 0.07 0.17 0.20 \n",
+ "osm_subway_closest_dist 0.03 0.06 0.09 0.04 \n",
+ "osm_train_stop_closest_dist 0.00 0.01 0.02 0.00 \n",
+ "osm_train_stop_points_in_0.01 0.04 0.04 0.11 0.03 \n",
+ "osm_transport_stop_closest_dist 0.00 0.00 0.02 0.00 \n",
+ "osm_transport_stop_points_in_0.01 0.05 0.08 0.20 0.07 \n",
+ "reform_count_of_houses_1000 0.05 0.03 0.13 0.13 \n",
+ "reform_house_population_1000 0.02 0.04 0.20 0.16 \n",
+ "reform_mean_floor_count_1000 0.02 0.06 0.14 0.05 \n",
+ "reform_mean_year_building_1000 0.00 0.02 0.02 0.00 \n",
+ "region 0.08 0.17 0.28 0.27 \n",
+ "total_square 1.00 0.02 0.04 0.01 \n",
+ "date 0.02 1.00 0.10 0.02 \n",
+ "realty_type 0.04 0.10 1.00 0.02 \n",
+ "price_type 0.01 0.02 0.02 1.00 \n",
+ "\n",
+ "[31 rows x 31 columns]"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# считаем корреляцию phik - она позволяет находить взаимосвязи, в том числе между категориальными переменными.\n",
+    "# направление взаимосвязи не видно, только абсолютное значение. Чем ближе к единице, тем сильнее взаимосвязь\n",
+ "\n",
+ "# выделяем интервальные переменные\n",
+ "interval_cols = ['osm_amenity_points_in_0.01', 'osm_building_points_in_0.01',\n",
+ " 'osm_catering_points_in_0.01', 'osm_city_closest_dist',\n",
+ " 'osm_city_nearest_population','osm_crossing_closest_dist', 'osm_crossing_points_in_0.001', 'osm_culture_points_in_0.01',\n",
+ " 'osm_healthcare_points_in_0.01', 'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.01',\n",
+ " 'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.01',\n",
+ " 'osm_shops_points_in_0.01', 'osm_subway_closest_dist',\n",
+ " 'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.01',\n",
+ " 'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.01', 'per_square_meter_price',\n",
+ " 'reform_count_of_houses_1000', 'reform_house_population_1000', \n",
+ " 'reform_mean_floor_count_1000', 'reform_mean_year_building_1000',\n",
+ " 'total_square', 'realty_type', 'price_type', 'many_floors',\n",
+ " 'city', 'street']\n",
+ "\n",
+ "# строим матрицу корреляции\n",
+ "phik_overview = train_data[columns_for_phik].phik_matrix(interval_cols=interval_cols)\n",
+ "\n",
+ "phik_overview.round(2)\n",
+ "\n",
+ "# визуализируем с помощью тепловой карты\n",
+ "# fig, ax = plt.subplots(figsize=(20, 20))\n",
+ "# sns.heatmap(phik_overview.round(2), annot=True, square=True, cmap='mako')\n",
+ "# ax.set_title(label = 'МАТРИЦА КОРРЕЛЯЦИИ ПРИЗНАКОВ $\\phi_K$', fontdict={'fontsize': 15, 'fontweight': 'bold'})\n",
+ "\n",
+ "# plt.tight_layout()\n",
+ "# plt.show();"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " feature | \n",
+ " phik | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " per_square_meter_price | \n",
+ " 1.00 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " osm_crossing_points_in_0.01 | \n",
+ " 0.57 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " osm_city_nearest_name | \n",
+ " 0.57 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " osm_city_nearest_population | \n",
+ " 0.55 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " region | \n",
+ " 0.52 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " osm_amenity_points_in_0.01 | \n",
+ " 0.48 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " osm_catering_points_in_0.01 | \n",
+ " 0.48 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " osm_leisure_points_in_0.01 | \n",
+ " 0.47 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " osm_offices_points_in_0.01 | \n",
+ " 0.47 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " osm_shops_points_in_0.01 | \n",
+ " 0.47 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " osm_healthcare_points_in_0.01 | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " osm_transport_stop_points_in_0.01 | \n",
+ " 0.46 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " feature phik\n",
+ "0 per_square_meter_price 1.00\n",
+ "9 osm_crossing_points_in_0.01 0.57\n",
+ "6 osm_city_nearest_name 0.57\n",
+ "7 osm_city_nearest_population 0.55\n",
+ "26 region 0.52\n",
+ "2 osm_amenity_points_in_0.01 0.48\n",
+ "4 osm_catering_points_in_0.01 0.48\n",
+ "14 osm_leisure_points_in_0.01 0.47\n",
+ "15 osm_offices_points_in_0.01 0.47\n",
+ "16 osm_shops_points_in_0.01 0.47\n",
+ "11 osm_healthcare_points_in_0.01 0.46\n",
+ "21 osm_transport_stop_points_in_0.01 0.46"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Формируем список переменных с максимальным коэффициентом корреляции phik с целевой переменной\n",
+ "phik_max_corr = (\n",
+ " phik_overview['per_square_meter_price'].to_frame().reset_index()\n",
+ " .rename(columns={'per_square_meter_price':'phik', 'index':'feature'})\n",
+ " .sort_values(by='phik', ascending=False)\n",
+ " .query('phik > 0.4')\n",
+ " .round(2)\n",
+ " )\n",
+ "phik_max_corr\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Матрица корреляции phik показала максимальную взаимосвязь целевой переменной с переменными:\n",
+ "- количество пешеходных переходов в радиусе 1 км\n",
+ "- название ближайшего города\n",
+ "- население ближайшего города\n",
+ "- регион\n",
+    "- количество в радиусе 1 км точек кейтеринга, досуга, офисов, магазинов, медучреждений, остановок общественного транспорта и объектов, связанных с удобством\n",
+ "\n",
+ "Также обнаружены новые взаимосвязи переменных между собой, которые не были видны на матрице корреляции Пирсона:\n",
+ "- этаж и price_type\n",
+ "- название близлежащего города и население ближайшего города, расстояние до ближайшего метро, остановки общественного транспорта, регион\n",
+ "- регион и расстояние до ближайшего метро"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Modelling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add features\n",
+ "city_population = pd.read_csv('city_population.csv')\n",
+ "zarplaty = pd.read_excel('zarplaty.xlsx')\n",
+ "\n",
+ "def city_type(row):\n",
+ " if row >=1000000:\n",
+ " return \"1Million\"\n",
+ " elif (row<1000000)&(row >200000):\n",
+ " return \"Medium\"\n",
+ " elif (row <=200000):\n",
+ " return \"Small\"\n",
+ " \n",
+ "def floor_type(row):\n",
+ " if ('1' in str(row))&(row!=-1):\n",
+ " return 1\n",
+ " else:\n",
+ " return 0\n",
+ " \n",
+ "def add_features(df):\n",
+ " df['age'] = round(2021 - df['reform_mean_year_building_500'])\n",
+ " df.city = df.city.apply(lambda x: x.lower())\n",
+ " \n",
+ " city_population_clean = city_population.groupby('settlement').agg({'population':'sum'}).reset_index()\n",
+ " city_population_clean.columns = ['city', 'city_population']\n",
+ " city_population_clean['city_population']\n",
+ " city_population_clean.city = city_population_clean.city.apply(lambda x: x.lower())\n",
+ " df = df.merge(city_population_clean, on = 'city', how='left')\n",
+ " \n",
+ " for col in df.select_dtypes(include=np.number).columns:\n",
+ " df[col] = pd.to_numeric(df[col], downcast = 'unsigned')\n",
+ " \n",
+ " df['city_type'] = df['city_population'].apply(lambda x: city_type(x))\n",
+ " df.loc[df.city == 'москва', 'city_type'] = \"Capital\"\n",
+    "    df.loc[df.city == 'санкт-петербург', 'city_type'] = \"Capital\"\n",
+ " \n",
+ " df = df.merge(zarplaty, on = 'region', how='left')\n",
+ " df['zarplata'] = pd.to_numeric(df['zarplata'], downcast = 'unsigned')\n",
+ " df['floor_type'] = df['floor'].apply(lambda x: floor_type(x))\n",
+ " \n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data = add_features(train_data)\n",
+ "test_data = add_features(test_data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((279967, 82), (2974, 81))"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_data.shape, test_data.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " city | \n",
+ " floor | \n",
+ " id | \n",
+ " lat | \n",
+ " lng | \n",
+ " osm_amenity_points_in_0.001 | \n",
+ " osm_amenity_points_in_0.005 | \n",
+ " osm_amenity_points_in_0.0075 | \n",
+ " osm_amenity_points_in_0.01 | \n",
+ " osm_building_points_in_0.001 | \n",
+ " ... | \n",
+ " total_square | \n",
+ " street | \n",
+ " date | \n",
+ " realty_type | \n",
+ " price_type | \n",
+ " age | \n",
+ " city_population | \n",
+ " city_type | \n",
+ " zarplata | \n",
+ " floor_type | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " пермь | \n",
+ " 0 | \n",
+ " COL_0 | \n",
+ " 57.998207 | \n",
+ " 56.292797 | \n",
+ " 4 | \n",
+ " 19 | \n",
+ " 35 | \n",
+ " 52 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 32.0 | \n",
+ " S27289 | \n",
+ " 2020-01-05 | \n",
+ " 10 | \n",
+ " 0 | \n",
+ " 60.0 | \n",
+ " 1048011.0 | \n",
+ " 1Million | \n",
+ " 41958.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " шатура | \n",
+ " 0 | \n",
+ " COL_1 | \n",
+ " 55.574284 | \n",
+ " 39.543835 | \n",
+ " 3 | \n",
+ " 24 | \n",
+ " 37 | \n",
+ " 59 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 280.0 | \n",
+ " S17052 | \n",
+ " 2020-01-05 | \n",
+ " 10 | \n",
+ " 0 | \n",
+ " 64.0 | \n",
+ " 32885.0 | \n",
+ " Small | \n",
+ " 58066.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ярославль | \n",
+ " 0 | \n",
+ " COL_2 | \n",
+ " 57.619140 | \n",
+ " 39.850525 | \n",
+ " 1 | \n",
+ " 30 | \n",
+ " 67 | \n",
+ " 128 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 297.4 | \n",
+ " S16913 | \n",
+ " 2020-01-05 | \n",
+ " 110 | \n",
+ " 0 | \n",
+ " 48.0 | \n",
+ " 604128.0 | \n",
+ " Medium | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " новокузнецк | \n",
+ " 0 | \n",
+ " COL_3 | \n",
+ " 53.897083 | \n",
+ " 87.108604 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 190.0 | \n",
+ " S10148 | \n",
+ " 2020-01-05 | \n",
+ " 110 | \n",
+ " 0 | \n",
+ " 7.0 | \n",
+ " 551919.0 | \n",
+ " Medium | \n",
+ " 43429.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " москва | \n",
+ " 0 | \n",
+ " COL_4 | \n",
+ " 55.802590 | \n",
+ " 37.487110 | \n",
+ " 1 | \n",
+ " 23 | \n",
+ " 64 | \n",
+ " 153 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 60.2 | \n",
+ " S1338 | \n",
+ " 2020-01-05 | \n",
+ " 10 | \n",
+ " 0 | \n",
+ " 60.0 | \n",
+ " 12380691.0 | \n",
+ " Capital | \n",
+ " 100070.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 82 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " city floor id lat lng \\\n",
+ "0 пермь 0 COL_0 57.998207 56.292797 \n",
+ "1 шатура 0 COL_1 55.574284 39.543835 \n",
+ "2 ярославль 0 COL_2 57.619140 39.850525 \n",
+ "3 новокузнецк 0 COL_3 53.897083 87.108604 \n",
+ "4 москва 0 COL_4 55.802590 37.487110 \n",
+ "\n",
+ " osm_amenity_points_in_0.001 osm_amenity_points_in_0.005 \\\n",
+ "0 4 19 \n",
+ "1 3 24 \n",
+ "2 1 30 \n",
+ "3 0 0 \n",
+ "4 1 23 \n",
+ "\n",
+ " osm_amenity_points_in_0.0075 osm_amenity_points_in_0.01 \\\n",
+ "0 35 52 \n",
+ "1 37 59 \n",
+ "2 67 128 \n",
+ "3 5 21 \n",
+ "4 64 153 \n",
+ "\n",
+ " osm_building_points_in_0.001 ... total_square street date \\\n",
+ "0 0 ... 32.0 S27289 2020-01-05 \n",
+ "1 0 ... 280.0 S17052 2020-01-05 \n",
+ "2 0 ... 297.4 S16913 2020-01-05 \n",
+ "3 0 ... 190.0 S10148 2020-01-05 \n",
+ "4 0 ... 60.2 S1338 2020-01-05 \n",
+ "\n",
+ " realty_type price_type age city_population city_type zarplata \\\n",
+ "0 10 0 60.0 1048011.0 1Million 41958.0 \n",
+ "1 10 0 64.0 32885.0 Small 58066.0 \n",
+ "2 110 0 48.0 604128.0 Medium NaN \n",
+ "3 110 0 7.0 551919.0 Medium 43429.0 \n",
+ "4 10 0 60.0 12380691.0 Capital 100070.0 \n",
+ "\n",
+ " floor_type \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "\n",
+ "[5 rows x 82 columns]"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data = train_data.query('price_type == 1')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_data.replace('1', 1, inplace=True)\n",
+ "train_data.replace('1.0', 1, inplace=True)\n",
+ "test_data.replace('1', 1, inplace=True)\n",
+ "test_data.replace('1.0', 1, inplace=True)\n",
+ "\n",
+ "train_data['floor'] = train_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)\n",
+ "test_data['floor'] = test_data.apply(lambda row: 1 if row['floor'] == 1 else 0, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {
+ "id": "811f6b6d"
+ },
+ "outputs": [],
+ "source": [
+ "N_THREADS = 4 # threads cnt for lgbm and linear models\n",
+ "N_FOLDS = 5 # folds cnt for AutoML\n",
+ "RANDOM_STATE = 42 # fixed random state for various reasons\n",
+ "#TEST_SIZE = 0.1 # Test size for metric check\n",
+ "TIMEOUT = 100 # Time in seconds for automl run USE TIMEOUT = 1700 for perfect score\n",
+ "\n",
+ "np.random.seed(RANDOM_STATE)\n",
+ "torch.set_num_threads(N_THREADS)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {
+ "id": "e6d51e72"
+ },
+ "outputs": [],
+ "source": [
+ "# train_df, test_df = train_test_split(train_data, \n",
+ "# test_size=TEST_SIZE, \n",
+ "# random_state=RANDOM_STATE)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "THRESHOLD = 0.15\n",
+ "NEGATIVE_WEIGHT = 1.1\n",
+ "\n",
+ "def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:\n",
+ " \"\"\"\n",
+ " Реализация кастомной метрики для хакатона.\n",
+ "\n",
+ " :param y_true: float, реальная цена\n",
+ " :param y_pred: float, предсказанная цена\n",
+ " :return: float, значение метрики\n",
+ " \"\"\"\n",
+ " deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)\n",
+ " if np.abs(deviation) <= THRESHOLD:\n",
+ " return 0\n",
+ " elif deviation <= - 4 * THRESHOLD:\n",
+ " return 9 * NEGATIVE_WEIGHT\n",
+ " elif deviation < -THRESHOLD:\n",
+ " return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2\n",
+ " elif deviation < 4 * THRESHOLD:\n",
+ " return ((deviation / THRESHOLD) - 1) ** 2\n",
+ " else:\n",
+ " return 9\n",
+ "\n",
+ "\n",
+ "def deviation_metric(y_true: np.array, y_pred: np.array) -> float:\n",
+ " return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]).mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {
+ "id": "11901a60"
+ },
+ "outputs": [],
+ "source": [
+ "task = Task('reg', loss = 'rmsle', metric = deviation_metric)\n",
+ "\n",
+ "roles = {\n",
+ " 'target': 'per_square_meter_price',\n",
+ " 'drop': 'id'\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 800
+ },
+ "id": "c77216e8",
+ "outputId": "ae6a7952-b341-40ed-f000-00f38639be74"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}\n",
+ "Found reader_params in kwargs, need to combine\n",
+ "Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 42}\n",
+ "Start automl preset with listed constraints:\n",
+ "- time: 100.0 seconds\n",
+ "- cpus: 4 cores\n",
+ "- memory: 16 gb\n",
+ "\n",
+ "Train data shape: (4493, 82)\n",
+ "Feats was rejected during automatic roles guess: []\n",
+ "\n",
+ "\n",
+ "Layer 1 ...\n",
+ "Train process start. Time left 89.13144850730896 secs\n",
+ "Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...\n",
+ "\n",
+ "===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n",
+ "\n",
+ "Linear model: C = 1e-05 score = [-3.7152774]\n",
+ "Linear model: C = 5e-05 score = [-3.7152774]\n",
+ "Linear model: C = 0.0001 score = [-3.7152774]\n",
+ "\n",
+ "===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n",
+ "\n",
+ "Linear model: C = 1e-05 score = [-3.3165922]\n",
+ "Linear model: C = 5e-05 score = [-3.3165922]\n",
+ "Linear model: C = 0.0001 score = [-3.3165922]\n",
+ "\n",
+ "===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n",
+ "\n",
+ "Linear model: C = 1e-05 score = [-3.6910217]\n",
+ "Linear model: C = 5e-05 score = [-3.6910217]\n",
+ "Linear model: C = 0.0001 score = [-3.6910217]\n",
+ "\n",
+ "===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n",
+ "\n",
+ "Linear model: C = 1e-05 score = [-3.6205108]\n",
+ "Linear model: C = 5e-05 score = [-3.6205108]\n",
+ "Linear model: C = 0.0001 score = [-3.6205108]\n",
+ "\n",
+ "===== Start working with fold 4 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====\n",
+ "\n",
+ "Linear model: C = 1e-05 score = [-3.2158222]\n",
+ "Linear model: C = 5e-05 score = [-3.2158222]\n",
+ "Linear model: C = 0.0001 score = [-3.2158222]\n",
+ "Lvl_0_Pipe_0_Mod_0_LinearL2 fitting and predicting completed\n",
+ "Time left 86.83387517929077\n",
+ "Start fitting Lvl_0_Pipe_1_Mod_0_LightGBM ...\n",
+ "\n",
+ "===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_LightGBM =====\n",
+ "\n",
+ "Training until validation scores don't improve for 200 rounds\n",
+ "[100]\tvalid's l2: 0.175464\tvalid's Opt metric: 2.38008\n",
+ "[200]\tvalid's l2: 0.137539\tvalid's Opt metric: 2.00419\n",
+ "[300]\tvalid's l2: 0.126562\tvalid's Opt metric: 1.85852\n",
+ "[400]\tvalid's l2: 0.122176\tvalid's Opt metric: 1.77832\n",
+ "[500]\tvalid's l2: 0.120404\tvalid's Opt metric: 1.74085\n",
+ "[600]\tvalid's l2: 0.119403\tvalid's Opt metric: 1.71643\n",
+ "[700]\tvalid's l2: 0.118939\tvalid's Opt metric: 1.7027\n",
+ "[800]\tvalid's l2: 0.1189\tvalid's Opt metric: 1.69712\n",
+ "[900]\tvalid's l2: 0.118543\tvalid's Opt metric: 1.68706\n",
+ "[1000]\tvalid's l2: 0.11832\tvalid's Opt metric: 1.68094\n",
+ "[1100]\tvalid's l2: 0.118228\tvalid's Opt metric: 1.67885\n",
+ "[1200]\tvalid's l2: 0.118142\tvalid's Opt metric: 1.67488\n",
+ "[1300]\tvalid's l2: 0.117963\tvalid's Opt metric: 1.66822\n",
+ "[1400]\tvalid's l2: 0.117946\tvalid's Opt metric: 1.66472\n",
+ "[1500]\tvalid's l2: 0.118028\tvalid's Opt metric: 1.66498\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Time limit exceeded after calculating fold 0\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Early stopping, best iteration is:\n",
+ "[1336]\tvalid's l2: 0.117866\tvalid's Opt metric: 1.6654\n",
+ "Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed\n",
+ "Start fitting Lvl_0_Pipe_1_Mod_2_CatBoost ...\n",
+ "\n",
+ "===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5690497\ttest: 0.5902307\tbest: 0.5902307 (0)\ttotal: 9.89ms\tremaining: 19.8s\n",
+ "100:\tlearn: 0.3220134\ttest: 0.3603304\tbest: 0.3603304 (100)\ttotal: 241ms\tremaining: 4.53s\n",
+ "200:\tlearn: 0.2853510\ttest: 0.3517094\tbest: 0.3517094 (200)\ttotal: 498ms\tremaining: 4.46s\n",
+ "300:\tlearn: 0.2576051\ttest: 0.3490464\tbest: 0.3490393 (299)\ttotal: 737ms\tremaining: 4.16s\n",
+ "400:\tlearn: 0.2379688\ttest: 0.3474727\tbest: 0.3474727 (400)\ttotal: 1s\tremaining: 4s\n",
+ "500:\tlearn: 0.2222526\ttest: 0.3449390\tbest: 0.3449390 (500)\ttotal: 1.23s\tremaining: 3.67s\n",
+ "600:\tlearn: 0.2078858\ttest: 0.3447732\tbest: 0.3443742 (549)\ttotal: 1.49s\tremaining: 3.46s\n",
+ "700:\tlearn: 0.1960459\ttest: 0.3442549\tbest: 0.3441099 (652)\ttotal: 1.75s\tremaining: 3.24s\n",
+ "800:\tlearn: 0.1857116\ttest: 0.3438032\tbest: 0.3436641 (740)\ttotal: 2s\tremaining: 2.99s\n",
+ "900:\tlearn: 0.1759209\ttest: 0.3433300\tbest: 0.3432126 (844)\ttotal: 2.22s\tremaining: 2.71s\n",
+ "1000:\tlearn: 0.1668783\ttest: 0.3428658\tbest: 0.3428565 (990)\ttotal: 2.47s\tremaining: 2.46s\n",
+ "1100:\tlearn: 0.1586743\ttest: 0.3419130\tbest: 0.3418887 (1090)\ttotal: 2.69s\tremaining: 2.2s\n",
+ "1200:\tlearn: 0.1510258\ttest: 0.3414564\tbest: 0.3414564 (1200)\ttotal: 2.94s\tremaining: 1.96s\n",
+ "1300:\tlearn: 0.1441223\ttest: 0.3414049\tbest: 0.3409616 (1261)\ttotal: 3.18s\tremaining: 1.71s\n",
+ "1400:\tlearn: 0.1377195\ttest: 0.3414637\tbest: 0.3409616 (1261)\ttotal: 3.44s\tremaining: 1.47s\n",
+ "1500:\tlearn: 0.1316497\ttest: 0.3414028\tbest: 0.3409616 (1261)\ttotal: 3.67s\tremaining: 1.22s\n",
+ "Stopped by overfitting detector (300 iterations wait)\n",
+ "\n",
+ "bestTest = 0.3409616327\n",
+ "bestIteration = 1261\n",
+ "\n",
+ "Shrink model to first 1262 iterations.\n",
+ "\n",
+ "===== Start working with fold 1 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5768234\ttest: 0.5574083\tbest: 0.5574083 (0)\ttotal: 3.21ms\tremaining: 6.42s\n",
+ "100:\tlearn: 0.3247511\ttest: 0.3336189\tbest: 0.3336189 (100)\ttotal: 246ms\tremaining: 4.62s\n",
+ "200:\tlearn: 0.2892813\ttest: 0.3252905\tbest: 0.3252353 (193)\ttotal: 536ms\tremaining: 4.79s\n",
+ "300:\tlearn: 0.2614113\ttest: 0.3229165\tbest: 0.3227958 (299)\ttotal: 767ms\tremaining: 4.33s\n",
+ "400:\tlearn: 0.2413630\ttest: 0.3221904\tbest: 0.3219651 (324)\ttotal: 1.02s\tremaining: 4.08s\n",
+ "500:\tlearn: 0.2248640\ttest: 0.3214037\tbest: 0.3211377 (457)\ttotal: 1.27s\tremaining: 3.81s\n",
+ "600:\tlearn: 0.2108310\ttest: 0.3208134\tbest: 0.3207517 (597)\ttotal: 1.56s\tremaining: 3.62s\n",
+ "700:\tlearn: 0.1988404\ttest: 0.3209918\tbest: 0.3207120 (654)\ttotal: 1.8s\tremaining: 3.33s\n",
+ "800:\tlearn: 0.1880008\ttest: 0.3214166\tbest: 0.3207120 (654)\ttotal: 2.05s\tremaining: 3.06s\n",
+ "900:\tlearn: 0.1778877\ttest: 0.3214287\tbest: 0.3207120 (654)\ttotal: 2.35s\tremaining: 2.86s\n",
+ "Stopped by overfitting detector (300 iterations wait)\n",
+ "\n",
+ "bestTest = 0.320711959\n",
+ "bestIteration = 654\n",
+ "\n",
+ "Shrink model to first 655 iterations.\n",
+ "\n",
+ "===== Start working with fold 2 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5724626\ttest: 0.5753481\tbest: 0.5753481 (0)\ttotal: 3.17ms\tremaining: 6.33s\n",
+ "100:\tlearn: 0.3239383\ttest: 0.3399819\tbest: 0.3399819 (100)\ttotal: 266ms\tremaining: 5s\n",
+ "200:\tlearn: 0.2867294\ttest: 0.3285307\tbest: 0.3285307 (200)\ttotal: 512ms\tremaining: 4.58s\n",
+ "300:\tlearn: 0.2589989\ttest: 0.3255054\tbest: 0.3254546 (298)\ttotal: 750ms\tremaining: 4.23s\n",
+ "400:\tlearn: 0.2388187\ttest: 0.3247287\tbest: 0.3245618 (398)\ttotal: 976ms\tremaining: 3.89s\n",
+ "500:\tlearn: 0.2222920\ttest: 0.3240440\tbest: 0.3240057 (499)\ttotal: 1.24s\tremaining: 3.7s\n",
+ "600:\tlearn: 0.2086809\ttest: 0.3226891\tbest: 0.3226891 (600)\ttotal: 1.49s\tremaining: 3.46s\n",
+ "700:\tlearn: 0.1960510\ttest: 0.3210332\tbest: 0.3210024 (698)\ttotal: 1.74s\tremaining: 3.22s\n",
+ "800:\tlearn: 0.1854055\ttest: 0.3204972\tbest: 0.3204972 (800)\ttotal: 2s\tremaining: 2.99s\n",
+ "900:\tlearn: 0.1764789\ttest: 0.3205735\tbest: 0.3200469 (851)\ttotal: 2.28s\tremaining: 2.78s\n",
+ "1000:\tlearn: 0.1678033\ttest: 0.3209024\tbest: 0.3200469 (851)\ttotal: 2.52s\tremaining: 2.52s\n",
+ "1100:\tlearn: 0.1599763\ttest: 0.3211878\tbest: 0.3200469 (851)\ttotal: 2.79s\tremaining: 2.28s\n",
+ "Stopped by overfitting detector (300 iterations wait)\n",
+ "\n",
+ "bestTest = 0.3200468666\n",
+ "bestIteration = 851\n",
+ "\n",
+ "Shrink model to first 852 iterations.\n",
+ "\n",
+ "===== Start working with fold 3 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5652387\ttest: 0.6041895\tbest: 0.6041895 (0)\ttotal: 3.64ms\tremaining: 7.28s\n",
+ "100:\tlearn: 0.3086002\ttest: 0.3976876\tbest: 0.3976876 (100)\ttotal: 264ms\tremaining: 4.97s\n",
+ "200:\tlearn: 0.2756635\ttest: 0.3880626\tbest: 0.3880626 (200)\ttotal: 501ms\tremaining: 4.49s\n",
+ "300:\tlearn: 0.2511624\ttest: 0.3852988\tbest: 0.3849651 (283)\ttotal: 754ms\tremaining: 4.25s\n",
+ "400:\tlearn: 0.2321052\ttest: 0.3859294\tbest: 0.3849651 (283)\ttotal: 997ms\tremaining: 3.97s\n",
+ "500:\tlearn: 0.2175329\ttest: 0.3865094\tbest: 0.3849651 (283)\ttotal: 1.26s\tremaining: 3.77s\n",
+ "Stopped by overfitting detector (300 iterations wait)\n",
+ "\n",
+ "bestTest = 0.3849650533\n",
+ "bestIteration = 283\n",
+ "\n",
+ "Shrink model to first 284 iterations.\n",
+ "\n",
+ "===== Start working with fold 4 for Lvl_0_Pipe_1_Mod_2_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5819025\ttest: 0.5416817\tbest: 0.5416817 (0)\ttotal: 16.8ms\tremaining: 33.5s\n",
+ "100:\tlearn: 0.3282525\ttest: 0.3302178\tbest: 0.3302178 (100)\ttotal: 292ms\tremaining: 5.5s\n",
+ "200:\tlearn: 0.2935099\ttest: 0.3155985\tbest: 0.3154939 (198)\ttotal: 559ms\tremaining: 5s\n",
+ "300:\tlearn: 0.2640830\ttest: 0.3098426\tbest: 0.3098426 (300)\ttotal: 817ms\tremaining: 4.61s\n",
+ "400:\tlearn: 0.2425758\ttest: 0.3053681\tbest: 0.3053681 (400)\ttotal: 1.08s\tremaining: 4.31s\n",
+ "500:\tlearn: 0.2263103\ttest: 0.3027193\tbest: 0.3024671 (488)\ttotal: 1.35s\tremaining: 4.05s\n",
+ "600:\tlearn: 0.2119579\ttest: 0.3012696\tbest: 0.3011144 (582)\ttotal: 1.62s\tremaining: 3.78s\n",
+ "700:\tlearn: 0.1987671\ttest: 0.2997913\tbest: 0.2997913 (700)\ttotal: 1.89s\tremaining: 3.5s\n",
+ "800:\tlearn: 0.1877085\ttest: 0.2993222\tbest: 0.2991978 (796)\ttotal: 2.15s\tremaining: 3.21s\n",
+ "900:\tlearn: 0.1779556\ttest: 0.2988217\tbest: 0.2988217 (900)\ttotal: 2.41s\tremaining: 2.94s\n",
+ "1000:\tlearn: 0.1691039\ttest: 0.2991610\tbest: 0.2988179 (904)\ttotal: 2.67s\tremaining: 2.66s\n",
+ "1100:\tlearn: 0.1609237\ttest: 0.2994798\tbest: 0.2988179 (904)\ttotal: 2.92s\tremaining: 2.38s\n",
+ "1200:\tlearn: 0.1537691\ttest: 0.2996605\tbest: 0.2988179 (904)\ttotal: 3.17s\tremaining: 2.11s\n",
+ "Stopped by overfitting detector (300 iterations wait)\n",
+ "\n",
+ "bestTest = 0.2988178835\n",
+ "bestIteration = 904\n",
+ "\n",
+ "Shrink model to first 905 iterations.\n",
+ "Lvl_0_Pipe_1_Mod_2_CatBoost fitting and predicting completed\n",
+ "Optuna may run 1 secs\n",
+ "Start fitting Lvl_0_Pipe_1_Mod_3_CatBoost ...\n",
+ "\n",
+ "===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5696811\ttest: 0.5907190\tbest: 0.5907190 (0)\ttotal: 4.77ms\tremaining: 9.53s\n",
+ "100:\tlearn: 0.3328412\ttest: 0.3613108\tbest: 0.3613108 (100)\ttotal: 196ms\tremaining: 3.68s\n",
+ "200:\tlearn: 0.3027316\ttest: 0.3519878\tbest: 0.3519878 (200)\ttotal: 368ms\tremaining: 3.29s\n",
+ "300:\tlearn: 0.2794207\ttest: 0.3487270\tbest: 0.3487270 (300)\ttotal: 582ms\tremaining: 3.29s\n",
+ "400:\tlearn: 0.2634165\ttest: 0.3479933\tbest: 0.3479933 (400)\ttotal: 775ms\tremaining: 3.09s\n",
+ "500:\tlearn: 0.2504892\ttest: 0.3473465\tbest: 0.3470908 (452)\ttotal: 961ms\tremaining: 2.88s\n",
+ "600:\tlearn: 0.2399381\ttest: 0.3461447\tbest: 0.3458834 (593)\ttotal: 1.17s\tremaining: 2.72s\n",
+ "700:\tlearn: 0.2293243\ttest: 0.3446773\tbest: 0.3445460 (694)\ttotal: 1.35s\tremaining: 2.5s\n",
+ "800:\tlearn: 0.2201892\ttest: 0.3445020\tbest: 0.3441724 (728)\ttotal: 1.56s\tremaining: 2.34s\n",
+ "900:\tlearn: 0.2122752\ttest: 0.3439144\tbest: 0.3438057 (883)\ttotal: 1.76s\tremaining: 2.15s\n",
+ "1000:\tlearn: 0.2050941\ttest: 0.3442635\tbest: 0.3438057 (883)\ttotal: 1.95s\tremaining: 1.95s\n",
+ "1100:\tlearn: 0.1984862\ttest: 0.3437866\tbest: 0.3437749 (1051)\ttotal: 2.14s\tremaining: 1.75s\n",
+ "1200:\tlearn: 0.1924283\ttest: 0.3433972\tbest: 0.3433483 (1198)\ttotal: 2.36s\tremaining: 1.57s\n",
+ "1300:\tlearn: 0.1863886\ttest: 0.3431625\tbest: 0.3430420 (1288)\ttotal: 2.56s\tremaining: 1.38s\n",
+ "1400:\tlearn: 0.1807727\ttest: 0.3427477\tbest: 0.3427039 (1394)\ttotal: 2.74s\tremaining: 1.17s\n",
+ "1500:\tlearn: 0.1756772\ttest: 0.3423999\tbest: 0.3423999 (1500)\ttotal: 2.95s\tremaining: 979ms\n",
+ "1600:\tlearn: 0.1709954\ttest: 0.3426386\tbest: 0.3421255 (1541)\ttotal: 3.12s\tremaining: 779ms\n",
+ "1700:\tlearn: 0.1662324\ttest: 0.3423893\tbest: 0.3421255 (1541)\ttotal: 3.35s\tremaining: 589ms\n",
+ "1800:\tlearn: 0.1620149\ttest: 0.3419531\tbest: 0.3419456 (1796)\ttotal: 3.52s\tremaining: 390ms\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1900:\tlearn: 0.1579787\ttest: 0.3420298\tbest: 0.3418931 (1815)\ttotal: 3.71s\tremaining: 193ms\n",
+ "1999:\tlearn: 0.1538067\ttest: 0.3423697\tbest: 0.3418931 (1815)\ttotal: 3.89s\tremaining: 0us\n",
+ "\n",
+ "bestTest = 0.3418931473\n",
+ "bestIteration = 1815\n",
+ "\n",
+ "Shrink model to first 1816 iterations.\n",
+ "Lvl_0_Pipe_1_Mod_3_CatBoost fitting and predicting completed\n",
+ "Start fitting Lvl_0_Pipe_1_Mod_3_CatBoost ...\n",
+ "\n",
+ "===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5749238\ttest: 0.5962756\tbest: 0.5962756 (0)\ttotal: 3.23ms\tremaining: 9.69s\n",
+ "100:\tlearn: 0.3538624\ttest: 0.3750769\tbest: 0.3750769 (100)\ttotal: 205ms\tremaining: 5.9s\n",
+ "200:\tlearn: 0.3247688\ttest: 0.3579003\tbest: 0.3579003 (200)\ttotal: 397ms\tremaining: 5.53s\n",
+ "300:\tlearn: 0.3067646\ttest: 0.3529264\tbest: 0.3529264 (300)\ttotal: 596ms\tremaining: 5.34s\n",
+ "400:\tlearn: 0.2906869\ttest: 0.3494385\tbest: 0.3494385 (400)\ttotal: 773ms\tremaining: 5.01s\n",
+ "500:\tlearn: 0.2787513\ttest: 0.3480935\tbest: 0.3480935 (500)\ttotal: 940ms\tremaining: 4.69s\n",
+ "600:\tlearn: 0.2682686\ttest: 0.3470561\tbest: 0.3470561 (600)\ttotal: 1.14s\tremaining: 4.56s\n",
+ "700:\tlearn: 0.2590796\ttest: 0.3463205\tbest: 0.3463073 (694)\ttotal: 1.33s\tremaining: 4.36s\n",
+ "800:\tlearn: 0.2514009\ttest: 0.3451208\tbest: 0.3451208 (800)\ttotal: 1.51s\tremaining: 4.16s\n",
+ "900:\tlearn: 0.2444905\ttest: 0.3440930\tbest: 0.3440841 (895)\ttotal: 1.71s\tremaining: 3.98s\n",
+ "1000:\tlearn: 0.2378982\ttest: 0.3435150\tbest: 0.3434602 (992)\ttotal: 1.9s\tremaining: 3.79s\n",
+ "1100:\tlearn: 0.2322087\ttest: 0.3431667\tbest: 0.3431153 (1075)\ttotal: 2.1s\tremaining: 3.63s\n",
+ "1200:\tlearn: 0.2265210\ttest: 0.3425909\tbest: 0.3425467 (1188)\ttotal: 2.29s\tremaining: 3.42s\n",
+ "1300:\tlearn: 0.2213036\ttest: 0.3420838\tbest: 0.3420157 (1288)\ttotal: 2.46s\tremaining: 3.21s\n",
+ "1400:\tlearn: 0.2163428\ttest: 0.3418436\tbest: 0.3417805 (1394)\ttotal: 2.67s\tremaining: 3.05s\n",
+ "1500:\tlearn: 0.2115906\ttest: 0.3414778\tbest: 0.3414778 (1500)\ttotal: 2.88s\tremaining: 2.87s\n",
+ "1600:\tlearn: 0.2070074\ttest: 0.3410625\tbest: 0.3410625 (1600)\ttotal: 3.09s\tremaining: 2.7s\n",
+ "1700:\tlearn: 0.2026507\ttest: 0.3405289\tbest: 0.3405001 (1679)\ttotal: 3.27s\tremaining: 2.5s\n",
+ "1800:\tlearn: 0.1988309\ttest: 0.3401400\tbest: 0.3401251 (1798)\ttotal: 3.48s\tremaining: 2.31s\n",
+ "1900:\tlearn: 0.1952427\ttest: 0.3398704\tbest: 0.3396758 (1880)\ttotal: 3.68s\tremaining: 2.13s\n",
+ "2000:\tlearn: 0.1915851\ttest: 0.3396242\tbest: 0.3396191 (1999)\ttotal: 3.88s\tremaining: 1.94s\n",
+ "2100:\tlearn: 0.1882232\ttest: 0.3395831\tbest: 0.3395527 (2074)\ttotal: 4.07s\tremaining: 1.74s\n",
+ "2200:\tlearn: 0.1848821\ttest: 0.3393809\tbest: 0.3393421 (2165)\ttotal: 4.31s\tremaining: 1.56s\n",
+ "2300:\tlearn: 0.1817400\ttest: 0.3391877\tbest: 0.3391877 (2300)\ttotal: 4.5s\tremaining: 1.37s\n",
+ "2400:\tlearn: 0.1787974\ttest: 0.3391625\tbest: 0.3391625 (2400)\ttotal: 4.69s\tremaining: 1.17s\n",
+ "2500:\tlearn: 0.1756702\ttest: 0.3388642\tbest: 0.3388608 (2499)\ttotal: 4.88s\tremaining: 973ms\n",
+ "2600:\tlearn: 0.1728922\ttest: 0.3389253\tbest: 0.3388231 (2508)\ttotal: 5.09s\tremaining: 781ms\n",
+ "Stopped by overfitting detector (100 iterations wait)\n",
+ "\n",
+ "bestTest = 0.3388231235\n",
+ "bestIteration = 2508\n",
+ "\n",
+ "Shrink model to first 2509 iterations.\n",
+ "\n",
+ "===== Start working with fold 1 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5830078\ttest: 0.5633221\tbest: 0.5633221 (0)\ttotal: 8.6ms\tremaining: 25.8s\n",
+ "100:\tlearn: 0.3589969\ttest: 0.3533597\tbest: 0.3533597 (100)\ttotal: 196ms\tremaining: 5.61s\n",
+ "200:\tlearn: 0.3307810\ttest: 0.3362463\tbest: 0.3362463 (200)\ttotal: 377ms\tremaining: 5.25s\n",
+ "300:\tlearn: 0.3120042\ttest: 0.3301034\tbest: 0.3301034 (300)\ttotal: 551ms\tremaining: 4.94s\n",
+ "400:\tlearn: 0.2950774\ttest: 0.3273159\tbest: 0.3273074 (398)\ttotal: 732ms\tremaining: 4.74s\n",
+ "500:\tlearn: 0.2822802\ttest: 0.3260960\tbest: 0.3260960 (500)\ttotal: 908ms\tremaining: 4.53s\n",
+ "600:\tlearn: 0.2718319\ttest: 0.3244773\tbest: 0.3244541 (597)\ttotal: 1.11s\tremaining: 4.42s\n",
+ "700:\tlearn: 0.2631367\ttest: 0.3236967\tbest: 0.3236967 (700)\ttotal: 1.29s\tremaining: 4.23s\n",
+ "800:\tlearn: 0.2555498\ttest: 0.3241221\tbest: 0.3236444 (718)\ttotal: 1.48s\tremaining: 4.05s\n",
+ "Stopped by overfitting detector (100 iterations wait)\n",
+ "\n",
+ "bestTest = 0.3236443836\n",
+ "bestIteration = 718\n",
+ "\n",
+ "Shrink model to first 719 iterations.\n",
+ "\n",
+ "===== Start working with fold 2 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5784948\ttest: 0.5817224\tbest: 0.5817224 (0)\ttotal: 15.4ms\tremaining: 46.3s\n",
+ "100:\tlearn: 0.3581544\ttest: 0.3571058\tbest: 0.3571058 (100)\ttotal: 211ms\tremaining: 6.05s\n",
+ "200:\tlearn: 0.3282811\ttest: 0.3383338\tbest: 0.3383338 (200)\ttotal: 395ms\tremaining: 5.5s\n",
+ "300:\tlearn: 0.3097713\ttest: 0.3322135\tbest: 0.3320776 (297)\ttotal: 573ms\tremaining: 5.14s\n",
+ "400:\tlearn: 0.2938576\ttest: 0.3299511\tbest: 0.3299511 (400)\ttotal: 769ms\tremaining: 4.98s\n",
+ "500:\tlearn: 0.2805774\ttest: 0.3287883\tbest: 0.3287883 (500)\ttotal: 946ms\tremaining: 4.72s\n",
+ "600:\tlearn: 0.2703739\ttest: 0.3269768\tbest: 0.3269716 (599)\ttotal: 1.13s\tremaining: 4.5s\n",
+ "700:\tlearn: 0.2616548\ttest: 0.3258652\tbest: 0.3257247 (696)\ttotal: 1.31s\tremaining: 4.29s\n",
+ "800:\tlearn: 0.2540794\ttest: 0.3248115\tbest: 0.3246713 (780)\ttotal: 1.49s\tremaining: 4.09s\n",
+ "900:\tlearn: 0.2470055\ttest: 0.3240086\tbest: 0.3239652 (896)\ttotal: 1.68s\tremaining: 3.9s\n",
+ "1000:\tlearn: 0.2407159\ttest: 0.3232904\tbest: 0.3232768 (993)\ttotal: 1.87s\tremaining: 3.74s\n",
+ "1100:\tlearn: 0.2349685\ttest: 0.3226389\tbest: 0.3224739 (1072)\ttotal: 2.08s\tremaining: 3.58s\n",
+ "1200:\tlearn: 0.2291905\ttest: 0.3219706\tbest: 0.3219349 (1195)\ttotal: 2.28s\tremaining: 3.42s\n",
+ "1300:\tlearn: 0.2242843\ttest: 0.3216330\tbest: 0.3216282 (1299)\ttotal: 2.48s\tremaining: 3.24s\n",
+ "1400:\tlearn: 0.2193816\ttest: 0.3213742\tbest: 0.3211652 (1345)\ttotal: 2.71s\tremaining: 3.09s\n",
+ "1500:\tlearn: 0.2150231\ttest: 0.3210364\tbest: 0.3209320 (1480)\ttotal: 2.9s\tremaining: 2.9s\n",
+ "1600:\tlearn: 0.2108812\ttest: 0.3206202\tbest: 0.3206202 (1600)\ttotal: 3.08s\tremaining: 2.69s\n",
+ "1700:\tlearn: 0.2066767\ttest: 0.3199278\tbest: 0.3199118 (1699)\ttotal: 3.27s\tremaining: 2.5s\n",
+ "1800:\tlearn: 0.2027083\ttest: 0.3198993\tbest: 0.3198815 (1726)\ttotal: 3.48s\tremaining: 2.31s\n",
+ "Stopped by overfitting detector (100 iterations wait)\n",
+ "\n",
+ "bestTest = 0.3198814937\n",
+ "bestIteration = 1726\n",
+ "\n",
+ "Shrink model to first 1727 iterations.\n",
+ "\n",
+ "===== Start working with fold 3 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5710290\ttest: 0.6096694\tbest: 0.6096694 (0)\ttotal: 2.86ms\tremaining: 8.58s\n",
+ "100:\tlearn: 0.3444958\ttest: 0.4152803\tbest: 0.4152803 (100)\ttotal: 224ms\tremaining: 6.42s\n",
+ "200:\tlearn: 0.3159040\ttest: 0.3996337\tbest: 0.3996337 (200)\ttotal: 420ms\tremaining: 5.85s\n",
+ "300:\tlearn: 0.2999551\ttest: 0.3924528\tbest: 0.3924528 (300)\ttotal: 601ms\tremaining: 5.38s\n",
+ "400:\tlearn: 0.2851919\ttest: 0.3881614\tbest: 0.3881614 (400)\ttotal: 768ms\tremaining: 4.98s\n",
+ "500:\tlearn: 0.2728422\ttest: 0.3868374\tbest: 0.3867807 (490)\ttotal: 936ms\tremaining: 4.67s\n",
+ "600:\tlearn: 0.2631856\ttest: 0.3861455\tbest: 0.3861455 (600)\ttotal: 1.1s\tremaining: 4.4s\n",
+ "700:\tlearn: 0.2550000\ttest: 0.3857666\tbest: 0.3856754 (668)\ttotal: 1.29s\tremaining: 4.25s\n",
+ "800:\tlearn: 0.2474017\ttest: 0.3856947\tbest: 0.3856013 (792)\ttotal: 1.49s\tremaining: 4.09s\n",
+ "900:\tlearn: 0.2405549\ttest: 0.3855558\tbest: 0.3855096 (888)\ttotal: 1.68s\tremaining: 3.9s\n",
+ "Stopped by overfitting detector (100 iterations wait)\n",
+ "\n",
+ "bestTest = 0.3855095628\n",
+ "bestIteration = 888\n",
+ "\n",
+ "Shrink model to first 889 iterations.\n",
+ "\n",
+ "===== Start working with fold 4 for Lvl_0_Pipe_1_Mod_3_CatBoost =====\n",
+ "\n",
+ "0:\tlearn: 0.5877937\ttest: 0.5433457\tbest: 0.5433457 (0)\ttotal: 2.46ms\tremaining: 7.39s\n",
+ "100:\tlearn: 0.3635841\ttest: 0.3502432\tbest: 0.3502432 (100)\ttotal: 205ms\tremaining: 5.9s\n",
+ "200:\tlearn: 0.3350821\ttest: 0.3277620\tbest: 0.3277620 (200)\ttotal: 394ms\tremaining: 5.48s\n",
+ "300:\tlearn: 0.3163325\ttest: 0.3186592\tbest: 0.3186268 (298)\ttotal: 581ms\tremaining: 5.21s\n",
+ "400:\tlearn: 0.2991828\ttest: 0.3114547\tbest: 0.3114547 (400)\ttotal: 765ms\tremaining: 4.96s\n",
+ "500:\tlearn: 0.2853312\ttest: 0.3067744\tbest: 0.3067744 (500)\ttotal: 961ms\tremaining: 4.79s\n",
+ "600:\tlearn: 0.2746053\ttest: 0.3043128\tbest: 0.3043128 (600)\ttotal: 1.19s\tremaining: 4.75s\n",
+ "700:\tlearn: 0.2651369\ttest: 0.3024191\tbest: 0.3024121 (699)\ttotal: 1.4s\tremaining: 4.6s\n",
+ "800:\tlearn: 0.2573767\ttest: 0.3007672\tbest: 0.3007672 (800)\ttotal: 1.61s\tremaining: 4.43s\n",
+ "900:\tlearn: 0.2503800\ttest: 0.2996296\tbest: 0.2996044 (890)\ttotal: 1.85s\tremaining: 4.3s\n",
+ "1000:\tlearn: 0.2433897\ttest: 0.2990408\tbest: 0.2990408 (1000)\ttotal: 2.05s\tremaining: 4.1s\n",
+ "1100:\tlearn: 0.2373764\ttest: 0.2977549\tbest: 0.2977112 (1099)\ttotal: 2.24s\tremaining: 3.87s\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1200:\tlearn: 0.2311849\ttest: 0.2972003\tbest: 0.2971525 (1196)\ttotal: 2.42s\tremaining: 3.63s\n",
+ "1300:\tlearn: 0.2256356\ttest: 0.2963918\tbest: 0.2963918 (1300)\ttotal: 2.62s\tremaining: 3.42s\n",
+ "1400:\tlearn: 0.2204745\ttest: 0.2958023\tbest: 0.2958023 (1400)\ttotal: 2.81s\tremaining: 3.2s\n",
+ "1500:\tlearn: 0.2158370\ttest: 0.2953656\tbest: 0.2953011 (1496)\ttotal: 2.98s\tremaining: 2.97s\n",
+ "1600:\tlearn: 0.2115284\ttest: 0.2947484\tbest: 0.2947203 (1567)\ttotal: 3.14s\tremaining: 2.75s\n",
+ "1700:\tlearn: 0.2073122\ttest: 0.2944945\tbest: 0.2944290 (1697)\ttotal: 3.37s\tremaining: 2.57s\n",
+ "Stopped by overfitting detector (100 iterations wait)\n",
+ "\n",
+ "bestTest = 0.2944289789\n",
+ "bestIteration = 1697\n",
+ "\n",
+ "Shrink model to first 1698 iterations.\n",
+ "Lvl_0_Pipe_1_Mod_3_CatBoost fitting and predicting completed\n",
+ "Time left 37.47469425201416\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Time limit exceeded in one of the tasks. AutoML will blend level 1 models.\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Blending: Optimization starts with equal weights and score -1.7954214419609544\n",
+ "Blending, iter 0: score = -1.4724501895733335, weights = [0. 0.86662203 0.05572809 0.07764989]\n",
+ "Blending, iter 1: score = -1.4661120519816215, weights = [0. 0.7553367 0.10243508 0.14222825]\n",
+ "Blending, iter 2: score = -1.4661120519816215, weights = [0. 0.7553367 0.10243508 0.14222825]\n",
+ "No score update. Terminated\n",
+ "\n",
+ "Automl preset training completed in 66.05 seconds.\n"
+ ]
+ }
+ ],
+ "source": [
+ "automl = TabularUtilizedAutoML(task = task,\n",
+ " timeout = TIMEOUT,\n",
+ " cpu_limit = N_THREADS,\n",
+ " reader_params = {'n_jobs': N_THREADS,\n",
+ " 'cv': N_FOLDS,\n",
+ " 'random_state': RANDOM_STATE})\n",
+ "\n",
+ "oof_pred = automl.fit_predict(train_data, roles = roles)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#deviation_metric(np.array(train_df['per_square_meter_price']), oof_pred.data[:, 0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output['per_square_meter_price'] = output['per_square_meter_price'] * 0.9\n",
+ "\n",
+ "output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \\\n",
+ " = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {
+ "id": "e0d6dfe7"
+ },
+ "outputs": [],
+ "source": [
+ "output = pd.DataFrame({'id': test_data['id'],\n",
+ " 'per_square_meter_price': automl.predict(test_data).data[:, 0]})\n",
+ "output['per_square_meter_price'] = output['per_square_meter_price'] * 0.9\n",
+ "\n",
+ "output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \\\n",
+ " = output.loc[output['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9\n",
+ "\n",
+ "output.to_csv('raifHack_ki7.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2974, 2)"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "output.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "name": "housePrice_AutoML.ipynb",
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Yapiki_publ/Read me.txt b/Yapiki_publ/Read me.txt
new file mode 100644
index 0000000..4f3107e
--- /dev/null
+++ b/Yapiki_publ/Read me.txt
@@ -0,0 +1,39 @@
+Решение основано на алгоритме LightAutoML с дополнительным обогащением данных из открытых источников (Росстат).
+
+
+- Проведен EDA с помощью pandas_profiling и сравнения корреляций, рассчитанных разными методами, в зависимости от признаков.
+  Отчет pandas_profiling (файл большого размера): https://drive.google.com/file/d/1xQl3LvpX9J0G6gJoaBjzRcBFKZi6QZXz/view?usp=sharing
+- Введены дополнительные признаки: премиальность этажа и тип города
+- Использованы дополнительные статистические данные (среднемесячная заработная плата и численность населения по регионам; источник: Росстат)
+- Учтен дисконт, закладываемый при ручном расчете (использовали вариант из публичного решения, ссылка на github: https://github.com/BatyaZhizni/Raifhack-DS)
+
+
+Для обогащения данных использованы дополнительные датасеты:
+
+
+1. zarplaty.xlsx - Среднемесячная номинальная начисленная заработная плата работников в целом по экономике по субъектам Российской Федерации за 2000-2020 гг. Источник: Росстат (https://rosstat.gov.ru/labor_market_employment_salaries). Ссылка: https://docs.google.com/spreadsheets/d/1S1ORmz2W4QTG-d8odUOqT6Czu21NF2Vw/edit?usp=sharing&ouid=108685579276627434305&rtpof=true&sd=true
+2. city_population.rar - Численность населения по населенным пунктам России. Источник: Росстат (https://rosstat.gov.ru/folder/12781). Ссылка: https://drive.google.com/file/d/19hJI_zlTZboxSh_JwPrWt8vYx9lkNlM0/view?usp=sharing
+
+
+Для обучения использовали модель LightAutoML*
+
+
+1. LightAutoML, a project from the Sberbank AI Lab AutoML group, is a framework for automatic creation of classification and regression models.
+
+
+Authors: Alexander Ryzhkov, Anton Vakhrushev, Dmitry Simakov, Vasilii Bunakov, Rinchin Damdinov, Pavel Shvets, Alexander Kirilin
+
+
+2. Библиотеки
+!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
+import pandas as pd
+import numpy as np
+import typing
+import torch
+import matplotlib.pyplot as plt
+
+
+!pip install -U lightautoml
+from sklearn.model_selection import train_test_split
+from lightautoml.automl.presets.tabular_presets import TabularUtilizedAutoML
+from lightautoml.tasks import Task
\ No newline at end of file
diff --git a/Yapiki_publ/city_population.rar b/Yapiki_publ/city_population.rar
new file mode 100644
index 0000000..4cad566
Binary files /dev/null and b/Yapiki_publ/city_population.rar differ
diff --git a/Yapiki_publ/zarplaty.xlsx b/Yapiki_publ/zarplaty.xlsx
new file mode 100644
index 0000000..296673c
Binary files /dev/null and b/Yapiki_publ/zarplaty.xlsx differ