{"id":4390,"date":"2024-01-29T20:10:24","date_gmt":"2024-01-29T11:10:24","guid":{"rendered":"https:\/\/blog.since2020.jp\/?p=4390"},"modified":"2024-01-29T20:14:24","modified_gmt":"2024-01-29T11:14:24","slug":"%e3%81%98%e3%82%83%e3%82%89%e3%82%93%e3%81%ae%e5%8f%a3%e3%82%b3%e3%83%9f%e3%82%92%e3%82%b9%e3%82%af%e3%83%ac%e3%82%a4%e3%83%94%e3%83%b3%e3%82%b0%ef%bc%86%e5%8f%af%e8%a6%96%e5%8c%96%e3%81%97%e3%81%a6","status":"publish","type":"post","link":"https:\/\/since2020.jp\/media\/%e3%81%98%e3%82%83%e3%82%89%e3%82%93%e3%81%ae%e5%8f%a3%e3%82%b3%e3%83%9f%e3%82%92%e3%82%b9%e3%82%af%e3%83%ac%e3%82%a4%e3%83%94%e3%83%b3%e3%82%b0%ef%bc%86%e5%8f%af%e8%a6%96%e5%8c%96%e3%81%97%e3%81%a6\/","title":{"rendered":"\u3058\u3083\u3089\u3093\u306e\u53e3\u30b3\u30df\u3092\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\uff06\u53ef\u8996\u5316\u3057\u3066\u307f\u305f"},"content":{"rendered":"\n<p>\u3058\u3083\u3089\u3093\u306e\u53e3\u30b3\u30df\u3092\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\uff06\u53ef\u8996\u5316\u3057\u3066\u307f\u305f<\/p>\n\n\n<h2>\u306f\u3058\u3081\u306b<\/h2>\n<p>\u3058\u3083\u3089\u3093\u306f\u53e3\u30b3\u30df\u306e\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u306f\u8a31\u53ef\u3092\u51fa\u3057\u3066\u3044\u307e\u3059\u3002\uff082024\/01\/18\u6642\u70b9)<\/p>\r\n<p>\u306a\u306e\u3067\u7df4\u7fd2\u304c\u3066\u3089\u306b\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u3092\u3057\u3066\u307f\u307e\u3057\u3087\u3046\uff01<\/p>\r\n<p>\u79c1\u306fBeautifulSoup\u3092\u7528\u3044\u3066\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u3057\u307e\u3057\u305f\u3002\u96d1\u306a\u30b3\u30fc\u30c9\u306a\u306e\u3067\u3059\u307f\u307e\u305b\u3093\u3002<\/p>\r\n<p>Streamlit\u3067\u53ef\u8996\u5316\u3057\u305f\u8a18\u4e8b\u306f\u3053\u3061\u3089\u2193<\/p>\r\n<p><a href=\"https:\/\/blog.since2020.jp\/data_analysis\/google-colab%e3%81%8b%e3%82%89streamlit%e3%81%a7%e3%81%98%e3%82%83%e3%82%89%e3%82%93%e3%81%ae%e5%8f%a3%e3%82%b3%e3%83%9f%e5%88%86%e6%9e%90%e7%b5%90%e6%9e%9c%e3%82%92%e5%8f%af%e8%a6%96%e5%8c%96\/\">blog.since2020.jp\/data_analysis\/google-colab\u304b\u3089streamlit\u3067\u3058\u3083\u3089\u3093\u306e\u53e3\u30b3\u30df\u5206\u6790\u7d50\u679c\u3092\u53ef\u8996\u5316\/<\/a><\/p>\n\n<h2>\u3058\u3083\u3089\u3093\u306e\u53e3\u30b3\u30df\u3092\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0<\/h2>\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code>\r\nfrom bs4 import BeautifulSoup\r\nimport urllib\r\nimport pandas as pd\r\nimport requests\r\n\r\nhtml = requests.get('https:\/\/www.jalan.net\/yad382598\/kuchikomi\/?screenId=UWW3001&amp;yadNo=382598&amp;smlCd=136802&amp;distCd=01&amp;ccnt=lean-kuchikomi-link-2')\r\nsoup = BeautifulSoup(html.content,'html.parser')\r\n\r\nurl_list = ['https:\/\/www.jalan.net\/yad382598\/kuchikomi\/?screenId=UWW3001&amp;yadNo=382598&amp;smlCd=136802&amp;distCd=01&amp;ccnt=lean-kuchikomi-link-2']\r\n\r\n# \u5404 URL \u304b\u3089\u53d6\u5f97\u3057\u305f\u30c7\u30fc\u30bf\u3092\u683c\u7d0d\u3059\u308b\u305f\u3081\u306e\u30ea\u30b9\u30c8\r\npost_body_text = []\r\nlead_text = []\r\nplan_info_text = []\r\npost_date_text = []\r\nrate_list_text = []\r\nhotel_name_text = []\r\nintegrated_rate_text = []\r\ncat_table_text = []\r\nc_label_text = []\r\n\r\nfor row in url_list:\r\nrow = str(row)\r\nhtml = urllib.request.urlopen(row)\r\nsoup = BeautifulSoup(html, 'html.parser')\r\n\r\n# \u5404\u8981\u7d20\u3092\u62bd\u51fa\r\npost_body = soup.find_all(\"p\", class_='jlnpc-kuchikomiCassette__postBody')\r\nlead = soup.find_all(\"p\", class_='jlnpc-kuchikomiCassette__lead')\r\nplan_info = soup.find_all(\"dl\", class_='jlnpc-kuchikomiCassette__planInfoList')\r\npost_date = soup.find_all(\"p\", class_='jlnpc-kuchikomiCassette__postDate')\r\nrate_list = soup.find_all(\"dl\", class_='jlnpc-kuchikomiCassette__rateList')\r\nhotel_name = soup.find_all(\"p\", class_='jlnpc-styleguide-scope jlnpc-yado__subTitle')\r\nrating = soup.find_all('span', class_='jlnpc-kuchikomi__point')\r\ncat_table = soup.find_all(\"table\", class_='jlnpc-kuchikomi__catTable')\r\nc_label=soup.find_all(\"span\", class_='c-label')\r\n\r\n# \u30c6\u30ad\u30b9\u30c8\u3092\u62bd\u51fa\u3057\u3001\u30ea\u30b9\u30c8\u306b\u683c\u7d0d\r\npost_body_text = [element.get_text().strip() for element in post_body]\r\nlead_text = [element.get_text().strip() for element in lead]\r\nplan_info_text = [element.get_text().strip() for element in plan_info]\r\npost_date_text = [element.get_text().strip() for element in post_date]\r\nrate_list_text = [element.get_text().strip() for element in rate_list]\r\nhotel_name_text = [element.get_text().strip() for element in hotel_name]\r\nintegrated_rate_text = [element.get_text().strip() for element in rating]\r\ncat_table_text = [element.get_text().strip() for element in cat_table]\r\nc_label_text = [element.get_text().strip() for element in c_label]\r\n\r\nimport re\r\n\r\nsex = []\r\nage = []\r\npurpose = [] # \u5229\u7528\u7528\u9014\r\nroom_type = [] # \u90e8\u5c4b\u30bf\u30a4\u30d7\r\nmeal_type = [] # \u98df\u4e8b\u30bf\u30a4\u30d7\r\npost_date_text_v2 = [] #\u6295\u7a3f\u65e5\r\nperiods = [] # \u6642\u671f\r\nplans = [] # \u30d7\u30e9\u30f3\u540d\r\nprices = [] # \u4fa1\u683c\u5e2f\r\nlow_prices = [] # \u4fa1\u683c\u5e2f\r\nhigh_prices = [] # \u4fa1\u683c\u5e2f\r\nroom_ratings = []\r\nbath_ratings = []\r\nbreakfast_ratings = []\r\ndinner_ratings = []\r\nservice_ratings = []\r\ncleanliness_ratings = []\r\noverall_room_rating = []\r\noverall_bath_rating = []\r\noverall_breakfast_rating = []\r\noverall_dinner_rating = []\r\noverall_service_rating = []\r\noverall_cleanliness_rating = []\r\n\r\nfor i in range(0, len(c_label_text), 4): # 4\u3064\u306e\u8981\u7d20\u3054\u3068\u306b\u30eb\u30fc\u30d7\r\n# \u6027\u5225\u3068\u5e74\u9f62\u306e\u62bd\u51fa\r\nsex_age = c_label_text[i].strip().split('\/')\r\nif len(sex_age) == 2:\r\nsex.append(sex_age[0])\r\nage.append(sex_age[1])\r\nelse:\r\nsex.append(None)\r\nage.append(None)\r\n\r\n# \u5229\u7528\u7528\u9014\u306e\u62bd\u51fa\r\npurpose.append(c_label_text[i + 1].strip() if i + 1 &lt; len(c_label_text) else None)\r\n\r\n# \u90e8\u5c4b\u30bf\u30a4\u30d7\u306e\u62bd\u51fa\r\nroom_type.append(c_label_text[i + 2].strip() if i + 2 &lt; len(c_label_text) else None)\r\n\r\n# \u98df\u4e8b\u30bf\u30a4\u30d7\u306e\u62bd\u51fa\r\nmeal_type.append(c_label_text[i + 3].strip() if i + 3 &lt; len(c_label_text) else None)\r\n\r\n# \u6b63\u898f\u8868\u73fe\u30d1\u30bf\u30fc\u30f3\u306e\u5b9a\u7fa9\r\ndate_pattern = r'\\d{4}\/\\d{1,2}\/\\d{1,2}'\r\n\r\nfor text in post_date_text:\r\n# \u6b63\u898f\u8868\u73fe\u3092\u4f7f\u7528\u3057\u3066\u65e5\u4ed8\u3092\u62bd\u51fa\r\nmatch = re.search(date_pattern, text)\r\nif match:\r\n# \u62bd\u51fa\u3057\u305f\u65e5\u4ed8\u3092\u65b0\u3057\u3044\u30ea\u30b9\u30c8\u306b\u8ffd\u52a0\r\npost_date_text_v2.append(match.group())\r\n\r\nfor item in plan_info_text:\r\nif item.startswith('\u6642\u671f'):\r\nperiods.append(item.split('\\n')[1].split('\u5bbf\u6cca')[0]) # \u6642\u671f\u306e\u60c5\u5831\u3092\u8ffd\u52a0\r\nelif item.startswith('\u30d7\u30e9\u30f3'):\r\nplans.append(item.split('\\n')[1]) # \u30d7\u30e9\u30f3\u540d\u306e\u60c5\u5831\u3092\u8ffd\u52a0\r\nelif item.startswith('\u4fa1\u683c\u5e2f'):\r\nprices.append(item.split('\\n')[-1].split('\u5186')[0]) # \u4fa1\u683c\u5e2f\u306e\u60c5\u5831\u3092\u8ffd\u52a0\r\nif '\u301c' in item.split('\\n')[-1].split('\u5186')[0]:\r\nlow_prices.append(item.split('\\n')[-1].split('\u301c')[0])\r\nhigh_prices.append(item.split('\\n')[-1].split('\u5186')[0].split('\u301c')[1])\r\nelif '\uff5e' in item.split('\\n')[-1].split('\u5186')[0]:\r\nlow_prices.append(item.split('\\n')[-1].split('\uff5e')[0])\r\nhigh_prices.append(item.split('\\n')[-1].split('\u5186')[0].split('\uff5e')[1])\r\n\r\nfor item in rate_list_text:\r\nlines = item.split('\\n') # \u5404\u884c\u306b\u5206\u5272\r\nfor i in range(0, len(lines), 2): # 2\u3064\u306e\u8981\u7d20\u3054\u3068\u306b\u30eb\u30fc\u30d7\uff08\u30ab\u30c6\u30b4\u30ea\u30fc\u540d\u3068\u8a55\u4fa1\uff09\r\ncategory = lines[i]\r\nrating = lines[i + 1]\r\nif '\u90e8\u5c4b' in category:\r\nroom_ratings.append(rating)\r\nelif '\u98a8\u5442' in category:\r\nbath_ratings.append(rating)\r\nelif '\u6599\u7406(\u671d\u98df)' in category:\r\nbreakfast_ratings.append(rating)\r\nelif '\u6599\u7406(\u5915\u98df)' in category:\r\ndinner_ratings.append(rating)\r\nelif '\u63a5\u5ba2\u30fb\u30b5\u30fc\u30d3\u30b9' in category:\r\nservice_ratings.append(rating)\r\nelif '\u6e05\u6f54\u611f' in category:\r\ncleanliness_ratings.append(rating)\r\n\r\nfor item in cat_table_text:\r\nlines = item.split('\\n')\r\n# \u5404\u30ab\u30c6\u30b4\u30ea\u306e\u8a55\u4fa1\u3092\u62bd\u51fa\r\noverall_room_rating = lines[lines.index('\u90e8\u5c4b') + 1]\r\noverall_bath_rating = lines[lines.index('\u98a8\u5442') + 1]\r\noverall_breakfast_rating = lines[lines.index('\u6599\u7406\uff08\u671d\u98df\uff09') + 1]\r\noverall_dinner_rating = lines[lines.index('\u6599\u7406\uff08\u5915\u98df\uff09') + 1]\r\noverall_service_rating = lines[lines.index('\u63a5\u5ba2\u30fb\u30b5\u30fc\u30d3\u30b9') + 1]\r\noverall_cleanliness_rating = lines[lines.index('\u6e05\u6f54\u611f') + 1]\r\n\r\n# DataFrame\u3092\u4f5c\u6210\r\ndata = pd.DataFrame({\r\n'sex': sex,\r\n'age': age,\r\n'purpose': purpose,\r\n'room_type': room_type,\r\n'meal_type': meal_type,\r\n'post_body': post_body_text,\r\n'post_lead': lead_text,\r\n'post_date': post_date_text_v2,\r\n'periods': periods,\r\n'plans' : plans,\r\n'prices': prices,\r\n'low_prices' : low_prices,\r\n'high_prices' : high_prices,\r\n'room_ratings' : room_ratings,\r\n'bath_ratings' : bath_ratings,\r\n'breakfast_ratings' : breakfast_ratings,\r\n'dinner_ratings' : dinner_ratings,\r\n'service_ratings' : service_ratings,\r\n'cleanliness_ratings' : cleanliness_ratings\r\n})\r\n\r\n\r\n# \u30db\u30c6\u30eb\u540d\u3060\u3051\u3092\u62bd\u51fa\r\nhotel_names = [name.split('\u306e\u30af\u30c1\u30b3\u30df\u30fb\u8a55\u4fa1')[0] for name in hotel_name_text]\r\n\r\n# \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u306e\u3059\u3079\u3066\u306e\u884c\u306b\u540c\u3058\u30db\u30c6\u30eb\u540d\u3092\u9069\u7528\r\ndata['\u30db\u30c6\u30eb\u540d'] = hotel_names*len(data)\r\ndata['\u7dcf\u5408\u8a55\u4fa1'] = integrated_rate_text*len(data)\r\n# \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u306b\u5404\u30ab\u30c6\u30b4\u30ea\u306e\u8a55\u4fa1\u306e\u5217\u3092\u8ffd\u52a0\r\ndata['\u7dcf\u5408\u90e8\u5c4b\u8a55\u4fa1'] = overall_room_rating\r\ndata['\u7dcf\u5408\u98a8\u5442\u8a55\u4fa1'] = overall_bath_rating\r\ndata['\u7dcf\u5408\u671d\u98df\u8a55\u4fa1'] = overall_breakfast_rating\r\ndata['\u7dcf\u5408\u5915\u98df\u8a55\u4fa1'] = overall_dinner_rating\r\ndata['\u7dcf\u5408\u63a5\u5ba2\u30fb\u30b5\u30fc\u30d3\u30b9\u8a55\u4fa1'] = overall_service_rating\r\ndata['\u7dcf\u5408\u6e05\u6f54\u611f\u8a55\u4fa1'] = overall_cleanliness_rating<\/code><\/pre>\r\n<p>\u3053\u308c\u3092\u884c\u3046\u3053\u3068\u3067\u53e3\u30b3\u30df\u304c\u30c7\u30fc\u30bf\u306b\u683c\u7d0d\u3067\u304d\u307e\u3057\u305f\u3002<\/p>\r\n<\/div>\n\n<h2>\u9867\u5ba2\u5c5e\u6027\u3054\u3068\u306b\u53e3\u30b3\u30df\u8a55\u4fa1\u306e\u5206\u5e03\u3092\u53ef\u8996\u5316<\/h2>\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code># 'prices' \u3054\u3068\u306e\u6027\u5225\u5272\u5408\u3092\u8a08\u7b97\r\nprices_sex_distribution = data.groupby('prices')['sex'].value_counts(normalize=True).unstack().fillna(0)\r\n\r\n# \u6a2a\u68d2\u30b0\u30e9\u30d5\u3067\u8868\u793a\r\nprices_sex_distribution.plot(kind='barh', stacked=True, figsize=(10, 6))\r\nplt.title('\u4fa1\u683c\u5e2f\u3054\u3068\u306e\u6027\u5225\u5272\u5408')\r\nplt.xlabel('\u5272\u5408')\r\nplt.ylabel('\u4fa1\u683c\u5e2f')\r\nplt.legend(title='\u6027\u5225')\r\nplt.show()\r\n\r\n# \u5e74\u9f62\u3054\u3068\u306e\u6027\u5225\u5272\u5408\u3092\u8a08\u7b97\r\nage_sex_distribution = data.groupby('age')['sex'].value_counts(normalize=True).unstack().fillna(0)\r\n\r\n# \u6a2a\u68d2\u30b0\u30e9\u30d5\u3067\u8868\u793a\r\nage_sex_distribution.sort_index().plot(kind='barh', stacked=True, figsize=(10, 6))\r\nplt.title('\u5e74\u9f62\u3054\u3068\u306e\u6027\u5225\u5272\u5408')\r\nplt.xlabel('\u5272\u5408')\r\nplt.ylabel('\u5e74\u9f62')\r\nplt.legend(title='\u6027\u5225')\r\nplt.show()\r\n\r\n# \u6027\u5225\u3054\u3068\u306e\u5229\u7528\u7528\u9014\u5272\u5408\u3092\u8a08\u7b97\r\nsex_purpose_distribution = data.groupby('sex')['purpose'].value_counts(normalize=True).unstack().fillna(0)\r\n\r\n# \u6a2a\u68d2\u30b0\u30e9\u30d5\u3067\u8868\u793a\r\nsex_purpose_distribution.plot(kind='barh', stacked=True, figsize=(10, 6))\r\nplt.title('\u6027\u5225\u3054\u3068\u306e\u5229\u7528\u7528\u9014\u5272\u5408')\r\nplt.xlabel('\u5272\u5408')\r\nplt.ylabel('\u6027\u5225')\r\nplt.legend(title='\u5229\u7528\u7528\u9014')\r\nplt.show()\r\n\r\n# \u8a55\u4fa1\u30c7\u30fc\u30bf\u304c\u6587\u5b57\u5217\u306e\u5834\u5408\u3001\u6570\u5024\u578b\u306b\u5909\u63db\r\nrating_columns = ['room_ratings', 'bath_ratings', 'breakfast_ratings', 'dinner_ratings', 'service_ratings', 'cleanliness_ratings']\r\nfor column in rating_columns:\r\ndata[column] = pd.to_numeric(data[column], errors='coerce')\r\n\r\n# \u6027\u5225\u3054\u3068\u306e\u8a55\u4fa1\u306e\u5206\u5e03\u3092\u7bb1\u3072\u3052\u56f3\u3067\u8868\u793a\r\nplt.figure(figsize=(12, 8))\r\ndata.boxplot(column=rating_columns, by='sex')\r\nplt.xlabel('\u6027\u5225')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.suptitle('') # \u30b5\u30d6\u30bf\u30a4\u30c8\u30eb\u3092\u524a\u9664\r\nplt.show()\r\n\r\n# \u5e74\u9f62\u3054\u3068\u306e\u8a55\u4fa1\u306e\u5206\u5e03\u3092\u7bb1\u3072\u3052\u56f3\u3067\u8868\u793a\r\nplt.figure(figsize=(12, 8))\r\ndata.boxplot(column=rating_columns, by='age')\r\nplt.xlabel('\u5e74\u9f62')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.suptitle('') # \u30b5\u30d6\u30bf\u30a4\u30c8\u30eb\u3092\u524a\u9664\r\nplt.show()\r\n\r\n# \u5229\u7528\u7528\u9014\u3054\u3068\u306e\u8a55\u4fa1\u306e\u5206\u5e03\u3092\u7bb1\u3072\u3052\u56f3\u3067\u8868\u793a\r\nplt.figure(figsize=(12, 8))\r\ndata.boxplot(column=rating_columns, by='purpose')\r\nplt.xlabel('\u5229\u7528\u7528\u9014')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.suptitle('') # \u30b5\u30d6\u30bf\u30a4\u30c8\u30eb\u3092\u524a\u9664\r\nplt.show()\r\n\r\n# \u5e74\u9f62\u3068\u6027\u5225\u3067\u30b0\u30eb\u30fc\u30d7\u5316\u3057\u3001\u7bb1\u3072\u3052\u56f3\u3067\u8868\u793a\r\nfor column in rating_columns:\r\nplt.figure(figsize=(12, 8))\r\ndata.boxplot(column=column, by=['age', 'sex'])\r\nplt.title(f'{column} - \u5e74\u9f62\u00d7\u6027\u5225\u3054\u3068\u306e\u5206\u5e03')\r\nplt.suptitle('')\r\nplt.xlabel('\u5e74\u9f62 \/ \u6027\u5225')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.xticks(rotation=45) # X\u8ef8\u306e\u30e9\u30d9\u30eb\u309245\u5ea6\u56de\u8ee2\r\nplt.show()\r\n\r\n# \u5e74\u9f62\u3001\u6027\u5225\u3001\u5229\u7528\u7528\u9014\u3067\u30b0\u30eb\u30fc\u30d7\u5316\u3057\u3001\u7bb1\u3072\u3052\u56f3\u3067\u8868\u793a\r\nfor column in rating_columns:\r\nplt.figure(figsize=(12, 8))\r\ndata.boxplot(column=column, by=['age', 'sex', 'purpose'])\r\nplt.title(f'{column} - \u5e74\u9f62\u00d7\u6027\u5225\u00d7\u5229\u7528\u7528\u9014\u3054\u3068\u306e\u5206\u5e03')\r\nplt.suptitle('')\r\nplt.xlabel('\u5e74\u9f62 \/ \u6027\u5225 \/ \u5229\u7528\u7528\u9014')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.xticks(rotation=45) # X\u8ef8\u306e\u30e9\u30d9\u30eb\u309245\u5ea6\u56de\u8ee2\r\nplt.show()\r\n\r\n# 'plans' \u3054\u3068\u306e\u8a55\u4fa1\u306e\u5206\u5e03\u3092\u7bb1\u3072\u3052\u56f3\u3067\u8868\u793a\r\nfor column in rating_columns:\r\nplt.figure(figsize=(12, 8))\r\ndata.boxplot(column=column, by='plans')\r\nplt.title(f'{column} - \u30d7\u30e9\u30f3\u3054\u3068\u306e\u5206\u5e03')\r\nplt.suptitle('')\r\nplt.xlabel('\u30d7\u30e9\u30f3')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.xticks(rotation=45) # X\u8ef8\u306e\u30e9\u30d9\u30eb\u309245\u5ea6\u56de\u8ee2\r\nplt.show()\r\n\r\n# 'prices' \u3054\u3068\u306e\u8a55\u4fa1\u306e\u5206\u5e03\u3092\u7bb1\u3072\u3052\u56f3\u3067\u8868\u793a\r\nfor column in rating_columns:\r\nplt.figure(figsize=(12, 8))\r\ndata.boxplot(column=column, by='prices')\r\nplt.title(f'{column} - \u4fa1\u683c\u5e2f\u3054\u3068\u306e\u5206\u5e03')\r\nplt.suptitle('')\r\nplt.xlabel('\u4fa1\u683c\u5e2f')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.xticks(rotation=45) # X\u8ef8\u306e\u30e9\u30d9\u30eb\u309245\u5ea6\u56de\u8ee2\r\nplt.show()\r\n\r\n# 'prices' \u3068\u6027\u5225\u3054\u3068\u306e\u8a55\u4fa1\u306e\u5206\u5e03\u3092\u7bb1\u3072\u3052\u56f3\u3067\u8868\u793a\r\nfor column in rating_columns:\r\nplt.figure(figsize=(12, 8))\r\ndata.boxplot(column=column, by=['prices', 'sex'])\r\nplt.title(f'{column} - \u4fa1\u683c\u5e2f\u00d7\u6027\u5225\u3054\u3068\u306e\u5206\u5e03')\r\nplt.suptitle('')\r\nplt.xlabel('\u4fa1\u683c\u5e2f \/ \u6027\u5225')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.xticks(rotation=45) # X\u8ef8\u306e\u30e9\u30d9\u30eb\u309245\u5ea6\u56de\u8ee2\r\nplt.show()<\/code><\/pre>\r\n<p>\u3053\u306e\u30b3\u30fc\u30c9\u3092\u5b9f\u884c\u3059\u308b\u3053\u3068\u3067\u4f8b\u3048\u3070\u4ee5\u4e0b\u306e\u3088\u3046\u306a\u30b0\u30e9\u30d5\u304c\u51fa\u529b\u3067\u304d\u307e\u3059\u3002<\/p>\r\n<\/div>\n\n<h2>EDA\u3092\u884c\u3046<\/h2>\n<p>\u3069\u306e\u3088\u3046\u306a\u30c7\u30fc\u30bf\u304c\u683c\u7d0d\u3055\u308c\u3066\u3044\u308b\u306e\u304b\u8efd\u304fEDA\u3092\u884c\u3044\u307e\u3057\u305f\u3002<\/p>\r\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code># \u6027\u5225\u306e\u5272\u5408\u3092\u8a08\u7b97\r\nsex_counts = data['sex'].value_counts()\r\n\r\n# \u5186\u30b0\u30e9\u30d5\u3092\u63cf\u753b\r\nplt.figure(figsize=(8, 6))\r\nsex_counts.plot(kind='pie', autopct='%1.1f%%')\r\nplt.title('\u6027\u5225\u306e\u5272\u5408')\r\nplt.ylabel('') # Y\u8ef8\u30e9\u30d9\u30eb\u3092\u975e\u8868\u793a\u306b\r\nplt.show()\r\n\r\n# \u5e74\u9f62\u306e\u5206\u5e03\u3092\u8a08\u7b97\r\nage_counts = data['age'].value_counts()\r\n\r\n# \u68d2\u30b0\u30e9\u30d5\u3092\u63cf\u753b\r\nplt.figure(figsize=(10, 6))\r\nage_counts.plot(kind='bar')\r\nplt.title('\u5e74\u9f62\u306e\u5206\u5e03')\r\nplt.xlabel('\u5e74\u9f62')\r\nplt.ylabel('\u4eba\u6570')\r\nplt.show()\r\n\r\npurpose_counts = data['purpose'].value_counts()\r\nplt.figure(figsize=(8, 6))\r\npurpose_counts.plot(kind='pie', autopct='%1.1f%%')\r\nplt.title('\u5229\u7528\u7528\u9014\u306e\u5272\u5408')\r\nplt.ylabel('')\r\nplt.show()\r\n\r\nroom_type_counts = data['room_type'].value_counts()\r\nplt.figure(figsize=(10, 6))\r\nroom_type_counts.plot(kind='bar')\r\nplt.title('\u90e8\u5c4b\u30bf\u30a4\u30d7\u306e\u5206\u5e03')\r\nplt.xlabel('\u90e8\u5c4b\u30bf\u30a4\u30d7')\r\nplt.ylabel('\u4eba\u6570')\r\nplt.show()\r\n\r\nmeal_type_counts = data['meal_type'].value_counts()\r\nplt.figure(figsize=(10, 6))\r\nmeal_type_counts.plot(kind='bar')\r\nplt.title('\u98df\u4e8b\u30bf\u30a4\u30d7\u306e\u5206\u5e03')\r\nplt.xlabel('\u98df\u4e8b\u30bf\u30a4\u30d7')\r\nplt.ylabel('\u4eba\u6570')\r\nplt.show()\r\n\r\n# \u4fa1\u683c\u5e2f\u306e\u30c7\u30fc\u30bf\u3092\u6570\u5024\u306b\u5909\u63db\u3059\u308b\u5fc5\u8981\u304c\u3042\u308b\u5834\u5408\u304c\u3042\u308a\u307e\u3059\r\n# \u4f8b: data['low_prices'] = pd.to_numeric(data['low_prices'], errors='coerce')\r\n\r\nplt.figure(figsize=(12, 6))\r\nplt.plot(data['low_prices'], label='\u6700\u4f4e\u4fa1\u683c')\r\nplt.plot(data['high_prices'], label='\u6700\u9ad8\u4fa1\u683c')\r\nplt.title('\u4fa1\u683c\u5e2f\u306e\u5909\u52d5')\r\nplt.xlabel('\u30b5\u30f3\u30d7\u30eb')\r\nplt.ylabel('\u4fa1\u683c')\r\nplt.legend()\r\nplt.show()\r\n\r\n# \u7bb1\u3072\u3052\u56f3\u306e\u63cf\u753b\r\n\r\n# \u8a55\u4fa1\u30c7\u30fc\u30bf\u304c\u6587\u5b57\u5217\u306e\u5834\u5408\u3001\u6570\u5024\u578b\u306b\u5909\u63db\r\ndata['room_ratings'] = pd.to_numeric(data['room_ratings'], errors='coerce')\r\ndata['bath_ratings'] = pd.to_numeric(data['bath_ratings'], errors='coerce')\r\ndata['breakfast_ratings'] = pd.to_numeric(data['breakfast_ratings'], errors='coerce')\r\ndata['dinner_ratings'] = pd.to_numeric(data['dinner_ratings'], errors='coerce')\r\ndata['service_ratings'] = pd.to_numeric(data['service_ratings'], errors='coerce')\r\ndata['cleanliness_ratings'] = pd.to_numeric(data['cleanliness_ratings'], errors='coerce')\r\n\r\n# \u7bb1\u3072\u3052\u56f3\u306e\u63cf\u753b\r\nplt.figure(figsize=(10, 6))\r\ndata[['room_ratings', 'bath_ratings', 'breakfast_ratings', 'dinner_ratings', 'service_ratings', 'cleanliness_ratings']].plot(kind='box')\r\nplt.title('\u5404\u8a55\u4fa1\u306e\u7bb1\u3072\u3052\u56f3')\r\nplt.ylabel('\u8a55\u4fa1')\r\nplt.show()\r\n\r\n# \u5e74\u9f62\u5c64\u3054\u3068\u306b\u5229\u7528\u7528\u9014\u306e\u5272\u5408\u3092\u8a08\u7b97\r\npurpose_by_age = data.groupby('age')['purpose'].value_counts(normalize=True).unstack().fillna(0)\r\n\r\n# \u6a2a\u68d2\u30b0\u30e9\u30d5\u306e\u63cf\u753b\r\npurpose_by_age.plot(kind='barh', stacked=True, figsize=(10, 6))\r\nplt.title('\u5e74\u9f62\u5c64\u3054\u3068\u306e\u5229\u7528\u7528\u9014\u306e\u5272\u5408')\r\nplt.xlabel('\u5272\u5408')\r\nplt.ylabel('\u5e74\u9f62\u5c64')\r\nplt.legend(title='\u5229\u7528\u7528\u9014')\r\nplt.show()\r\n\r\n# 'post_date' \u30ab\u30e9\u30e0\u3092\u65e5\u4ed8\u578b\u306b\u5909\u63db\r\ndata['post_date'] = pd.to_datetime(data['post_date'])\r\n\r\n# \u65e5\u4ed8\u3054\u3068\u306b\u30ec\u30d3\u30e5\u30fc\u6570\u3092\u96c6\u8a08\r\nreviews_per_day = data['post_date'].value_counts().sort_index()\r\n\r\n# \u6298\u308c\u7dda\u30b0\u30e9\u30d5\u306e\u63cf\u753b\r\nplt.figure(figsize=(10, 6))\r\nreviews_per_day.plot(kind='line')\r\nplt.title('\u65e5\u4ed8\u3054\u3068\u306e\u30ec\u30d3\u30e5\u30fc\u6295\u7a3f\u6570\u306e\u63a8\u79fb')\r\nplt.xlabel('\u65e5\u4ed8')\r\nplt.ylabel('\u30ec\u30d3\u30e5\u30fc\u6570')\r\nplt.grid(True)\r\nplt.show()\r\n\r\n# \u9031\u3054\u3068\u306b\u30ec\u30d3\u30e5\u30fc\u6570\u3092\u96c6\u8a08\r\n# 'post_date' \u3092\u9031\u306e\u59cb\u307e\u308a\uff08\u6708\u66dc\u65e5\uff09\u306b\u30de\u30c3\u30d4\u30f3\u30b0\r\ndata['week'] = data['post_date'].dt.to_period('W').apply(lambda r: r.start_time)\r\nreviews_per_week = data.groupby('week').size()\r\n\r\n# \u6298\u308c\u7dda\u30b0\u30e9\u30d5\u306e\u63cf\u753b\r\nplt.figure(figsize=(10, 6))\r\nreviews_per_week.plot(kind='line', marker='o')\r\nplt.title('\u9031\u3054\u3068\u306e\u30ec\u30d3\u30e5\u30fc\u6295\u7a3f\u6570\u306e\u63a8\u79fb')\r\nplt.xlabel('\u9031\uff08\u6708\u66dc\u65e5\u958b\u59cb\uff09')\r\nplt.ylabel('\u30ec\u30d3\u30e5\u30fc\u6570')\r\nplt.grid(True)\r\nplt.show()<\/code><\/pre>\r\n<\/div>\n\n<h2>\u983b\u51fa\u5358\u8a9e\u3092\u53ef\u8996\u5316<\/h2>\n<p>\u983b\u51fa\u5358\u8a9e\u3092\u53ef\u8996\u5316\u3057\u3066\u307f\u307e\u3059\u3002\u3057\u304b\u3057\u3001\u3053\u306e\u307e\u307e\u3060\u3068\u5206\u6790\u306b\u306f\u3042\u307e\u308a\u4f7f\u3048\u306a\u3044\u3068\u601d\u3046\u306e\u3067\u629c\u304d\u51fa\u3059\u54c1\u8a5e\u3084\u30ad\u30fc\u30ef\u30fc\u30c9\u306a\u3069\u306e\u8a2d\u5b9a\u306f\u3082\u3046\u5c11\u3057\u8003\u3048\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u306d\u3002<\/p>\r\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code>!pip install janome\r\n\r\nfrom janome.tokenizer import Tokenizer\r\nfrom collections import Counter\r\n\r\n# Janome\u306e\u5f62\u614b\u7d20\u89e3\u6790\u5668\u306e\u521d\u671f\u5316\r\ntokenizer = Tokenizer()\r\n\r\n# \u30b9\u30c8\u30c3\u30d7\u30ef\u30fc\u30c9\u306e\u8a2d\u5b9a\uff08\u72ec\u81ea\u306b\u5b9a\u7fa9\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\uff09\r\nstop_words = {'\u3053\u308c', '\u306f', '\u3067\u3059', '\u304c', '\u3002', '\u3001', '\u3057', '\u306e', '\u30fb'}\r\n\r\n# \u30ef\u30fc\u30c9\u306e\u983b\u51fa\u5206\u6790\r\nwords = []\r\nfor post in data['post_body']:\r\ntokens = tokenizer.tokenize(post)\r\nwords.extend([token.surface for token in tokens if token.surface not in stop_words and token.part_of_speech.split(',')[0] in ['\u540d\u8a5e','\u52d5\u8a5e', '\u5f62\u5bb9\u8a5e']])\r\n\r\nword_counts = Counter(words)\r\n\r\n# \u4e0a\u4f4d20\u5358\u8a9e\u3092\u53d6\u5f97\r\ntop_words = word_counts.most_common(20)\r\n\r\n# \u5358\u8a9e\u3068\u305d\u306e\u30ab\u30a6\u30f3\u30c8\u3092\u5225\u3005\u306e\u30ea\u30b9\u30c8\u306b\u5206\u3051\u308b\r\nwords, counts = zip(*top_words)\r\n\r\n# \u68d2\u30b0\u30e9\u30d5\u306e\u4f5c\u6210\r\nplt.figure(figsize=(10, 8))\r\nplt.bar(words, counts)\r\nplt.xlabel('\u5358\u8a9e')\r\nplt.ylabel('\u983b\u5ea6')\r\nplt.xticks(rotation=90) # X\u8ef8\u306e\u30e9\u30d9\u30eb\u309290\u5ea6\u56de\u8ee2\u3057\u3066\u8aad\u307f\u3084\u3059\u304f\u3059\u308b\r\nplt.title('\u4e0a\u4f4d20\u4f4d\u306e\u983b\u51fa\u5358\u8a9e')\r\nplt.show()<\/code><\/pre>\r\n<\/div>\n\n<h2>\u304a\u308f\u308a\u306b<\/h2>\n<p>\u5225\u306e\u8a18\u4e8b\u3067\u3053\u308c\u3089\u306e\u7d50\u679c\u3092Google Colaboratory<span data-token-index=\"1\" class=\"discussion-id-f849d5d4-8e47-4537-8b5f-2cea3b3daeb5 notion-enable-hover\">\u304b\u3089<\/span>Streamlit\u3067\u53ef\u8996\u5316\u3059\u308b\u65b9\u6cd5\u3092<a href=\"https:\/\/blog.since2020.jp\/data_analysis\/google-colab%e3%81%8b%e3%82%89streamlit%e3%81%a7%e3%81%98%e3%82%83%e3%82%89%e3%82%93%e3%81%ae%e5%8f%a3%e3%82%b3%e3%83%9f%e5%88%86%e6%9e%90%e7%b5%90%e6%9e%9c%e3%82%92%e5%8f%af%e8%a6%96%e5%8c%96\/\">\u3053\u3061\u3089\u306e\u8a18\u4e8b\u3067<\/a>\u89e3\u8aac\u3057\u3066\u3044\u307e\u3059\u306e\u3067\u3001\u3088\u304b\u3063\u305f\u3089\u3054\u89a7\u4e0b\u3055\u3044\u3002<!-- notionvc: 13b57ee8-16ea-41e3-b7e7-7fe1925721e5 --><\/p>\r\n<p><a href=\"https:\/\/blog.since2020.jp\/data_analysis\/google-colab%e3%81%8b%e3%82%89streamlit%e3%81%a7%e3%81%98%e3%82%83%e3%82%89%e3%82%93%e3%81%ae%e5%8f%a3%e3%82%b3%e3%83%9f%e5%88%86%e6%9e%90%e7%b5%90%e6%9e%9c%e3%82%92%e5%8f%af%e8%a6%96%e5%8c%96\/\">blog.since2020.jp\/data_analysis\/google-colab\u304b\u3089streamlit\u3067\u3058\u3083\u3089\u3093\u306e\u53e3\u30b3\u30df\u5206\u6790\u7d50\u679c\u3092\u53ef\u8996\u5316\/<\/a><\/p>","protected":false},"excerpt":{"rendered":"<p>\u3058\u3083\u3089\u3093\u306e\u53e3\u30b3\u30df\u3092\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\uff06\u53ef\u8996\u5316\u3057\u3066\u307f\u305f \u306f\u3058\u3081\u306b \u3058\u3083\u3089\u3093\u306f\u53e3\u30b3\u30df\u306e\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u306f\u8a31\u53ef\u3092\u51fa\u3057\u3066\u3044\u307e\u3059\u3002\uff082024\/01\/18\u6642\u70b9) \u306a\u306e\u3067\u7df4\u7fd2\u304c\u3066\u3089\u306b\u30b9\u30af\u30ec\u30a4\u30d4\u30f3\u30b0\u3092\u3057\u3066\u307f\u307e\u3057\u3087\u3046\uff01 \u79c1\u306fBeautifulS [&hellip;]<\/p>\n","protected":false},"author":83,"featured_media":4262,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"content-type":"","swell_btn_cv_data":"","footnotes":"","_wp_rev_ctl_limit":""},"categories":[1246],"tags":[331,405,39],"class_list":["post-4390","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-data-infrastructure","tag-python","tag-405","tag-39"],"_links":{"self":[{"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/posts\/4390","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/users\/83"}],"replies":[{"embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/comments?post=4390"}],"version-history":[{"count":0,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/posts\/4390\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/media\/4262"}],"wp:attachment":[{"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/media?parent=4390"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/categories?post=4390"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/tags?post=4390"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}