{"id":3632,"date":"2023-07-20T17:19:42","date_gmt":"2023-07-20T08:19:42","guid":{"rendered":"https:\/\/blog.since2020.jp\/?p=3632"},"modified":"2023-07-20T17:19:42","modified_gmt":"2023-07-20T08:19:42","slug":"%e5%95%86%e5%93%81%e3%83%ac%e3%83%93%e3%83%a5%e3%83%bc%e3%83%87%e3%83%bc%e3%82%bf%e3%81%8b%e3%82%89tf-idf%e3%83%99%e3%82%af%e3%83%88%e3%83%ab%e3%82%92%e6%8a%bd%e5%87%ba%e3%81%97%e3%80%81json%e3%83%95","status":"publish","type":"post","link":"https:\/\/since2020.jp\/media\/%e5%95%86%e5%93%81%e3%83%ac%e3%83%93%e3%83%a5%e3%83%bc%e3%83%87%e3%83%bc%e3%82%bf%e3%81%8b%e3%82%89tf-idf%e3%83%99%e3%82%af%e3%83%88%e3%83%ab%e3%82%92%e6%8a%bd%e5%87%ba%e3%81%97%e3%80%81json%e3%83%95\/","title":{"rendered":"\u5546\u54c1\u30ec\u30d3\u30e5\u30fc\u30c7\u30fc\u30bf\u304b\u3089TF-IDF\u30d9\u30af\u30c8\u30eb\u3092\u62bd\u51fa\u3057\u3001JSON\u30d5\u30a1\u30a4\u30eb\u306b\u66f8\u304d\u51fa\u3059\u65b9\u6cd5"},"content":{"rendered":"\n<p>Vertex AI\u00a0Matching Engine\u4f7f\u7528\u306b\u304a\u3051\u308b\u524d\u51e6\u7406\u306e\u4e00\u3064\u3068\u3057\u3066\u3001\u5546\u54c1\u30ec\u30d3\u30e5\u30fc\u30c7\u30fc\u30bf\u304b\u3089TF-IDF\u30d9\u30af\u30c8\u30eb\u3092\u62bd\u51fa\u3057\u3001JSON\u30d5\u30a1\u30a4\u30eb\u306b\u66f8\u304d\u51fa\u3059\u65b9\u6cd5\u306b\u3064\u3044\u3066\u89e3\u8aac\u3057\u3066\u3044\u307e\u3059\u3002<\/p>\n\n\n<h2>\u306f\u3058\u3081\u306b<\/h2>\n<span style=\"font-family: arial, helvetica, sans-serif\">\u3053\u306e\u8a18\u4e8b\u3067\u306f\u3001Amazon\u306e\u5546\u54c1\u30ec\u30d3\u30e5\u30fc\u30c7\u30fc\u30bf\u304b\u3089TF-IDF\u30d9\u30af\u30c8\u30eb\u3092\u62bd\u51fa\u3057\u3001\u305d\u308c\u3089\u3092JSON\u30d5\u30a1\u30a4\u30eb\u306b\u66f8\u304d\u51fa\u3059\u65b9\u6cd5\u3092\u7d39\u4ecb\u3057\u307e\u3059\u3002\u3053\u306e\u65b9\u6cd5\u306f\u3001\u30ec\u30d3\u30e5\u30fc\u30c7\u30fc\u30bf\u3092\u7279\u5fb4\u30d9\u30af\u30c8\u30eb\u3068\u3057\u3066\u8868\u73fe\u3057\u3001\u305d\u308c\u3092Vertex AI\u00a0Matching 
Engine\u3067\u4f7f\u7528\u3059\u308b\u305f\u3081\u306e\u4e00\u4f8b\u3067\u3059\u3002<\/span>\n\n<h2>\u5fc5\u8981\u306a\u30e9\u30a4\u30d6\u30e9\u30ea\u306e\u30a4\u30f3\u30dd\u30fc\u30c8<\/h2>\n<span style=\"font-family: arial, helvetica, sans-serif\">\u307e\u305a\u306f\u5fc5\u8981\u306a\u30e9\u30a4\u30d6\u30e9\u30ea\u3092\u30a4\u30f3\u30dd\u30fc\u30c8\u3057\u307e\u3059\u3002<\/span>\r\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code>import json\r\nfrom sklearn.feature_extraction.text import TfidfVectorizer\r\n<\/code><\/pre>\r\n<\/div>\n\n<h2>\u30ec\u30d3\u30e5\u30fc\u30c7\u30fc\u30bf\u306e\u8aad\u307f\u8fbc\u307f<\/h2>\n<span style=\"font-family: arial, helvetica, sans-serif\">\u6b21\u306b\u3001<span data-token-index=\"1\" class=\"discussion-level-1 discussion-id-c7b3055f-dc51-4670-b285-7f90f3f73747 notion-enable-hover\">Kaggle\u3067\u516c\u958b\u3055\u308c\u3066\u3044\u308b<\/span><\/span><span style=\"font-family: arial, helvetica, sans-serif;font-size: 16px\"><a href=\"https:\/\/www.kaggle.com\/datasets\/lokeshparab\/amazon-products-dataset\">Amazon Products Sales Dataset 2023<\/a>\u304b\u3089\u3001\u4efb\u610f\u306e\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3092\u53d6\u5f97\u3057\u3001<\/span><span style=\"font-family: arial, helvetica, sans-serif\"><span data-token-index=\"1\" class=\"discussion-level-1 discussion-id-c7b3055f-dc51-4670-b285-7f90f3f73747 notion-enable-hover\">amazon.json\u3068\u3057\u3066<\/span>\u30ec\u30d3\u30e5\u30fc\u30c7\u30fc\u30bf\u3092\u8aad\u307f\u8fbc\u307f\u307e\u3059\u3002\u3053\u306e\u30d5\u30a1\u30a4\u30eb\u306f\u5404\u30ec\u30d3\u30e5\u30fc\u306e\u60c5\u5831\u3092\u542b\u3080JSON\u30aa\u30d6\u30b8\u30a7\u30af\u30c8\u306e\u30ea\u30b9\u30c8\u3067\u3059\u3002<\/span>\r\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code>with open(\"amazon.json\", \"r\") as f:\r\n    amazon_records = [json.loads(line) for line in 
f]\r\n<\/code><\/pre>\r\n<\/div>\n\n<h2>\u57cb\u3081\u8fbc\u307f\u306e\u62bd\u51fa<\/h2>\n<span style=\"font-family: arial, helvetica, sans-serif\">\u8aad\u307f\u8fbc\u3093\u3060\u30c7\u30fc\u30bf\u304b\u3089\u5404\u30ec\u30d3\u30e5\u30fc\u306e\u57cb\u3081\u8fbc\u307f\uff08\u7279\u5fb4\u30d9\u30af\u30c8\u30eb\uff09\u3092\u62bd\u51fa\u3057\u307e\u3059\u3002<\/span>\r\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code>embeddings = [record[0][\"embedding\"] for record in amazon_records]\r\n<\/code><\/pre>\r\n<\/div>\n\n<h2>TF-IDF\u30d9\u30af\u30c8\u30eb\u306e\u8a08\u7b97<\/h2>\n<span style=\"font-family: arial, helvetica, sans-serif\">TF-IDF\u30d9\u30af\u30c8\u30eb\u5316\u5668\u3092\u521d\u671f\u5316\u3057\u3001\u57cb\u3081\u8fbc\u307f\u304b\u3089TF-IDF\u30d9\u30af\u30c8\u30eb\u3092\u8a08\u7b97\u3057\u307e\u3059\u3002<\/span>\r\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code>vectorizer = TfidfVectorizer()\r\ntfidf_vectors = vectorizer.fit_transform(embeddings)\r\n<\/code><\/pre>\r\n<\/div>\n\n<h2>\u65b0\u3057\u3044JSON\u30d5\u30a1\u30a4\u30eb\u306e\u4f5c\u6210<\/h2>\n<span style=\"font-family: arial, helvetica, sans-serif\">\u6700\u5f8c\u306b\u3001\u751f\u6210\u3057\u305fTF-IDF\u30d9\u30af\u30c8\u30eb\u3068\u305d\u308c\u306b\u5bfe\u5fdc\u3059\u308b\u30ec\u30d3\u30e5\u30fc\u306eID\u3092\u542b\u3080\u65b0\u3057\u3044JSON\u30d5\u30a1\u30a4\u30eb\u3092\u4f5c\u6210\u3057\u307e\u3059\u3002<\/span>\r\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code>with open(\"amazon_embeddings.json\", \"w\") as f:\r\n    for i, vector in enumerate(tfidf_vectors.toarray()):\r\n        json_record = {\"id\": str(i), \"embedding\": vector.tolist()}\r\n        f.write(json.dumps(json_record))\r\n        f.write(\"\\n\")\r\n<\/code><\/pre>\r\n<\/div>\r\n<span style=\"font-family: 
arial, helvetica, sans-serif\">&#8220;amazon_embeddings.json&#8221;\u30d5\u30a1\u30a4\u30eb\u304c\u751f\u6210\u3055\u308c\u307e\u3059\u3002\u3053\u306e\u30d5\u30a1\u30a4\u30eb\u306f\u5404\u30ec\u30d3\u30e5\u30fc\u306eTF-IDF\u30d9\u30af\u30c8\u30eb\u3092\u542b\u307f\u3001Vertex AI Matching Engine\u3067\u4f7f\u7528\u3067\u304d\u307e\u3059\u3002<\/span>\r\n<div class=\"hcb_wrap\">\r\n<pre class=\"prism line-numbers lang-python\" data-lang=\"Python\"><code>{\"id\": \"0\", \"embedding\": [0.9437977967287928, 0.5667641899240523, ...]}<\/code><\/pre>\r\n<\/div>\n\n<h2>\u6700\u5f8c\u306b<\/h2>\n<span style=\"font-family: arial, helvetica, sans-serif\">\u3053\u306e\u624b\u9806\u306fJSON\u30c7\u30fc\u30bf\u5f62\u5f0f\u306b\u57fa\u3065\u3044\u3066\u3044\u307e\u3059\u3002\u4ed6\u306e\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u3067\u4f7f\u7528\u3059\u308b\u5834\u5408\u3001\u30c7\u30fc\u30bf\u5f62\u5f0f\u306b\u5408\u308f\u305b\u3066\u30b3\u30fc\u30c9\u3092\u8abf\u6574\u3059\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002\u307e\u305f\u3001TF-IDF\u30d9\u30af\u30c8\u30eb\u5316\u306e\u8a2d\u5b9a\u3082\u3001\u5177\u4f53\u7684\u306a\u30bf\u30b9\u30af\u3084\u30c7\u30fc\u30bf\u30bb\u30c3\u30c8\u306b\u5fdc\u3058\u3066\u8abf\u6574\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002<\/span>","protected":false},"excerpt":{"rendered":"<p>Vertex AI\u00a0Matching Engine\u4f7f\u7528\u306b\u304a\u3051\u308b\u524d\u51e6\u7406\u306e\u4e00\u3064\u3068\u3057\u3066\u3001\u5546\u54c1\u30ec\u30d3\u30e5\u30fc\u30c7\u30fc\u30bf\u304b\u3089TF-IDF\u30d9\u30af\u30c8\u30eb\u3092\u62bd\u51fa\u3057\u3001JSON\u30d5\u30a1\u30a4\u30eb\u306b\u66f8\u304d\u51fa\u3059\u65b9\u6cd5\u306b\u3064\u3044\u3066\u89e3\u8aac\u3057\u3066\u3044\u307e\u3059\u3002 \u306f\u3058\u3081\u306b \u3053\u306e\u8a18\u4e8b\u3067\u306f\u3001Ama 
[&hellip;]<\/p>\n","protected":false},"author":87,"featured_media":2991,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"content-type":"","swell_btn_cv_data":"","footnotes":"","_wp_rev_ctl_limit":""},"categories":[1249],"tags":[96,224,469],"class_list":["post-3632","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-knowledge","tag-ai","tag-vertex-ai-matching-engine","tag-vertexai"],"_links":{"self":[{"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/posts\/3632","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/users\/87"}],"replies":[{"embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/comments?post=3632"}],"version-history":[{"count":0,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/posts\/3632\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/media\/2991"}],"wp:attachment":[{"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/media?parent=3632"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/categories?post=3632"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/since2020.jp\/media\/wp-json\/wp\/v2\/tags?post=3632"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}