File size: 14,538 Bytes

1816c50

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e59ad5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "\n",
    "# Path to the full Yelp review dataset file (one JSON object per line)\n",
    "full_data_path = \"yelp_academic_dataset_review.json\"\n",
    "\n",
    "# Path where the sampled subset is written\n",
    "sampled_data_path = \"yelp_academic_dataset_review_sampled.json\"\n",
    "\n",
    "# Number of reviews to keep (adjust as needed)\n",
    "num_reviews_to_sample = 10000  # Example: Sample 10,000 reviews\n",
    "\n",
    "# Fixed seed so that Restart & Run All draws the same sample every time\n",
    "random_seed = 42\n",
    "rng = random.Random(random_seed)\n",
    "\n",
    "# Reservoir sampling (Algorithm R): stream the file once and keep at most\n",
    "# num_reviews_to_sample raw lines in memory, instead of parsing and holding\n",
    "# every review of the full dataset at the same time.\n",
    "reservoir = []\n",
    "num_reviews_seen = 0\n",
    "with open(full_data_path, \"r\", encoding=\"utf-8\") as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # tolerate blank lines such as a trailing newline\n",
    "        num_reviews_seen += 1\n",
    "        if len(reservoir) < num_reviews_to_sample:\n",
    "            reservoir.append(line)\n",
    "        else:\n",
    "            # Keep the new line with probability num_reviews_to_sample / num_reviews_seen\n",
    "            slot = rng.randrange(num_reviews_seen)\n",
    "            if slot < num_reviews_to_sample:\n",
    "                reservoir[slot] = line\n",
    "\n",
    "# The reservoir keeps early lines in file order; shuffle so the sample order is random\n",
    "rng.shuffle(reservoir)\n",
    "\n",
    "# Parse only the lines that were actually sampled\n",
    "sampled_reviews = [json.loads(line) for line in reservoir]\n",
    "\n",
    "# A file shorter than the requested size yields every review instead of raising\n",
    "if len(sampled_reviews) < num_reviews_to_sample:\n",
    "    print(f\"Warning: requested {num_reviews_to_sample} reviews but the file only contains {num_reviews_seen}\")\n",
    "\n",
    "# Save the sampled reviews to a new JSON-lines file\n",
    "with open(sampled_data_path, \"w\", encoding=\"utf-8\") as f:\n",
    "    for review in sampled_reviews:\n",
    "        json.dump(review, f)\n",
    "        f.write(\"\\n\")\n",
    "\n",
    "# Report the number actually written, which may be below the requested size\n",
    "print(f\"Sampled {len(sampled_reviews)} reviews and saved to {sampled_data_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f562ff04",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "\n",
    "# Destination of the gzip-compressed copy of the sampled dataset\n",
    "compressed_data_path = \"yelp_academic_dataset_review_sampled.json.gz\"\n",
    "\n",
    "# Stream the sampled file into the gzip archive one line at a time\n",
    "with open(sampled_data_path, \"rb\") as raw_file, gzip.open(compressed_data_path, \"wb\") as gz_file:\n",
    "    for chunk in raw_file:\n",
    "        gz_file.write(chunk)\n",
    "\n",
    "print(f\"Compressed file saved to {compressed_data_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "337f6649",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import classification_report, accuracy_score\n",
    "\n",
    "# Read the sampled, gzip-compressed reviews; the file is parsed as one JSON record per line\n",
    "data_path = \"yelp_academic_dataset_review_sampled.json.gz\"  # Adjust the path\n",
    "data = pd.read_json(\n",
    "    data_path,\n",
    "    lines=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e0936968",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>review_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>business_id</th>\n",
       "      <th>stars</th>\n",
       "      <th>useful</th>\n",
       "      <th>funny</th>\n",
       "      <th>cool</th>\n",
       "      <th>text</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>f9khuhJxadQhg6CaI1cRdA</td>\n",
       "      <td>4Qijwb2RDiUGc4SBjA2lJg</td>\n",
       "      <td>nTBStZYJfHGdSZJbpaBiPA</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>I had read about this place adding a second lo...</td>\n",
       "      <td>2011-02-08 17:48:40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>WH0c1wEMu4XRTIysI7uMig</td>\n",
       "      <td>7JeW4Mlvqdp7R-FAUBB_vA</td>\n",
       "      <td>H3Tmgv94pbGvBIKZ4Rs9Cw</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>I had dinner at Tin Angel on Saturday and was ...</td>\n",
       "      <td>2012-04-16 13:30:02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>S1Lg07IGrupUDk7Uu9rnQQ</td>\n",
       "      <td>umUy5DTpVrvQDXLR4gywHA</td>\n",
       "      <td>H7BikysfQbS9bMULQsCU_Q</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>I was really excited to visit the store, havin...</td>\n",
       "      <td>2019-10-05 00:17:15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>AH4_Pua0yzK4oU9FoU8hXQ</td>\n",
       "      <td>uwYw0KKj16lC_nq_HsQGVQ</td>\n",
       "      <td>Xb6QfBbleg2aJT2cG807jQ</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>I hired Two Men and a Truck for my recent move...</td>\n",
       "      <td>2016-06-02 13:27:24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9_CIDS98p6ZsTRiCvmuIKA</td>\n",
       "      <td>l9bVKgzvjjcU8Iang3Tvtg</td>\n",
       "      <td>lqSJkyNSE1yPeux4PoR-pg</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>i was very disappointed to this company. They ...</td>\n",
       "      <td>2020-06-05 22:28:47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>5MknizHCBH3jpj5DJd-6Uw</td>\n",
       "      <td>d2VrfngFJ1f1nvNAsojJzw</td>\n",
       "      <td>hy-E7DdXbdgTbwphKUYW1w</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>This was such a trash experience. We signed up...</td>\n",
       "      <td>2021-07-29 16:10:10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9996</th>\n",
       "      <td>mXFlaWuiCnyCkZ_SIAGqew</td>\n",
       "      <td>cHWDGVf4LofBk9wZ2mnXQQ</td>\n",
       "      <td>AYWSFv6QxF5IjQSxITMUug</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>I have been going to Goshen Nail Salon for the...</td>\n",
       "      <td>2018-03-16 00:30:50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9997</th>\n",
       "      <td>W1Ij-zC3ufRU5MTEgHLjmg</td>\n",
       "      <td>aN9nWudz5rfar7rHr9lHfA</td>\n",
       "      <td>oyJ3gXNkV0DO0YxcaTgtTg</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Ok. This place surprised me. I always thought ...</td>\n",
       "      <td>2018-06-01 23:56:44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9998</th>\n",
       "      <td>HNejB5H9iD1qe3MMKxg6sg</td>\n",
       "      <td>6JejVLZl5M-IB3UkNTkXtQ</td>\n",
       "      <td>WJLKQTduGumxjlXelqiuKg</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Meets expectations, but quirky.  The trucks re...</td>\n",
       "      <td>2016-06-29 15:57:34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9999</th>\n",
       "      <td>LSJGzHJ7whqNn5uPxidMjQ</td>\n",
       "      <td>_Av1LaAAY0Y8YcPp7Ck7fg</td>\n",
       "      <td>M983OPfVRnwvG7zEOzykCA</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Jordan was our waiter. He was very attentive a...</td>\n",
       "      <td>2017-03-15 23:54:07</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                   review_id                 user_id             business_id  \\\n",
       "0     f9khuhJxadQhg6CaI1cRdA  4Qijwb2RDiUGc4SBjA2lJg  nTBStZYJfHGdSZJbpaBiPA   \n",
       "1     WH0c1wEMu4XRTIysI7uMig  7JeW4Mlvqdp7R-FAUBB_vA  H3Tmgv94pbGvBIKZ4Rs9Cw   \n",
       "2     S1Lg07IGrupUDk7Uu9rnQQ  umUy5DTpVrvQDXLR4gywHA  H7BikysfQbS9bMULQsCU_Q   \n",
       "3     AH4_Pua0yzK4oU9FoU8hXQ  uwYw0KKj16lC_nq_HsQGVQ  Xb6QfBbleg2aJT2cG807jQ   \n",
       "4     9_CIDS98p6ZsTRiCvmuIKA  l9bVKgzvjjcU8Iang3Tvtg  lqSJkyNSE1yPeux4PoR-pg   \n",
       "...                      ...                     ...                     ...   \n",
       "9995  5MknizHCBH3jpj5DJd-6Uw  d2VrfngFJ1f1nvNAsojJzw  hy-E7DdXbdgTbwphKUYW1w   \n",
       "9996  mXFlaWuiCnyCkZ_SIAGqew  cHWDGVf4LofBk9wZ2mnXQQ  AYWSFv6QxF5IjQSxITMUug   \n",
       "9997  W1Ij-zC3ufRU5MTEgHLjmg  aN9nWudz5rfar7rHr9lHfA  oyJ3gXNkV0DO0YxcaTgtTg   \n",
       "9998  HNejB5H9iD1qe3MMKxg6sg  6JejVLZl5M-IB3UkNTkXtQ  WJLKQTduGumxjlXelqiuKg   \n",
       "9999  LSJGzHJ7whqNn5uPxidMjQ  _Av1LaAAY0Y8YcPp7Ck7fg  M983OPfVRnwvG7zEOzykCA   \n",
       "\n",
       "      stars  useful  funny  cool  \\\n",
       "0         4       1      0     1   \n",
       "1         5       1      0     1   \n",
       "2         2       4      1     0   \n",
       "3         1       1      0     0   \n",
       "4         1       0      0     0   \n",
       "...     ...     ...    ...   ...   \n",
       "9995      1       1      0     0   \n",
       "9996      5       0      0     0   \n",
       "9997      5       0      0     0   \n",
       "9998      3       0      0     0   \n",
       "9999      5       0      0     0   \n",
       "\n",
       "                                                   text                date  \n",
       "0     I had read about this place adding a second lo... 2011-02-08 17:48:40  \n",
       "1     I had dinner at Tin Angel on Saturday and was ... 2012-04-16 13:30:02  \n",
       "2     I was really excited to visit the store, havin... 2019-10-05 00:17:15  \n",
       "3     I hired Two Men and a Truck for my recent move... 2016-06-02 13:27:24  \n",
       "4     i was very disappointed to this company. They ... 2020-06-05 22:28:47  \n",
       "...                                                 ...                 ...  \n",
       "9995  This was such a trash experience. We signed up... 2021-07-29 16:10:10  \n",
       "9996  I have been going to Goshen Nail Salon for the... 2018-03-16 00:30:50  \n",
       "9997  Ok. This place surprised me. I always thought ... 2018-06-01 23:56:44  \n",
       "9998  Meets expectations, but quirky.  The trucks re... 2016-06-29 15:57:34  \n",
       "9999  Jordan was our waiter. He was very attentive a... 2017-03-15 23:54:07  \n",
       "\n",
       "[10000 rows x 9 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the loaded reviews; pandas abbreviates the rendering of a frame this large\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "466ef010",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Translate a star rating into a coarse sentiment label\n",
    "def map_sentiment(stars):\n",
    "    \"\"\"Return the sentiment label for a star rating.\n",
    "\n",
    "    Ratings of 2 or lower are \"negative\", ratings of 4 or higher are\n",
    "    \"positive\", and every other value is \"neutral\".\n",
    "    \"\"\"\n",
    "    if stars <= 2:\n",
    "        return \"negative\"\n",
    "    if stars >= 4:\n",
    "        return \"positive\"\n",
    "    return \"neutral\"\n",
    "\n",
    "# Label every review by running its star rating through map_sentiment\n",
    "data[\"sentiment\"] = data[\"stars\"].map(map_sentiment)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "756b3285",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The 'sentiment' column is added to `data` right after map_sentiment is defined,\n",
    "# so it is not recomputed here; fail loudly instead of writing unlabeled CSVs.\n",
    "assert \"sentiment\" in data.columns, \"run the sentiment-mapping cell before splitting\"\n",
    "\n",
    "# Split the data into training and testing sets (fixed random_state for a reproducible split)\n",
    "train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)\n",
    "\n",
    "# Sanity check: the split must neither drop nor duplicate rows\n",
    "assert len(train_data) + len(test_data) == len(data)\n",
    "\n",
    "# Save the preprocessed data\n",
    "train_data.to_csv(\"preprocessed_train_data.csv\", index=False)\n",
    "test_data.to_csv(\"preprocessed_test_data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7257dd9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explicit %pip magic: does not depend on IPython's automagic setting and installs into the running kernel's environment\n",
    "%pip install torch\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f03a2ad5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explicit %pip magic: does not depend on IPython's automagic setting and installs into the running kernel's environment\n",
    "%pip install transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecdcf9c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import torch\n",
    "from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments\n",
    "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
    "\n",
    "# Reload the train/test splits from their CSV files (adjust the paths as needed)\n",
    "train_data, test_data = (\n",
    "    pd.read_csv(csv_path)\n",
    "    for csv_path in (\"preprocessed_train_data.csv\", \"preprocessed_test_data.csv\")\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c83718d7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}