import argparse
import re
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def load_data(path: Path) -> pd.DataFrame:
    """Load a listing table from a CSV or Excel file.

    CSV files are decoded by trying UTF-8, UTF-8 with BOM, GBK and GB18030
    in that order; the first encoding that decodes cleanly wins.

    Args:
        path: Location of the input file. The suffix (case-insensitive)
            selects the reader: ``.csv``, ``.xlsx`` or ``.xls``.

    Returns:
        The file contents as a DataFrame.

    Raises:
        ValueError: If the suffix is unsupported, or if none of the candidate
            encodings can decode the CSV (the last decode error is chained).
        OSError: Propagated unchanged when the file is missing or unreadable.
    """
    suffix = path.suffix.lower()
    if suffix == '.csv':
        last_error = None
        for enc in ('utf-8', 'utf-8-sig', 'gbk', 'gb18030'):
            try:
                return pd.read_csv(path, encoding=enc)
            except UnicodeError as err:
                # Only a decoding failure justifies trying the next encoding;
                # missing files or malformed CSVs would fail identically on
                # every attempt, so those are allowed to propagate as-is.
                last_error = err
        raise ValueError(f'Cannot read CSV: {path}') from last_error
    if suffix in {'.xlsx', '.xls'}:
        return pd.read_excel(path)
    raise ValueError(f'Unsupported file type: {path}')


def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with known English headers renamed to Chinese.

    Columns that are not listed in the mapping keep their names; the input
    frame itself is not modified.
    """
    english_to_chinese = dict(
        area='面积',
        description='房源描述',
        title='标题',
        location='地点',
        house_type='房屋类型',
        house_code='房源编号',
        price='价格',
        tags='房源标签',
        lease='租赁方式',
        orientation='朝向',
        floor='楼层',
        elevator='电梯',
        stall='车位',
        water='用水',
        electricity='用电',
        fuel_gas='燃气',
        heating='采暖',
        facility='配套设施',
    )
    renamed = df.rename(columns=english_to_chinese)
    return renamed


def extract_numeric(row: pd.Series) -> pd.Series:
    """Replace the area, floor and price fields of ``row`` with integers.

    Each field is converted to text and its first run of digits is taken as
    the value; a field with no digits (or a missing field) becomes NaN.
    The row is modified in place and also returned.
    """
    targets = ('面积', '楼层', '价格')

    # Read every field before writing any of them back.
    first_numbers = []
    for column in targets:
        hit = re.search(r'\d+', str(row.get(column, '')))
        first_numbers.append(np.nan if hit is None else int(hit.group()))

    for column, number in zip(targets, first_numbers):
        row[column] = number
    return row


def extract_house_type(row: pd.Series) -> pd.Series:
    """Add room, hall and bathroom counts parsed from the house-type text.

    The text in '房屋类型' is searched for a number directly followed by
    '室', '厅' and '卫'. Each count is stored under that single-character
    key; a unit that does not appear yields NaN. The row is modified in
    place and also returned.
    """
    layout = str(row.get('房屋类型', ''))

    def leading_count(pattern):
        # First "<digits><unit>" occurrence, reduced to its digits.
        hits = re.findall(pattern, layout)
        if not hits:
            return np.nan
        return int(re.findall(r'\d+', hits[0])[0])

    rooms = leading_count(r'\d+室')
    halls = leading_count(r'\d+厅')
    toilets = leading_count(r'\d+卫')

    row['室'] = rooms
    row['厅'] = halls
    row['卫'] = toilets
    return row


def expand_facility(row: pd.Series, facility_columns) -> pd.Series:
    """Set indicator fields to 1 for every facility listed on the row.

    The '配套设施' field is treated as a comma-separated list. Each entry,
    with surrounding whitespace removed, that appears in
    ``facility_columns`` has its field on the row set to 1. Rows whose
    facility field is missing or NaN are returned untouched. The row is
    modified in place and also returned.
    """
    listed = row.get('配套设施', np.nan)
    if pd.isna(listed):
        return row

    for name in map(str.strip, str(listed).split(',')):
        if name in facility_columns:
            row[name] = 1
    return row


def geocode_baidu(address: str, ak: str, city: str = '上海'):
    """Resolve an address to coordinates with the Baidu geocoding v3 service.

    Args:
        address: Free-text address to look up.
        ak: Baidu API key.
        city: City sent along with the request; defaults to '上海'.

    Returns:
        A ``(lat, lng)`` tuple taken from ``result.location`` in the response.

    Raises:
        requests.HTTPError: If the HTTP response status indicates an error.
        ValueError: If the response body reports a non-zero ``status``.
    """
    # HTTPS keeps the API key out of plaintext traffic.
    url = 'https://api.map.baidu.com/geocoding/v3/'
    # Passing params lets requests URL-encode every value, including the key.
    params = {'address': address, 'output': 'json', 'ak': ak, 'city': city}
    resp = requests.get(url, params=params, timeout=10)
    resp.raise_for_status()
    data = resp.json()
    if data.get('status') != 0:
        raise ValueError(f'Baidu geocoding failed: {data}')
    loc = data['result']['location']
    return loc['lat'], loc['lng']


def maybe_add_lat_lng(df: pd.DataFrame, use_geocode: bool, baidu_ak: str | None, cache_path: Path):
    if not use_geocode:
        return df
    if not baidu_ak:
        raise ValueError('--use-geocode 时必须提供 --baidu-ak')

    cache = {}
    if cache_path.exists():
        cache_df = pd.read_csv(cache_path)
        cache = {r['地点']: (r['纬度'], r['经度']) for _, r in cache_df.iterrows()}

    lats, lngs, updated = [], [], False
    for address in df['地点'].astype(str):
        if address in cache:
            lat, lng = cache[address]
        else:
            lat, lng = geocode_baidu(address, baidu_ak)
            cache[address] = (lat, lng)
            updated = True
        lats.append(lat)
        lngs.append(lng)

    df['纬度'] = lats
    df['经度'] = lngs

    if updated:
        cache_out = pd.DataFrame([{'地点': k, '纬度': v[0], '经度': v[1]} for k, v in cache.items()])
        cache_out.to_csv(cache_path, index=False)
    return df


def compatible_onehot_fit_transform(df: pd.DataFrame):
    """One-hot encode ``df`` into a dense array and return it with the encoder.

    The encoder is first built with the ``sparse_output`` keyword; if the
    constructor rejects it with ``TypeError``, the ``sparse`` keyword is used
    instead. Unknown categories are ignored in either case.

    Returns:
        A ``(encoded, encoder)`` tuple: the transformed data and the fitted
        ``OneHotEncoder``.
    """
    shared_options = {'handle_unknown': 'ignore'}
    try:
        encoder = OneHotEncoder(sparse_output=False, **shared_options)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **shared_options)
    encoded = encoder.fit_transform(df)
    return encoded, encoder


def _deduplicate(df: pd.DataFrame, mode: str) -> pd.DataFrame:
    """Drop duplicate listings according to ``mode`` and print the key used."""
    code_key = ['房源编号']
    fallback_key = ['标题', '地点', '价格']

    if mode == 'house_code':
        if '房源编号' not in df.columns or df['房源编号'].isna().all():
            raise ValueError('房源编号列为空，不能按 house_code 去重')
        print('dedup by 房源编号')
        return df.drop_duplicates(subset=code_key)
    if mode == 'title_location_price':
        print('dedup by 标题+地点+价格')
        return df.drop_duplicates(subset=fallback_key)
    if mode == 'auto':
        # Prefer the listing code when at least one row actually has one.
        if '房源编号' in df.columns and not df['房源编号'].fillna('').eq('').all():
            print('dedup by 房源编号')
            return df.drop_duplicates(subset=code_key)
        print('dedup by 标题+地点+价格 (fallback)')
        return df.drop_duplicates(subset=fallback_key)
    print('skip dedup')
    return df


def main():
    """Command-line entry point: clean the listings and save a train/test split.

    Steps: load and rename columns, drop repeated header rows, deduplicate,
    parse numeric fields, expand facility flags, optionally geocode, keep
    listings priced 3000-12000, build the feature matrix, split it 80/20
    stratified by area band, and write the ``.npz`` arrays plus the prepared
    table as CSV.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', required=True)
    parser.add_argument('--out', default='rent_prepared.npz')
    parser.add_argument('--prepared-csv', default='rent_prepared_table.csv')
    parser.add_argument('--use-geocode', action='store_true')
    parser.add_argument('--baidu-ak', default=None)
    parser.add_argument('--cache', default='geocode_cache.csv')
    parser.add_argument('--dedup-mode', default='auto', choices=['auto', 'house_code', 'title_location_price', 'none'])
    args = parser.parse_args()

    df = load_data(Path(args.data))
    print(f'raw rows: {len(df)}')
    df = rename_columns(df)
    if '房屋类型' in df.columns:
        # Rows whose house type equals the literal header text are repeated
        # header lines inside the data, not listings.
        df = df[df['房屋类型'] != 'house_type'].copy()

    df = _deduplicate(df, args.dedup_mode)
    print(f'rows after dedup: {len(df)}')

    df = df.apply(extract_numeric, axis=1)
    df = df.apply(extract_house_type, axis=1)
    # The district is the part of the location before the first '-'.
    df['区'] = df['地点'].astype(str).map(lambda place: place.split('-', 1)[0])

    facility_columns = ['洗衣机', '空调', '衣柜', '电视', '冰箱', '热水器', '床', '暖气', '宽带', '天然气']
    for col in facility_columns:
        df[col] = 0
    df = df.apply(lambda row: expand_facility(row, facility_columns), axis=1)

    df = maybe_add_lat_lng(df, args.use_geocode, args.baidu_ak, Path(args.cache))

    model_data = df[(df['价格'] >= 3000) & (df['价格'] <= 12000)].copy()
    print(f'model rows: {len(model_data)}')

    # Assign area bands and drop rows without one BEFORE building X and y, so
    # that the positional indices produced by the splitter address the same
    # rows in the features, the targets and the strata.
    model_data['area_cut'] = pd.cut(model_data['面积'], [0, 50, 100, 150, np.inf], labels=False)
    has_band = model_data['area_cut'].notna()
    if not has_band.all():
        print(f'dropped rows without a valid area band: {int((~has_band).sum())}')
    model_data = model_data[has_band].copy()
    model_data['area_cut'] = model_data['area_cut'].astype(int)

    numeric_cols = ['面积', '室', '厅', '卫', '楼层']
    if args.use_geocode:
        numeric_cols += ['经度', '纬度']

    category_cols = [
        '区', '朝向', '租赁方式', '燃气', '采暖', '用水', '用电', '电梯', '车位',
    ] + facility_columns

    # A missing tag column is treated as "no tags" for every listing.
    if '房源标签' in model_data.columns:
        model_data['房源标签'] = model_data['房源标签'].fillna('')
    else:
        model_data['房源标签'] = ''

    # NOTE(review): the scaler, encoder and TF-IDF vocabulary are fitted on all
    # rows before the split, so test rows influence the transforms - confirm
    # this is acceptable for the downstream evaluation.
    scaler = StandardScaler()
    X_num = scaler.fit_transform(model_data[numeric_cols])
    X_cat, _ = compatible_onehot_fit_transform(model_data[category_cols])
    tag_docs = model_data['房源标签'].map(lambda x: ' '.join(str(x).split(',')))
    try:
        X_text = TfidfVectorizer().fit_transform(tag_docs).toarray()
    except ValueError:
        # Raised when no document yields a token (e.g. no listing has tags);
        # contribute zero text features instead of aborting.
        X_text = np.zeros((len(model_data), 0))
    X = np.concatenate([X_num, X_cat, X_text], axis=1)
    X = np.nan_to_num(X)
    y = model_data['价格'].to_numpy() / 10000.0

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=27)
    train_idx, test_idx = next(sss.split(model_data, model_data['area_cut']))
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    np.savez_compressed(args.out, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
    model_data.to_csv(args.prepared_csv, index=False)
    print(f'saved npz: {args.out}')
    print(f'saved prepared table: {args.prepared_csv}')
    print(f'X_train: {X_train.shape}, X_test: {X_test.shape}')


# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
