
Commit 3812b4d

Merge pull request #9 from soda-inria/data
New method to load data and new release 0.0.23
2 parents 1402900 + 2ab5b84 commit 3812b4d

138 files changed: +18567 / -1 lines changed


build/lib/carte_ai/__init__.py (+5)

@@ -0,0 +1,5 @@
from carte_ai.src import *
from carte_ai.configs import *
from carte_ai.data import *
from carte_ai.scripts import *
from .src import CARTERegressor, CARTEClassifier, Table2GraphTransformer
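The package-level __init__ re-exports the main estimators and the graph transformer, so downstream code can import them directly from carte_ai rather than from the src subpackage. A minimal sketch of what that enables (hypothetical usage, not part of this commit; constructor arguments are omitted because they are not shown in the diff):

# Sketch: the top-level re-exports make the public classes importable in one line.
from carte_ai import CARTERegressor, CARTEClassifier, Table2GraphTransformer

print(CARTERegressor, CARTEClassifier, Table2GraphTransformer)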
+4

@@ -0,0 +1,4 @@
from carte_ai.configs.carte_configs import *
from carte_ai.configs.directory import *
from carte_ai.configs.model_parameters import *
from carte_ai.configs.visuailization import *
+166

@@ -0,0 +1,166 @@
"""Specific configurations for the CARTE paper."""

## Dataset names
carte_datalist = [
    "anime_planet",
    "babies_r_us",
    "beer_ratings",
    "bikedekho",
    "bikewale",
    "buy_buy_baby",
    "cardekho",
    "chocolate_bar_ratings",
    "clear_corpus",
    "coffee_ratings",
    "company_employees",
    "employee_remuneration",
    "employee_salaries",
    "fifa22_players",
    "filmtv_movies",
    "journal_jcr",
    "journal_sjr",
    "jp_anime",
    "k_drama",
    "michelin",
    "mlds_salaries",
    "movies",
    "museums",
    "mydramalist",
    "nba_draft",
    "prescription_drugs",
    "ramen_ratings",
    "roger_ebert",
    "rotten_tomatoes",
    "spotify",
    "us_accidents_counts",
    "us_accidents_severity",
    "us_presidential",
    "used_cars_24",
    "used_cars_benz_italy",
    "used_cars_dot_com",
    "used_cars_pakistan",
    "used_cars_saudi_arabia",
    "videogame_sales",
    "whisky",
    "wikiliq_beer",
    "wikiliq_spirit",
    "wina_pl",
    "wine_dot_com_prices",
    "wine_dot_com_ratings",
    "wine_enthusiasts_prices",
    "wine_enthusiasts_ratings",
    "wine_vivino_price",
    "wine_vivino_rating",
    "yelp",
    "zomato",
]

## Dictionary of baseline methods
carte_singletable_baselines = dict()
carte_singletable_baselines["full"] = [
    "carte-gnn",
    "catboost",
    "sentence-llm-concat-num_histgb",
    "sentence-llm-concat-num_xgb",
    "sentence-llm-embed-num_histgb",
    "sentence-llm-embed-num_xgb",
    "tablevectorizer-fasttext_histgb",
    "tablevectorizer-fasttext_xgb",
    "tablevectorizer-llm_histgb",
    "tablevectorizer-llm_xgb",
    "tablevectorizer_histgb",
    "tablevectorizer_logistic",
    "tablevectorizer_mlp",
    "tablevectorizer_randomforest",
    "tablevectorizer_resnet",
    "tablevectorizer_ridge",
    "tablevectorizer_xgb",
    "tablevectorizer_tabpfn",
    "target-encoder_histgb",
    "target-encoder_logistic",
    "target-encoder_mlp",
    "target-encoder_randomforest",
    "target-encoder_resnet",
    "target-encoder_ridge",
    "target-encoder_xgb",
    "target-encoder_tabpfn",
]

carte_singletable_baselines["reduced"] = [
    "carte-gnn",
    "catboost",
    "sentence-llm-concat-num_xgb",
    "sentence-llm-embed-num_xgb",
    "tablevectorizer_logistic",
    "tablevectorizer_mlp",
    "tablevectorizer_randomforest",
    "tablevectorizer_resnet",
    "tablevectorizer_ridge",
    "tablevectorizer_xgb",
    "target-encoder_tabpfn",
]

carte_multitable_baselines = [
    "original_carte-multitable",
    "matched_carte-multitable",
    "original_catboost-multitable",
    "matched_catboost-multitable",
    "original-sentence-llm_histgb-multitable",
    "matched-sentence-llm_histgb-multitable",
]


## Dictionary of method mapping
carte_singletable_baseline_mapping = dict()
carte_singletable_baseline_mapping["carte-gnn"] = "CARTE"

# Preprocessings
carte_singletable_baseline_mapping["tablevectorizer_"] = "TabVec-"
carte_singletable_baseline_mapping["tablevectorizer-"] = "TabVec-"
carte_singletable_baseline_mapping["target-encoder_"] = "TarEnc-"
carte_singletable_baseline_mapping["fasttext_"] = "FT-"
carte_singletable_baseline_mapping["llm_"] = "LLM-"
carte_singletable_baseline_mapping["sentence-llm-concat-num_"] = "S-LLM-CN-"
carte_singletable_baseline_mapping["sentence-llm-embed-num_"] = "S-LLM-EN-"

# Estimators
carte_singletable_baseline_mapping["catboost"] = "CatBoost"
carte_singletable_baseline_mapping["xgb"] = "XGB"
carte_singletable_baseline_mapping["histgb"] = "HGB"
carte_singletable_baseline_mapping["randomforest"] = "RF"
carte_singletable_baseline_mapping["ridge"] = "Ridge"
carte_singletable_baseline_mapping["logistic"] = "Logistic"
carte_singletable_baseline_mapping["mlp"] = "MLP"
carte_singletable_baseline_mapping["resnet"] = "ResNet"
carte_singletable_baseline_mapping["tabpfn"] = "TabPFN"

# Bagging
carte_singletable_baseline_mapping["bagging"] = "Bagging"

## Colors for visualization
carte_singletable_color_palette = dict()
carte_singletable_color_palette["CARTE"] = "C3"
carte_singletable_color_palette["CatBoost"] = "C0"
carte_singletable_color_palette["TabVec-XGB"] = "C1"
carte_singletable_color_palette["TabVec-RF"] = "C2"
carte_singletable_color_palette["TabVec-Ridge"] = "C4"
carte_singletable_color_palette["TabVec-Logistic"] = "C5"
carte_singletable_color_palette["S-LLM-CN-XGB"] = "C6"
carte_singletable_color_palette["S-LLM-EN-XGB"] = "C7"
carte_singletable_color_palette["TabVec-ResNet"] = "C8"
carte_singletable_color_palette["TabVec-MLP"] = "C9"
carte_singletable_color_palette["TarEnc-TabPFN"] = "#A9561E"

## Markers for visualization
carte_singletable_markers = dict()
carte_singletable_markers["CARTE"] = "o"
carte_singletable_markers["TabVec-XGB"] = (4, 0, 45)
carte_singletable_markers["TabVec-RF"] = "P"
carte_singletable_markers["CatBoost"] = "X"
carte_singletable_markers["S-LLM-CN-XGB"] = (4, 0, 0)
carte_singletable_markers["S-LLM-EN-XGB"] = "d"
carte_singletable_markers["TabVec-Ridge"] = "v"
carte_singletable_markers["TabVec-Logistic"] = "v"
carte_singletable_markers["TabVec-ResNet"] = "^"
carte_singletable_markers["TabVec-MLP"] = "p"
carte_singletable_markers["TarEnc-TabPFN"] = (5, 1, 0)
+22

@@ -0,0 +1,22 @@
from pathlib import Path

# Get the base path relative to this file's location
base_path = Path(__file__).resolve().parent.parent  # This gives '/home/infres/gbrison/carte/carte_ai'

config_directory = dict()
config_directory["base_path"] = base_path

config_directory["data"] = str(base_path / "data/")
config_directory["pretrained_model"] = str(base_path / "data/etc/kg_pretrained.pt")  # Correct path
config_directory["data_raw"] = str(base_path / "data/data_raw/")
config_directory["data_singletable"] = str(base_path / "data/data_singletable/")
config_directory["data_yago"] = str(base_path / "data/data_yago/")
config_directory["etc"] = str(base_path / "data/etc/")

config_directory["results"] = str(base_path / "results/")
config_directory["compiled_results"] = str(base_path / "results/compiled_results/")
config_directory["visualization"] = str(base_path / "visualization/")

# Specify the directory in which you have downloaded each
config_directory["fasttext"] = str(base_path / "data/etc/cc.en.300.bin")
config_directory["ken_embedding"] = str(base_path / "data/etc/ken_embedding.parquet")
+148

@@ -0,0 +1,148 @@
"""
Parameter distributions for hyperparameter optimization
"""

import numpy as np
from scipy.stats import loguniform, randint, uniform, norm
import copy


class loguniform_int:
    """Integer valued version of the log-uniform distribution"""

    def __init__(self, a, b):
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Random variable sample"""
        return self._distribution.rvs(*args, **kwargs).astype(int)


class norm_int:
    """Integer valued version of the normal distribution"""

    def __init__(self, a, b):
        self._distribution = norm(a, b)

    def rvs(self, *args, **kwargs):
        """Random variable sample"""
        if self._distribution.rvs(*args, **kwargs).astype(int) < 1:
            return 1
        else:
            return self._distribution.rvs(*args, **kwargs).astype(int)


param_distributions_total = dict()

# carte-gnn
param_distributions = dict()
lr_grid = [1e-4, 2.5e-4, 5e-4, 7.5e-4, 1e-3]
param_distributions["learning_rate"] = lr_grid
param_distributions_total["carte-gnn"] = param_distributions

# histgb
param_distributions = dict()
param_distributions["learning_rate"] = loguniform(1e-2, 10)
param_distributions["max_depth"] = [None, 2, 3, 4]
param_distributions["max_leaf_nodes"] = norm_int(31, 5)
param_distributions["min_samples_leaf"] = norm_int(20, 2)
param_distributions["l2_regularization"] = loguniform(1e-6, 1e3)
param_distributions_total["histgb"] = param_distributions

# catboost
param_distributions = dict()
param_distributions["max_depth"] = randint(2, 11)
param_distributions["learning_rate"] = loguniform(1e-5, 1)
param_distributions["bagging_temperature"] = uniform(0, 1)
param_distributions["l2_leaf_reg"] = loguniform(1, 10)
param_distributions["iterations"] = randint(400, 1001)
param_distributions["one_hot_max_size"] = randint(2, 26)
param_distributions_total["catboost"] = param_distributions

# xgb
param_distributions = dict()
param_distributions["n_estimators"] = randint(50, 1001)
param_distributions["max_depth"] = randint(2, 11)
param_distributions["min_child_weight"] = loguniform(1, 100)
param_distributions["subsample"] = uniform(0.5, 1 - 0.5)
param_distributions["learning_rate"] = loguniform(1e-5, 1)
param_distributions["colsample_bylevel"] = uniform(0.5, 1 - 0.5)
param_distributions["colsample_bytree"] = uniform(0.5, 1 - 0.5)
param_distributions["gamma"] = loguniform(1e-8, 7)
param_distributions["lambda"] = loguniform(1, 4)
param_distributions["alpha"] = loguniform(1e-8, 100)
param_distributions_total["xgb"] = param_distributions

# RandomForest
param_distributions = dict()
param_distributions["n_estimators"] = randint(50, 250)
param_distributions["max_depth"] = [None, 2, 3, 4]
param_distributions["max_features"] = [
    "sqrt",
    "log2",
    None,
    0.1,
    0.2,
    0.3,
    0.4,
    0.5,
    0.6,
    0.7,
    0.8,
    0.9,
]
param_distributions["min_samples_leaf"] = loguniform_int(0.5, 50.5)
param_distributions["bootstrap"] = [True, False]
param_distributions["min_impurity_decrease"] = [0.0, 0.01, 0.02, 0.05]
param_distributions_total["randomforest"] = param_distributions


# resnet
param_distributions = dict()
param_distributions["normalization"] = ["batchnorm", "layernorm"]
param_distributions["num_layers"] = randint(1, 9)
param_distributions["hidden_dim"] = randint(32, 513)
param_distributions["hidden_factor"] = randint(1, 3)
param_distributions["hidden_dropout_prob"] = uniform(0.0, 0.5)
param_distributions["residual_dropout_prob"] = uniform(0.0, 0.5)
param_distributions["learning_rate"] = loguniform(1e-5, 1e-2)
param_distributions["weight_decay"] = loguniform(1e-8, 1e-2)
param_distributions["batch_size"] = [16, 32]
param_distributions_total["resnet"] = param_distributions

# mlp
param_distributions = dict()
param_distributions["hidden_dim"] = [2**x for x in range(4, 11)]
param_distributions["num_layers"] = randint(1, 5)
param_distributions["dropout_prob"] = uniform(0.0, 0.5)
param_distributions["learning_rate"] = loguniform(1e-5, 1e-2)
param_distributions["weight_decay"] = loguniform(1e-8, 1e-2)
param_distributions["batch_size"] = [16, 32]
param_distributions_total["mlp"] = param_distributions

# ridge regression
param_distributions = dict()
param_distributions["solver"] = ["svd", "cholesky", "lsqr", "sag"]
param_distributions["alpha"] = loguniform(1e-5, 100)
param_distributions_total["ridge"] = param_distributions

# logistic regression
param_distributions = dict()
param_distributions["solver"] = ["newton-cg", "lbfgs", "liblinear"]
param_distributions["penalty"] = ["none", "l1", "l2", "elasticnet"]
param_distributions["C"] = loguniform(1e-5, 100)
param_distributions_total["logistic"] = param_distributions

# tabpfn
param_distributions = dict()
param_distributions_total["tabpfn"] = param_distributions

# catboost-multitable
param_distributions = copy.deepcopy(param_distributions_total["catboost"])
param_distributions["source_fraction"] = uniform(0, 1)
param_distributions_total["catboost-multitable"] = param_distributions

# histgb-multitable
param_distributions = copy.deepcopy(param_distributions_total["histgb"])
param_distributions["source_fraction"] = uniform(0, 1)
param_distributions_total["histgb-multitable"] = param_distributions
+43

@@ -0,0 +1,43 @@
"""
Visualization configurations
"""

# Main models
model_color_palette = dict()
model_color_palette["CARTE"] = "C3"
model_color_palette["CatBoost"] = "C0"
model_color_palette["TabVec-XGB"] = "C1"
model_color_palette["TabVec-RF"] = "C2"
model_color_palette["TabVec-Ridge"] = "C4"
model_color_palette["TabVec-Logistic"] = "C5"
model_color_palette["S-LLM-CN-XGB"] = "C6"
model_color_palette["S-LLM-EN-XGB"] = "C7"  # "#C875C4" mediumorchid
model_color_palette["ResNet"] = "C8"
model_color_palette["MLP"] = "C9"
model_color_palette["TabPFN"] = "#A9561E"

model_color_palette["TabVec-RandomForest"] = "C2"
model_color_palette["TabVec-ResNet"] = "C8"
model_color_palette["TabVec-MLP"] = "C9"
model_color_palette["TarEnc-TabPFN"] = "#A9561E"


# model_color_palette["CARTE-B"] = "C3"
# model_color_palette["CatBoost-B"] = "C0"
# model_color_palette["TabVec-XGB-B"] = "C1"
# model_color_palette["TabVec-RF-B"] = "C2"
# model_color_palette["TabVec-Ridge-B"] = "C4"
# model_color_palette["TabVec-Logistic-B"] = "C5"
# model_color_palette["S-LLM-CN-XGB-B"] = "C6"
# model_color_palette["S-LLM-EN-XGB-B"] = "C7"
# model_color_palette["ResNet-B"] = "C8"
# model_color_palette["MLP-B"] = "C9"
# model_color_palette["TabPFN-B"] = "#A9561E"


# model_color_palette["TabVec-HGB"] = "#650021"
# model_color_palette["TabVec-TabPFN"] = "#650021"
# model_color_palette["TabVec-FT-XGB"] = "#650021"
# model_color_palette["TabVec-FT-HGB"] = "#650021"

# model_color_palette["TabLLM"] = "#653700"

build/lib/carte_ai/data/__init__.py (+1)

@@ -0,0 +1 @@
from carte_ai.data.load_data import *
0 commit comments
