|
7 | 7 | [clojure.string :as str] |
8 | 8 | [java-time.api :as jt] |
9 | 9 | [fastmath.stats :as stats] |
10 | | - [scicloj.kindly.v4.kind :as kind])) |
| 10 | + [scicloj.kindly.v4.kind :as kind] |
| 11 | + [data-analysis.book-sales-analysis.data-sources-v2 :as data])) |
11 | 12 |
|
12 | 13 | ;; ## Data Transformation Functions |
13 | 14 | ;; Common data processing functions used across multiple analysis files |
14 | 15 |
|
15 | | -;; ### Date Time |
16 | | - |
17 | | -(def end-time |
18 | | - (jt/local-date 2025 10 1)) |
19 | | - |
20 | 16 | ;; ### Scicloj Helpers |
21 | 17 |
|
22 | 18 | (defn merge-csvs [file-list options] |
|
118 | 114 | (map #(str/replace % #"\+" "")) |
119 | 115 | (map #(str/trim %)) |
120 | 116 | (map sanitize-str) |
121 | | - (map #(str/replace % #"\-\-.+$" "")) ;; zdvojené názvy |
122 | | - (map #(str/replace % #"\-+$" "")) ;; pomlčky na konci |
| 117 | + (map #(str/replace % #"\-\-.+$" "")) |
| 118 | + (map #(str/replace % #"\-+$" "")) |
123 | 119 | (map #(str/replace % #"^3" "k3")) |
124 | | - (map #(str/replace % #"^5" "k5"));; eliminace čísel 3 na začátku dvou knih |
| 120 | + (map #(str/replace % #"^5" "k5")) |
125 | 121 | (remove (fn [item] (some (fn [substr] (str/includes? (name item) substr)) |
126 | 122 | ["balicek" "poukaz" "zapisnik" "limitovana-edice" "taska" "aktualizovane-vydani" "cd" "puvodni-vydani/neni-skladem" |
127 | 123 | "merch"]))) |
128 | 124 | distinct |
129 | 125 | (mapv keyword)) |
130 | 126 | nil)) |
131 | 127 |
|
132 | | -;; ### Melvil Data Enriching and Convenience Functions |
133 | | - |
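134 | | -(defn months-between "Calculate how many months a product has been on the market: the days between `start-date` and `end-date` divided by 30.4375, rounded; 0 if either date is missing." |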
135 | | - [start-date end-date] |
136 | | - (let [days (if (and start-date end-date) |
137 | | - (jt/time-between start-date end-date :days) |
138 | | - 0)] |
139 | | - (long (Math/round (/ days 30.4375))))) |
140 | | - |
141 | | -(defn months-on-market |
142 | | - "Number of months `book` has been on the market. Zero if not at all." |
143 | | - [books-ds book end-date] |
144 | | - (let [date (try |
145 | | - (-> books-ds |
146 | | - (tc/select-columns [:titul :datum-zahajeni-prodeje]) |
147 | | - (tc/select-rows #(str/starts-with? (name (:titul %)) (name book))) |
148 | | - (tc/get-entry :datum-zahajeni-prodeje 0)) |
149 | | - (catch Exception e nil)) |
150 | | - month (if (nil? date) 0 (months-between date end-date))] |
151 | | - month)) |
| 128 | +;; ### Metadata Enriching and Convenience Functions |
152 | 129 |
|
153 | 130 | (defn czech-author? [book-title] |
154 | 131 | (let [czech-books #{:k30-hodin |
|
182 | 159 | (rand-int 2) |
183 | 160 | (if (contains? czech-books (keyword book-title)) 1 0)))) |
184 | 161 |
|
185 | | -(def category-enrichments |
186 | | - {:k30-hodin "podnikani,firemni-kultura" |
187 | | - :k5-principu-rodicovstvi "vzdelavani-a-vychova,kariera,psychologie,mezilidska-komunikace" |
188 | | - :autismus-bez-masky "psychologie,spolecnost,mezilidska-komunikace" |
189 | | - :genialne-potraviny :zdravi |
190 | | - :genialni-potraviny :zdravi |
191 | | - :jak-zabranit-dalsi-pandemii "budoucnost,spolecnost,ekologie" |
192 | | - :krvavy-uterek :historie |
193 | | - :male-experimenty "kariera,produktivita,psychologie" |
194 | | - :myty-a-nadeje-digitalniho-sveta "budoucnost,spolecnost,kariera" |
195 | | - :nestekej-na-sveho-psa "vzdelavani-a-vychova,mezilidska-komunikace" |
196 | | - :nove-zbrane-vlivu "podnikani,psychologie,kariera,mezilidska-komunikace" |
197 | | - :pamet "zdravi,psychologie" |
198 | | - :pomala-produktivita "produktivita,kariera" |
199 | | - :poridte-si-druhy-mozek "produktivita,kariera" |
200 | | - :prezit "zdravi,psychologie,mezilidska-komunikace" |
201 | | - :stastnejsi "psychologie,zdravi,mezilidska-komunikace" |
202 | | - :ultrazpracovani-lide :zdravi |
203 | | - :vitamin-l "psychologie,mezilidska-komunikace" |
204 | | - :zazracna-imunita :zdravi |
205 | | - :heureka! "podnikani,firemni-kultura" |
206 | | - :zrozeni-evropanu "historie,spolecnost"}) |
207 | | - |
208 | | -(defn enrich-metadata-csv |
209 | | - "Takes a CSV `file` with book titles, subtitles, categories and technical parameters |
210 | | - and enriches it with supplemental categories and info about author nationalities." |
211 | | - [file] |
212 | | - (let [summary-raw-ds (-> (tc/dataset |
213 | | - file |
214 | | - {:key-fn #(keyword (sanitize-column-name-str %)) |
215 | | - :separator \; |
216 | | - :parser-fn {:datum-zahajeni-prodeje :string} |
217 | | - :encoding :utf-8}) |
218 | | - (tc/update-columns :datum-zahajeni-prodeje |
219 | | - #(map (fn [date-str] |
220 | | - (when (not-empty date-str) |
221 | | - (parse-csv-date date-str))) |
222 | | - %)) |
223 | | - #_(tc/drop-missing [:edice :datum-zahajeni-prodeje])) |
224 | | - sanitized-colnames-ds (-> summary-raw-ds |
225 | | - (tc/rename-columns :all (fn [col] (if col (keyword (sanitize-column-name-str (name col))) col)))) |
226 | | - |
227 | | - sanitized-rows-ds (-> sanitized-colnames-ds |
228 | | - (tc/update-columns [:titul :podtitul :vazba :barevnost :edice :cenova-kategorie :tloustka] |
229 | | - (fn [column-data] (map sanitize-column-name-str column-data))) |
230 | | - (tc/update-columns [:kategorie-na-e-shopu :kategorie-tema] |
231 | | - (fn [column-data] (map sanitize-category-str column-data))) |
232 | | - (tc/update-columns [:titul] #(map (comp keyword parse-book-name) %))) |
233 | | - |
234 | | - enriched--categories-ds (tc/update-columns sanitized-rows-ds :kategorie-na-e-shopu |
235 | | - (fn [categories] |
236 | | - (map (fn [title category] |
237 | | - (if-let [enriched-category (get category-enrichments title)] |
238 | | - (name enriched-category) |
239 | | - category)) |
240 | | - (sanitized-rows-ds :titul) |
241 | | - categories))) |
242 | | - enriched-ds (tc/add-column enriched--categories-ds |
243 | | - :cesky-autor (fn [ds] (map czech-author? (ds :titul))))] |
244 | | - (-> enriched-ds |
245 | | - (tc/rename-columns {:column-0 :titul})))) |
246 | | - |
247 | 162 | ;; ### One-Hot Encoding Functions |
248 | 163 |
|
249 | | -(defn onehot-encode-by-customers |
| 164 | + |
| 165 | +(defn onehot-encode-by-customers ;; FIXME: needs refactoring and simplification |
250 | 166 | "One-hot encode dataset aggregated by customer. |
251 | 167 | Each customer gets one row with 0/1 values for each book they bought. |
252 | 168 | Used for market basket analysis, customer segmentation, etc." |
|
296 | 212 | 0.0 |
297 | 213 | (double (/ transactions-with-itemset total-transactions))))) |
298 | 214 |
|
299 | | -(defn build-popularity-index |
300 | | - "Creates a popularity index (map in format `{:book :popularity}`) for all items in a one-hot encoded dataset; popularity is each item's support from `calculate-support`." |
301 | | - [dataset] |
302 | | - (let [items (-> dataset (tc/drop-columns :zakaznik) tc/column-names)] |
303 | | - (reduce (fn [acc item] |
304 | | - (assoc acc item (calculate-support dataset [item]))) |
305 | | - {} |
306 | | - items))) |
307 | | - |
308 | | -^:kindly/hide-code |
309 | | -#_(-> (build-popularity-index (onehot-encode-by-customers data-sources/orders)) |
310 | | - (tc/dataset) |
311 | | - (tc/pivot->longer) |
312 | | - (tc/order-by :$value)) |
313 | | - |
314 | 215 |
|
315 | 216 | ^:kindly/hide-code |
316 | 217 | (defn calculate-adaptive-coefficient |
|
0 commit comments