Detect profiles' likely country of residence through ChatGPT

The classification runs concurrently with the post stream processing (via `tokio::try_join!`) rather than blocking it.
This commit is contained in:
Aleksei Voronov 2023-09-06 08:26:03 +02:00
parent 262cd707b4
commit 46974a9d8c
8 changed files with 258 additions and 7 deletions

101
Cargo.lock generated
View File

@ -29,6 +29,15 @@ dependencies = [
"version_check", "version_check",
] ]
[[package]]
name = "aho-corasick"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "allocator-api2" name = "allocator-api2"
version = "0.2.16" version = "0.2.16"
@ -257,6 +266,20 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chat-gpt-lib-rs"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae8651a0f3f7222ff1e22fd036f8e8cfffa7d6409dd495ddd83d55dc3a3777bf"
dependencies = [
"env_logger",
"log",
"reqwest",
"serde",
"serde_json",
"tokio",
]
[[package]] [[package]]
name = "chrono" name = "chrono"
version = "0.4.28" version = "0.4.28"
@ -482,6 +505,19 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "env_logger"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
dependencies = [
"humantime",
"is-terminal",
"log",
"regex",
"termcolor",
]
[[package]] [[package]]
name = "equivalent" name = "equivalent"
version = "1.0.1" version = "1.0.1"
@ -833,6 +869,12 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]] [[package]]
name = "hyper" name = "hyper"
version = "0.14.27" version = "0.14.27"
@ -929,6 +971,17 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
[[package]]
name = "is-terminal"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
dependencies = [
"hermit-abi",
"rustix",
"windows-sys",
]
[[package]] [[package]]
name = "itertools" name = "itertools"
version = "0.10.5" version = "0.10.5"
@ -1183,6 +1236,7 @@ dependencies = [
"async-trait", "async-trait",
"atrium-api", "atrium-api",
"atrium-xrpc", "atrium-xrpc",
"chat-gpt-lib-rs",
"chrono", "chrono",
"ciborium", "ciborium",
"futures", "futures",
@ -1522,6 +1576,35 @@ dependencies = [
"bitflags 1.3.2", "bitflags 1.3.2",
] ]
[[package]]
name = "regex"
version = "1.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
[[package]] [[package]]
name = "reqwest" name = "reqwest"
version = "0.11.20" version = "0.11.20"
@ -2131,6 +2214,15 @@ dependencies = [
"windows-sys", "windows-sys",
] ]
[[package]]
name = "termcolor"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
dependencies = [
"winapi-util",
]
[[package]] [[package]]
name = "thiserror" name = "thiserror"
version = "1.0.47" version = "1.0.47"
@ -2529,6 +2621,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
dependencies = [
"winapi",
]
[[package]] [[package]]
name = "winapi-x86_64-pc-windows-gnu" name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0" version = "0.4.0"

View File

@ -10,6 +10,7 @@ anyhow = "1.0.75"
async-trait = "0.1.73" async-trait = "0.1.73"
atrium-api = "0.6.0" atrium-api = "0.6.0"
atrium-xrpc = "0.4.0" atrium-xrpc = "0.4.0"
chat-gpt-lib-rs = "0.2.1"
chrono = "0.4.26" chrono = "0.4.26"
ciborium = "0.2.1" ciborium = "0.2.1"
futures = "0.3.28" futures = "0.3.28"

View File

@ -9,7 +9,7 @@ Heavily WIP. Doesn't work yet at all, but does read the stream of posts as they
- [x] Read stream of posts from Bluesky - [x] Read stream of posts from Bluesky
- [x] Store posts in the database - [x] Store posts in the database
- [x] Store user profiles in the database - [x] Store user profiles in the database
- [ ] Detect the country of residence from profile information - [x] Detect the country of residence from profile information
- [ ] Keep subscription state to not lose messages - [ ] Keep subscription state to not lose messages
- [ ] Serve the feed - [ ] Serve the feed
- [ ] Publish the feed - [ ] Publish the feed

View File

@ -2,7 +2,7 @@ CREATE TABLE IF NOT EXISTS Profile {
id INT GENERATED ALWAYS AS IDENTITY, id INT GENERATED ALWAYS AS IDENTITY,
first_seen_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), first_seen_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
did TEXT UNIQUE, did TEXT UNIQUE,
handle TEXT NULL DEFAULT NULL, has_been_processed BOOLEAN DEFAULT FALSE,
likely_country_of_living varchar(2) NULL DEFAULT NULL likely_country_of_living varchar(2) NULL DEFAULT NULL
} }

38
src/ai.rs Normal file
View File

@ -0,0 +1,38 @@
use anyhow::Result;
use chat_gpt_lib_rs::{ChatGPTClient, ChatInput, Message, Model, Role};
/// Crate-wide alias for the ChatGPT client, so callers do not have to name
/// the `chat_gpt_lib_rs` type directly.
pub type AI = ChatGPTClient;
/// Builds the ChatGPT client used to classify profiles.
///
/// The API key is read from the `OPENAI_API_KEY` environment variable. When
/// the variable is unset (or is not valid Unicode) the previous placeholder
/// key is used, so construction never fails; requests made with the
/// placeholder are expected to be rejected by the API.
pub fn make_ai_client() -> AI {
    // Fully qualified so the file's `use` block does not need to change.
    let api_key =
        std::env::var("OPENAI_API_KEY").unwrap_or_else(|_| "fake-api-key".to_owned());
    let base_url = "https://api.openai.com";

    ChatGPTClient::new(&api_key, base_url)
}
pub async fn infer_country_of_living(
ai: &AI,
display_name: &str,
description: &str,
) -> Result<String> {
let chat_input = ChatInput {
model: Model::Gpt3_5Turbo,
messages: vec![
Message {
role: Role::System,
// TODO: Lol, prompt injection much?
content: "You are a tool that attempts to guess where a person is likely to be from based on their name and short bio. Please respond with two-letter country code only. Use lowercase letters.".to_string(),
},
Message {
role: Role::User,
content: format!("Name: {display_name}\nBio:\n{description}"),
},
],
..Default::default()
};
let response = ai.chat(chat_input).await?;
// TODO: Error handling?
return Ok(response.choices[0].message.content.clone());
}

View File

@ -1,8 +1,10 @@
use anyhow::Result; use anyhow::Result;
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use scooby::postgres::{insert_into, Parameters};
use sqlx::postgres::{PgPool, PgPoolOptions}; use scooby::postgres::{insert_into, select, update, Parameters};
use sqlx::Row;
use sqlx::postgres::{PgPool, PgPoolOptions, PgRow};
use sqlx::query; use sqlx::query;
pub type ConnectionPool = PgPool; pub type ConnectionPool = PgPool;
@ -17,7 +19,7 @@ pub struct Post {
pub struct Profile { pub struct Profile {
first_seen_at: DateTime<Utc>, first_seen_at: DateTime<Utc>,
did: String, did: String,
handle: Option<String>, has_been_processed: bool,
likely_country_of_living: Option<String>, likely_country_of_living: Option<String>,
} }
@ -72,3 +74,36 @@ pub async fn insert_profile_if_it_doesnt_exist(db: &ConnectionPool, did: &str) -
.await .await
.map(|result| result.rows_affected() > 0)?) .map(|result| result.rows_affected() > 0)?)
} }
/// Returns the DIDs of all profiles whose `has_been_processed` flag is still
/// `FALSE`.
///
/// # Errors
///
/// Returns an error if the query fails.
pub async fn fetch_unprocessed_profile_dids(db: &ConnectionPool) -> Result<Vec<String>> {
    let sql = select("did")
        .from("Profile")
        .where_("has_been_processed = FALSE")
        .to_string();

    // The query selects a single column, so the DID is at index 0.
    let dids: Vec<String> = query(&sql)
        .map(|row: PgRow| row.get(0))
        .fetch_all(db)
        .await?;

    Ok(dids)
}
/// Records the inferred country for the profile identified by `did` and marks
/// that profile as processed.
///
/// Returns `true` when at least one row was updated.
///
/// # Errors
///
/// Returns an error if the update statement fails.
pub async fn store_profile_details(
    db: &ConnectionPool,
    did: &str,
    likely_country_of_living: &str,
) -> Result<bool> {
    // Placeholders are handed out in order, and the values are bound below in
    // the same order: country first, DID second.
    let mut placeholders = Parameters::new();
    let country_placeholder = placeholders.next();
    let did_placeholder = placeholders.next();

    let sql = update("Profile")
        .set("has_been_processed", "TRUE")
        .set("likely_country_of_living", country_placeholder)
        .where_(format!("did = {}", did_placeholder))
        .to_string();

    let outcome = query(&sql)
        .bind(likely_country_of_living)
        .bind(did)
        .execute(db)
        .await?;

    Ok(outcome.rows_affected() > 0)
}

View File

@ -1,7 +1,11 @@
mod ai;
mod database; mod database;
mod frames; mod frames;
mod profile_classifying;
mod streaming; mod streaming;
use crate::profile_classifying::classify_unclassified_profiles;
use ai::make_ai_client;
use anyhow::Result; use anyhow::Result;
use async_trait::async_trait; use async_trait::async_trait;
@ -13,13 +17,19 @@ use crate::streaming::{start_processing_operations_with, Operation, OperationPro
#[tokio::main] #[tokio::main]
async fn main() -> Result<()> { async fn main() -> Result<()> {
let db_connection_pool = make_connection_pool().await?; let db_connection_pool = make_connection_pool().await?;
let ai_client = make_ai_client();
// FIXME: This struct shouldn't really exist, but I couldn't find a way to replace // FIXME: This struct shouldn't really exist, but I couldn't find a way to replace
// this whole nonsense with a closure, which is what this whole thing should be in // this whole nonsense with a closure, which is what this whole thing should be in
// first place. // first place.
let post_saver = PostSaver { db_connection_pool }; let post_saver = PostSaver {
db_connection_pool: db_connection_pool.clone(),
};
start_processing_operations_with(post_saver).await?; tokio::try_join!(
start_processing_operations_with(post_saver),
classify_unclassified_profiles(db_connection_pool.clone(), ai_client)
)?;
Ok(()) Ok(())
} }

View File

@ -0,0 +1,66 @@
use anyhow::anyhow;
use std::time::Duration;
use anyhow::Result;
use atrium_api::client::AtpServiceClient;
use atrium_api::xrpc::client::reqwest::ReqwestClient;
use crate::ai::{infer_country_of_living, AI};
use crate::database::{fetch_unprocessed_profile_dids, store_profile_details, ConnectionPool};
/// The parts of a profile record that are passed to the AI for country
/// inference.
#[derive(Debug)]
struct ProfileDetails {
/// Display name from the profile record; empty when the record has none.
display_name: String,
/// Bio text from the profile record; empty when the record has none.
description: String,
}
pub async fn classify_unclassified_profiles(db: ConnectionPool, ai: AI) -> Result<()> {
loop {
// TODO: Maybe streamify this so that each thing is processed in parallel
// TODO: Also don't just exit this function when an error happens, just wait a minute or so?
let dids = fetch_unprocessed_profile_dids(&db).await?;
if dids.is_empty() {
println!("No profiles to process: waiting 10 seconds");
tokio::time::sleep(Duration::from_secs(10)).await;
} else {
for did in &dids {
fill_in_profile_details(&db, &ai, did).await?;
}
}
}
}
async fn fill_in_profile_details(db: &ConnectionPool, ai: &AI, did: &str) -> Result<()> {
let details = fetch_profile_details(did).await?;
let country = infer_country_of_living(ai, &details.display_name, &details.description).await?;
store_profile_details(db, did, &country).await?;
println!("Stored inferred country of living for {did}: {country}");
Ok(())
}
/// Fetches the `app.bsky.actor.profile` record (rkey `self`) of the repo
/// identified by `did` from `https://bsky.social`.
///
/// A missing display name or description becomes an empty string.
///
/// # Errors
///
/// Returns an error if the request fails or if the returned record is not an
/// actor profile.
async fn fetch_profile_details(did: &str) -> Result<ProfileDetails> {
    let xrpc = ReqwestClient::new("https://bsky.social".into());
    let client = AtpServiceClient::new(xrpc);

    let request = atrium_api::com::atproto::repo::get_record::Parameters {
        collection: "app.bsky.actor.profile".to_owned(),
        cid: None,
        repo: did.to_owned(),
        rkey: "self".to_owned(),
    };

    let record = client
        .service
        .com
        .atproto
        .repo
        .get_record(request)
        .await?;

    match record.value {
        atrium_api::records::Record::AppBskyActorProfile(profile) => Ok(ProfileDetails {
            display_name: profile.display_name.unwrap_or_default(),
            description: profile.description.unwrap_or_default(),
        }),
        _ => Err(anyhow!("Big bad, no such profile")),
    }
}