How can I integrate MLRun into my code?

Question:

I am trying to integrate MLRun with my Keras code for classifying toxic comments, but I cannot get the integration to work.

Can anyone show me where to modify my code to integrate MLRun? I am attaching my code below.

MLRun – link

MLRun is an open-source framework for model tracking and deployment.
I was able to train the model, but I could not integrate it with MLRun and deploy it, and I find the documentation hard to follow. Please help.

import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

from sklearn import preprocessing

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, MaxPool1D, Dropout, Dense, GlobalMaxPooling1D, Embedding, Activation
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

train_data = pd.read_csv('/content/toxic_train.csv')
test_data = pd.read_csv('/content/toxic_test.csv')

# drop unnamed column

train_data = train_data.drop(columns=['Unnamed: 0'])
train_data.head()

test_data = test_data.drop(columns=['Unnamed: 0'])
test_data.head()

def preprocess_text(sen):
    # lowercase the text
    sentence = sen.lower()

    # remove punctuation and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # remove stray single characters surrounded by whitespace
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # collapse multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    # remove stopwords by filtering whole tokens
    # (substring replacement would corrupt other words)
    stops = set(stopwords.words('english'))
    return ' '.join(word for word in sentence.split() if word not in stops)

# download the NLTK stopwords corpus used by preprocess_text
nltk.download('stopwords')

# preprocess data

train_data['comment_text'] = train_data['comment_text'].apply(lambda x : preprocess_text(x))
test_data['comment_text'] = test_data['comment_text'].apply(lambda x : preprocess_text(x))

# tokenize the data

token = Tokenizer(num_words=28164)  # keep only the most frequent words in the vocabulary
token.fit_on_texts(train_data['comment_text'])
text = token.texts_to_sequences(train_data['comment_text'])
text = pad_sequences(text, maxlen=100)

y = train_data['toxic'].values

# split the data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(text, y, test_size=0.2, random_state=1, stratify=y)

# build the model

max_features = 28164
embedding_dim = 32

model = Sequential()
model.add(Embedding(max_features, embedding_dim))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=True))
model.add(GlobalMaxPooling1D())  # collapse the sequence dimension so the output is one score per comment
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.summary()

# compile and train model

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, batch_size=1024, validation_data=(X_test, y_test), epochs=5)

# mlrun -- this is the part I cannot get working: `context` is not defined at this point
from cloudpickle import dumps
model_data = dumps(model)
context.log_model(key='my_model', body=model_data, model_file='my_model.pkl')

# Note: this handler is adapted from the MLRun examples; it additionally needs
# `from sklearn import linear_model`, plus MLRun's `MLClientCtx`, `DataItem`,
# `get_sample` and `eval_model_v2` helpers (older MLRun releases ship these
# under `mlrun` / `mlrun.mlutils`).
def train_iris(context: MLClientCtx, dataset: DataItem, label_column: str = "labels"):
    raw, labels, header = get_sample(dataset, sample=-1, label=label_column)
    # Basic scikit-learn Iris data-set SVM model
    X_train, X_test, y_train, y_test = train_test_split( raw, labels, test_size=0.2, random_state=42)
    
    context.log_dataset('train_set', 
                        df=pd.concat([X_train, y_train.to_frame()], axis=1),
                        format='csv', index=False, 
                        artifact_path=context.artifact_subpath('data'))

    context.log_dataset('test_set', 
                        df=pd.concat([X_test, y_test.to_frame()], axis=1),
                        format='csv', index=False, 
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))
    
    model = linear_model.LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)
    
    # Evaluate model results and get the evaluation metrics
    eval_metrics = eval_model_v2(context, X_test, y_test, model)
    
    # Log model
    context.log_model("model",
                      body=dumps(model),
                      artifact_path=context.artifact_subpath("models"),
                      extra_data=eval_metrics, 
                      model_file="model.pkl",
                      metrics=context.results,
                      labels={"class": "sklearn.linear_model.LogisticRegression"})
Asked By: user15957418


Answers:

Your code looks ready to be executed with MLRun. Depending on how MLRun is installed, you have to configure your environment so the client can find the MLRun API; see the installation guide at https://docs.mlrun.org/en/latest/install.html
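
For example, if the MLRun API service runs remotely (e.g. on Kubernetes), the client has to be pointed at it before you run anything. A minimal sketch using environment variables; the endpoint and artifact path below are placeholders, not values from your setup:

import os

# placeholder values -- replace with your own MLRun API endpoint and artifact store
os.environ['MLRUN_DBPATH'] = 'http://mlrun-api:8080'
os.environ['MLRUN_ARTIFACT_PATH'] = '/User/artifacts'

Set these (or export them in your shell) before importing mlrun in your session.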

Here is a short script you can use to test.

from mlrun import code_to_function

fn = code_to_function('train_iris', handler='train_iris', kind='job', filename="<YOUR PYTHON FILE PATH>")

# run locally
fn.run(project='iris', local=True)

# run in Kubernetes (if you are running MLRun in Kubernetes)
fn.run(project='iris', local=False)
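
To apply the same pattern to the Keras model in the question, wrap the training code in a handler that receives the MLRun context and logs metrics plus the pickled model, mirroring the context.log_model call already attempted above. This is only a sketch: the handler and artifact names are placeholders, and the preprocess_text cleanup step is omitted for brevity.

from cloudpickle import dumps
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, GlobalMaxPooling1D, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def train_toxic(context, dataset, label_column: str = 'toxic'):
    # MLRun passes `dataset` as a DataItem when it is supplied via `inputs`;
    # as_df() loads it into a pandas DataFrame
    df = dataset.as_df()

    # tokenize and pad, as in the original script
    token = Tokenizer(num_words=28164)
    token.fit_on_texts(df['comment_text'])
    text = pad_sequences(token.texts_to_sequences(df['comment_text']), maxlen=100)
    y = df[label_column].values

    X_train, X_test, y_train, y_test = train_test_split(
        text, y, test_size=0.2, random_state=1, stratify=y)

    # same architecture as the original script
    model = Sequential([
        Embedding(28164, 32),
        Dropout(0.2),
        LSTM(32, return_sequences=True),
        GlobalMaxPooling1D(),
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_train, y_train, batch_size=1024,
                        validation_data=(X_test, y_test), epochs=5)

    # log the final validation accuracy and the pickled model as MLRun artifacts
    context.log_result('val_accuracy', float(history.history['val_accuracy'][-1]))
    context.log_model('toxic_model', body=dumps(model), model_file='toxic_model.pkl')

It can then be executed the same way as train_iris above, passing the CSV through inputs so MLRun hands it to the handler as a DataItem:

fn = code_to_function('train_toxic', handler='train_toxic', kind='job', filename="<YOUR PYTHON FILE PATH>")
fn.run(project='toxic', local=True, inputs={'dataset': '/content/toxic_train.csv'})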

Answered By: Marcelo Litovsky