Below is the full script I use in the text preprocessing tutorial; if you want more background, the Pandas and NLTK documentation are both worth reading.
# Install the libraries if they are not already installed
#pip install pandas
#pip install nltk
# Import pandas and load the CSV file into a DataFrame
import pandas as pd
df = pd.read_csv(r"C:\your-folder\your-file.csv")
df.info()
df.head(10)
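# Optional (illustrative only): if you don't have a CSV handy, you can follow
# along with a small hypothetical DataFrame instead -- the column name
# 'titleselftext' simply matches the one used throughout this tutorial
# df = pd.DataFrame({'titleselftext': ["  A Sample Post Title. Isn't it GREAT?  ",
#                                      "Another   post,   with extra   spaces!"]})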
# Lowercase characters
df['titleselftext'] = df['titleselftext'].str.lower()
df.head(10)
# Remove whitespaces
def remove_whitespaces(titleselftext):
    return " ".join(titleselftext.split())
df['titleselftext'] = df['titleselftext'].apply(remove_whitespaces)
df.head(10)
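# Quick check (illustrative only): the function collapses runs of spaces,
# tabs and newlines into single spaces
print(remove_whitespaces("too   many\tspaces\nhere"))  # -> 'too many spaces here'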
# Tokenize (creates a list of single entities e.g. words)
import nltk
nltk.download('punkt')  # newer NLTK versions may also require 'punkt_tab'
from nltk import word_tokenize
df['titleselftext'] = df['titleselftext'].apply(word_tokenize)
df.head(10)
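# Illustration: word_tokenize splits punctuation and contractions into
# separate tokens, e.g.:
print(word_tokenize("don't panic, it works!"))
# ['do', "n't", 'panic', ',', 'it', 'works', '!']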
# Remove NLTK and custom stopwords
# Load NLTK stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))
en_stopwords = stopwords.words('english')
# Load custom stopwords
with open(r'C:\your-folder\custom_stopwords.csv', 'r') as f:
    custom_stopwords = f.read().strip().split(",")
custom_stopwords[-10:]  # inspect the last ten custom stopwords
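# Note: the code above assumes custom_stopwords.csv is a single line of
# comma-separated words, e.g. a file containing:
# reddit,upvote,downvote
# If your file has one word per line, use f.read().splitlines() instead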
# Remove the stopwords
def remove_stopwords(titleselftext):
    result = []
    for token in titleselftext:
        if token not in en_stopwords and token not in custom_stopwords:
            result.append(token)
    return result
df['titleselftext'] = df['titleselftext'].apply(remove_stopwords)
df.head(10)
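# Performance note (optional sketch): 'in' on a list scans every element; for
# large datasets, combining the stopword lists into a set makes lookups O(1)
# all_stopwords = set(en_stopwords) | set(custom_stopwords)
# df['titleselftext'] = df['titleselftext'].apply(
#     lambda tokens: [t for t in tokens if t not in all_stopwords])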
# Remove punctuation
from nltk.tokenize import RegexpTokenizer
def remove_punct(titleselftext):
    tokenizer = RegexpTokenizer(r"\w+")
    lst = tokenizer.tokenize(' '.join(titleselftext))
    return lst
df['titleselftext'] = df['titleselftext'].apply(remove_punct)
df.head(10)
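# Illustration: the \w+ pattern keeps only letters, digits and underscores,
# so punctuation-only tokens vanish and hyphenated words are split apart
print(RegexpTokenizer(r"\w+").tokenize("state-of-the-art , !!! results"))
# ['state', 'of', 'the', 'art', 'results']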
# Lemmatize
import nltk
nltk.download('averaged_perceptron_tagger')  # newer NLTK versions may require 'averaged_perceptron_tagger_eng'
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
def lemmatization(titleselftext):
    result = []
    wordnet = WordNetLemmatizer()
    for token, tag in pos_tag(titleselftext):
        pos = tag[0].lower()
        if pos == 'j':
            pos = 'a'  # Penn Treebank adjective tags start with 'J'; WordNet expects 'a'
        elif pos not in ['r', 'n', 'v']:
            pos = 'n'  # default to noun for all other tags
        result.append(wordnet.lemmatize(token, pos))
    return result
df['titleselftext'] = df['titleselftext'].apply(lemmatization)
df.head(10)
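# Illustration: pos_tag assigns Penn Treebank tags, which the function maps
# to WordNet POS codes so plurals and verbs lemmatize correctly
print(lemmatization(['the', 'cats', 'were', 'running']))
# expected output: ['the', 'cat', 'be', 'run']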
# Convert list to string
df['titleselftext'] = [' '.join(map(str, l)) for l in df['titleselftext']]
df.head(10)
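# Illustration: ' '.join turns a token list back into a single string
print(' '.join(['hello', 'world']))  # -> 'hello world'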
# Remove words of 3 characters or fewer
df['titleselftext'] = df['titleselftext'].str.replace(r'\b(\w{1,3})\b', '', regex=True)
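# Illustration: dropping short words leaves stray spaces behind, e.g.:
import re
print(re.sub(r'\b(\w{1,3})\b', '', 'the cats sat quietly'))
# -> ' cats  quietly' -- hence the whitespace clean-up below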
# Remove whitespaces again, reusing the function defined earlier
df['titleselftext'] = df['titleselftext'].apply(remove_whitespaces)
df.head(10)
# Save results to file
df.to_csv(r'C:\your-folder\your-file2.csv', index=False, encoding='utf-8')
If you’re not yet familiar with Python, this getting-started tutorial will help: